These changes are the raw update to linux-4.4.6-rt14. Kernel sources
[kvmfornfv.git] / kernel / fs / xfs / libxfs / xfs_btree.c
1 /*
2  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 #include "xfs.h"
19 #include "xfs_fs.h"
20 #include "xfs_shared.h"
21 #include "xfs_format.h"
22 #include "xfs_log_format.h"
23 #include "xfs_trans_resv.h"
24 #include "xfs_bit.h"
25 #include "xfs_mount.h"
26 #include "xfs_inode.h"
27 #include "xfs_trans.h"
28 #include "xfs_inode_item.h"
29 #include "xfs_buf_item.h"
30 #include "xfs_btree.h"
31 #include "xfs_error.h"
32 #include "xfs_trace.h"
33 #include "xfs_cksum.h"
34 #include "xfs_alloc.h"
35 #include "xfs_log.h"
36
37 /*
38  * Cursor allocation zone.
39  */
40 kmem_zone_t     *xfs_btree_cur_zone;
41
42 /*
43  * Btree magic numbers.
44  */
45 static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
46         { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
47           XFS_FIBT_MAGIC },
48         { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
49           XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
50 };
51 #define xfs_btree_magic(cur) \
52         xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
53
54
55 STATIC int                              /* error (0 or EFSCORRUPTED) */
56 xfs_btree_check_lblock(
57         struct xfs_btree_cur    *cur,   /* btree cursor */
58         struct xfs_btree_block  *block, /* btree long form block pointer */
59         int                     level,  /* level of the btree block */
60         struct xfs_buf          *bp)    /* buffer for block, if any */
61 {
62         int                     lblock_ok = 1; /* block passes checks */
63         struct xfs_mount        *mp;    /* file system mount point */
64
65         mp = cur->bc_mp;
66
67         if (xfs_sb_version_hascrc(&mp->m_sb)) {
68                 lblock_ok = lblock_ok &&
69                         uuid_equal(&block->bb_u.l.bb_uuid,
70                                    &mp->m_sb.sb_meta_uuid) &&
71                         block->bb_u.l.bb_blkno == cpu_to_be64(
72                                 bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
73         }
74
75         lblock_ok = lblock_ok &&
76                 be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
77                 be16_to_cpu(block->bb_level) == level &&
78                 be16_to_cpu(block->bb_numrecs) <=
79                         cur->bc_ops->get_maxrecs(cur, level) &&
80                 block->bb_u.l.bb_leftsib &&
81                 (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) ||
82                  XFS_FSB_SANITY_CHECK(mp,
83                         be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
84                 block->bb_u.l.bb_rightsib &&
85                 (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) ||
86                  XFS_FSB_SANITY_CHECK(mp,
87                         be64_to_cpu(block->bb_u.l.bb_rightsib)));
88
89         if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
90                         XFS_ERRTAG_BTREE_CHECK_LBLOCK,
91                         XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
92                 if (bp)
93                         trace_xfs_btree_corrupt(bp, _RET_IP_);
94                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
95                 return -EFSCORRUPTED;
96         }
97         return 0;
98 }
99
100 STATIC int                              /* error (0 or EFSCORRUPTED) */
101 xfs_btree_check_sblock(
102         struct xfs_btree_cur    *cur,   /* btree cursor */
103         struct xfs_btree_block  *block, /* btree short form block pointer */
104         int                     level,  /* level of the btree block */
105         struct xfs_buf          *bp)    /* buffer containing block */
106 {
107         struct xfs_mount        *mp;    /* file system mount point */
108         struct xfs_buf          *agbp;  /* buffer for ag. freespace struct */
109         struct xfs_agf          *agf;   /* ag. freespace structure */
110         xfs_agblock_t           agflen; /* native ag. freespace length */
111         int                     sblock_ok = 1; /* block passes checks */
112
113         mp = cur->bc_mp;
114         agbp = cur->bc_private.a.agbp;
115         agf = XFS_BUF_TO_AGF(agbp);
116         agflen = be32_to_cpu(agf->agf_length);
117
118         if (xfs_sb_version_hascrc(&mp->m_sb)) {
119                 sblock_ok = sblock_ok &&
120                         uuid_equal(&block->bb_u.s.bb_uuid,
121                                    &mp->m_sb.sb_meta_uuid) &&
122                         block->bb_u.s.bb_blkno == cpu_to_be64(
123                                 bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
124         }
125
126         sblock_ok = sblock_ok &&
127                 be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
128                 be16_to_cpu(block->bb_level) == level &&
129                 be16_to_cpu(block->bb_numrecs) <=
130                         cur->bc_ops->get_maxrecs(cur, level) &&
131                 (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
132                  be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
133                 block->bb_u.s.bb_leftsib &&
134                 (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
135                  be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
136                 block->bb_u.s.bb_rightsib;
137
138         if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
139                         XFS_ERRTAG_BTREE_CHECK_SBLOCK,
140                         XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
141                 if (bp)
142                         trace_xfs_btree_corrupt(bp, _RET_IP_);
143                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
144                 return -EFSCORRUPTED;
145         }
146         return 0;
147 }
148
149 /*
150  * Debug routine: check that block header is ok.
151  */
152 int
153 xfs_btree_check_block(
154         struct xfs_btree_cur    *cur,   /* btree cursor */
155         struct xfs_btree_block  *block, /* generic btree block pointer */
156         int                     level,  /* level of the btree block */
157         struct xfs_buf          *bp)    /* buffer containing block, if any */
158 {
159         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
160                 return xfs_btree_check_lblock(cur, block, level, bp);
161         else
162                 return xfs_btree_check_sblock(cur, block, level, bp);
163 }
164
165 /*
166  * Check that (long) pointer is ok.
167  */
168 int                                     /* error (0 or EFSCORRUPTED) */
169 xfs_btree_check_lptr(
170         struct xfs_btree_cur    *cur,   /* btree cursor */
171         xfs_fsblock_t           bno,    /* btree block disk address */
172         int                     level)  /* btree block level */
173 {
174         XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
175                 level > 0 &&
176                 bno != NULLFSBLOCK &&
177                 XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
178         return 0;
179 }
180
181 #ifdef DEBUG
182 /*
183  * Check that (short) pointer is ok.
184  */
185 STATIC int                              /* error (0 or EFSCORRUPTED) */
186 xfs_btree_check_sptr(
187         struct xfs_btree_cur    *cur,   /* btree cursor */
188         xfs_agblock_t           bno,    /* btree block disk address */
189         int                     level)  /* btree block level */
190 {
191         xfs_agblock_t           agblocks = cur->bc_mp->m_sb.sb_agblocks;
192
193         XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
194                 level > 0 &&
195                 bno != NULLAGBLOCK &&
196                 bno != 0 &&
197                 bno < agblocks);
198         return 0;
199 }
200
201 /*
202  * Check that block ptr is ok.
203  */
204 STATIC int                              /* error (0 or EFSCORRUPTED) */
205 xfs_btree_check_ptr(
206         struct xfs_btree_cur    *cur,   /* btree cursor */
207         union xfs_btree_ptr     *ptr,   /* btree block disk address */
208         int                     index,  /* offset from ptr to check */
209         int                     level)  /* btree block level */
210 {
211         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
212                 return xfs_btree_check_lptr(cur,
213                                 be64_to_cpu((&ptr->l)[index]), level);
214         } else {
215                 return xfs_btree_check_sptr(cur,
216                                 be32_to_cpu((&ptr->s)[index]), level);
217         }
218 }
219 #endif
220
221 /*
222  * Calculate CRC on the whole btree block and stuff it into the
223  * long-form btree header.
224  *
225  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
226  * it into the buffer so recovery knows what the last modification was that made
227  * it to disk.
228  */
229 void
230 xfs_btree_lblock_calc_crc(
231         struct xfs_buf          *bp)
232 {
233         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
234         struct xfs_buf_log_item *bip = bp->b_fspriv;
235
236         if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
237                 return;
238         if (bip)
239                 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
240         xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
241 }
242
243 bool
244 xfs_btree_lblock_verify_crc(
245         struct xfs_buf          *bp)
246 {
247         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
248         struct xfs_mount        *mp = bp->b_target->bt_mount;
249
250         if (xfs_sb_version_hascrc(&mp->m_sb)) {
251                 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
252                         return false;
253                 return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
254         }
255
256         return true;
257 }
258
259 /*
260  * Calculate CRC on the whole btree block and stuff it into the
261  * short-form btree header.
262  *
263  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
264  * it into the buffer so recovery knows what the last modification was that made
265  * it to disk.
266  */
267 void
268 xfs_btree_sblock_calc_crc(
269         struct xfs_buf          *bp)
270 {
271         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
272         struct xfs_buf_log_item *bip = bp->b_fspriv;
273
274         if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
275                 return;
276         if (bip)
277                 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
278         xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
279 }
280
281 bool
282 xfs_btree_sblock_verify_crc(
283         struct xfs_buf          *bp)
284 {
285         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
286         struct xfs_mount        *mp = bp->b_target->bt_mount;
287
288         if (xfs_sb_version_hascrc(&mp->m_sb)) {
289                 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
290                         return false;
291                 return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
292         }
293
294         return true;
295 }
296
297 /*
298  * Delete the btree cursor.
299  */
300 void
301 xfs_btree_del_cursor(
302         xfs_btree_cur_t *cur,           /* btree cursor */
303         int             error)          /* del because of error */
304 {
305         int             i;              /* btree level */
306
307         /*
308          * Clear the buffer pointers, and release the buffers.
309          * If we're doing this in the face of an error, we
310          * need to make sure to inspect all of the entries
311          * in the bc_bufs array for buffers to be unlocked.
312          * This is because some of the btree code works from
313          * level n down to 0, and if we get an error along
314          * the way we won't have initialized all the entries
315          * down to 0.
316          */
317         for (i = 0; i < cur->bc_nlevels; i++) {
318                 if (cur->bc_bufs[i])
319                         xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
320                 else if (!error)
321                         break;
322         }
323         /*
324          * Can't free a bmap cursor without having dealt with the
325          * allocated indirect blocks' accounting.
326          */
327         ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
328                cur->bc_private.b.allocated == 0);
329         /*
330          * Free the cursor.
331          */
332         kmem_zone_free(xfs_btree_cur_zone, cur);
333 }
334
335 /*
336  * Duplicate the btree cursor.
337  * Allocate a new one, copy the record, re-get the buffers.
338  */
339 int                                     /* error */
340 xfs_btree_dup_cursor(
341         xfs_btree_cur_t *cur,           /* input cursor */
342         xfs_btree_cur_t **ncur)         /* output cursor */
343 {
344         xfs_buf_t       *bp;            /* btree block's buffer pointer */
345         int             error;          /* error return value */
346         int             i;              /* level number of btree block */
347         xfs_mount_t     *mp;            /* mount structure for filesystem */
348         xfs_btree_cur_t *new;           /* new cursor value */
349         xfs_trans_t     *tp;            /* transaction pointer, can be NULL */
350
351         tp = cur->bc_tp;
352         mp = cur->bc_mp;
353
354         /*
355          * Allocate a new cursor like the old one.
356          */
357         new = cur->bc_ops->dup_cursor(cur);
358
359         /*
360          * Copy the record currently in the cursor.
361          */
362         new->bc_rec = cur->bc_rec;
363
364         /*
365          * For each level current, re-get the buffer and copy the ptr value.
366          */
367         for (i = 0; i < new->bc_nlevels; i++) {
368                 new->bc_ptrs[i] = cur->bc_ptrs[i];
369                 new->bc_ra[i] = cur->bc_ra[i];
370                 bp = cur->bc_bufs[i];
371                 if (bp) {
372                         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
373                                                    XFS_BUF_ADDR(bp), mp->m_bsize,
374                                                    0, &bp,
375                                                    cur->bc_ops->buf_ops);
376                         if (error) {
377                                 xfs_btree_del_cursor(new, error);
378                                 *ncur = NULL;
379                                 return error;
380                         }
381                 }
382                 new->bc_bufs[i] = bp;
383         }
384         *ncur = new;
385         return 0;
386 }
387
388 /*
389  * XFS btree block layout and addressing:
390  *
391  * There are two types of blocks in the btree: leaf and non-leaf blocks.
392  *
393  * The leaf record start with a header then followed by records containing
394  * the values.  A non-leaf block also starts with the same header, and
395  * then first contains lookup keys followed by an equal number of pointers
396  * to the btree blocks at the previous level.
397  *
398  *              +--------+-------+-------+-------+-------+-------+-------+
399  * Leaf:        | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
400  *              +--------+-------+-------+-------+-------+-------+-------+
401  *
402  *              +--------+-------+-------+-------+-------+-------+-------+
403  * Non-Leaf:    | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
404  *              +--------+-------+-------+-------+-------+-------+-------+
405  *
406  * The header is called struct xfs_btree_block for reasons better left unknown
407  * and comes in different versions for short (32bit) and long (64bit) block
408  * pointers.  The record and key structures are defined by the btree instances
409  * and opaque to the btree core.  The block pointers are simple disk endian
410  * integers, available in a short (32bit) and long (64bit) variant.
411  *
412  * The helpers below calculate the offset of a given record, key or pointer
413  * into a btree block (xfs_btree_*_offset) or return a pointer to the given
414  * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
415  * inside the btree block is done using indices starting at one, not zero!
416  */
417
418 /*
419  * Return size of the btree block header for this btree instance.
420  */
421 static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
422 {
423         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
424                 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
425                         return XFS_BTREE_LBLOCK_CRC_LEN;
426                 return XFS_BTREE_LBLOCK_LEN;
427         }
428         if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
429                 return XFS_BTREE_SBLOCK_CRC_LEN;
430         return XFS_BTREE_SBLOCK_LEN;
431 }
432
433 /*
434  * Return size of btree block pointers for this btree instance.
435  */
436 static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
437 {
438         return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
439                 sizeof(__be64) : sizeof(__be32);
440 }
441
442 /*
443  * Calculate offset of the n-th record in a btree block.
444  */
445 STATIC size_t
446 xfs_btree_rec_offset(
447         struct xfs_btree_cur    *cur,
448         int                     n)
449 {
450         return xfs_btree_block_len(cur) +
451                 (n - 1) * cur->bc_ops->rec_len;
452 }
453
454 /*
455  * Calculate offset of the n-th key in a btree block.
456  */
457 STATIC size_t
458 xfs_btree_key_offset(
459         struct xfs_btree_cur    *cur,
460         int                     n)
461 {
462         return xfs_btree_block_len(cur) +
463                 (n - 1) * cur->bc_ops->key_len;
464 }
465
466 /*
467  * Calculate offset of the n-th block pointer in a btree block.
468  */
469 STATIC size_t
470 xfs_btree_ptr_offset(
471         struct xfs_btree_cur    *cur,
472         int                     n,
473         int                     level)
474 {
475         return xfs_btree_block_len(cur) +
476                 cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
477                 (n - 1) * xfs_btree_ptr_len(cur);
478 }
479
480 /*
481  * Return a pointer to the n-th record in the btree block.
482  */
483 STATIC union xfs_btree_rec *
484 xfs_btree_rec_addr(
485         struct xfs_btree_cur    *cur,
486         int                     n,
487         struct xfs_btree_block  *block)
488 {
489         return (union xfs_btree_rec *)
490                 ((char *)block + xfs_btree_rec_offset(cur, n));
491 }
492
493 /*
494  * Return a pointer to the n-th key in the btree block.
495  */
496 STATIC union xfs_btree_key *
497 xfs_btree_key_addr(
498         struct xfs_btree_cur    *cur,
499         int                     n,
500         struct xfs_btree_block  *block)
501 {
502         return (union xfs_btree_key *)
503                 ((char *)block + xfs_btree_key_offset(cur, n));
504 }
505
506 /*
507  * Return a pointer to the n-th block pointer in the btree block.
508  */
509 STATIC union xfs_btree_ptr *
510 xfs_btree_ptr_addr(
511         struct xfs_btree_cur    *cur,
512         int                     n,
513         struct xfs_btree_block  *block)
514 {
515         int                     level = xfs_btree_get_level(block);
516
517         ASSERT(block->bb_level != 0);
518
519         return (union xfs_btree_ptr *)
520                 ((char *)block + xfs_btree_ptr_offset(cur, n, level));
521 }
522
523 /*
524  * Get the root block which is stored in the inode.
525  *
526  * For now this btree implementation assumes the btree root is always
527  * stored in the if_broot field of an inode fork.
528  */
529 STATIC struct xfs_btree_block *
530 xfs_btree_get_iroot(
531        struct xfs_btree_cur    *cur)
532 {
533        struct xfs_ifork        *ifp;
534
535        ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
536        return (struct xfs_btree_block *)ifp->if_broot;
537 }
538
539 /*
540  * Retrieve the block pointer from the cursor at the given level.
541  * This may be an inode btree root or from a buffer.
542  */
543 STATIC struct xfs_btree_block *         /* generic btree block pointer */
544 xfs_btree_get_block(
545         struct xfs_btree_cur    *cur,   /* btree cursor */
546         int                     level,  /* level in btree */
547         struct xfs_buf          **bpp)  /* buffer containing the block */
548 {
549         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
550             (level == cur->bc_nlevels - 1)) {
551                 *bpp = NULL;
552                 return xfs_btree_get_iroot(cur);
553         }
554
555         *bpp = cur->bc_bufs[level];
556         return XFS_BUF_TO_BLOCK(*bpp);
557 }
558
559 /*
560  * Get a buffer for the block, return it with no data read.
561  * Long-form addressing.
562  */
563 xfs_buf_t *                             /* buffer for fsbno */
564 xfs_btree_get_bufl(
565         xfs_mount_t     *mp,            /* file system mount point */
566         xfs_trans_t     *tp,            /* transaction pointer */
567         xfs_fsblock_t   fsbno,          /* file system block number */
568         uint            lock)           /* lock flags for get_buf */
569 {
570         xfs_daddr_t             d;              /* real disk block address */
571
572         ASSERT(fsbno != NULLFSBLOCK);
573         d = XFS_FSB_TO_DADDR(mp, fsbno);
574         return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
575 }
576
577 /*
578  * Get a buffer for the block, return it with no data read.
579  * Short-form addressing.
580  */
581 xfs_buf_t *                             /* buffer for agno/agbno */
582 xfs_btree_get_bufs(
583         xfs_mount_t     *mp,            /* file system mount point */
584         xfs_trans_t     *tp,            /* transaction pointer */
585         xfs_agnumber_t  agno,           /* allocation group number */
586         xfs_agblock_t   agbno,          /* allocation group block number */
587         uint            lock)           /* lock flags for get_buf */
588 {
589         xfs_daddr_t             d;              /* real disk block address */
590
591         ASSERT(agno != NULLAGNUMBER);
592         ASSERT(agbno != NULLAGBLOCK);
593         d = XFS_AGB_TO_DADDR(mp, agno, agbno);
594         return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
595 }
596
597 /*
598  * Check for the cursor referring to the last block at the given level.
599  */
600 int                                     /* 1=is last block, 0=not last block */
601 xfs_btree_islastblock(
602         xfs_btree_cur_t         *cur,   /* btree cursor */
603         int                     level)  /* level to check */
604 {
605         struct xfs_btree_block  *block; /* generic btree block pointer */
606         xfs_buf_t               *bp;    /* buffer containing block */
607
608         block = xfs_btree_get_block(cur, level, &bp);
609         xfs_btree_check_block(cur, block, level, bp);
610         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
611                 return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
612         else
613                 return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
614 }
615
616 /*
617  * Change the cursor to point to the first record at the given level.
618  * Other levels are unaffected.
619  */
620 STATIC int                              /* success=1, failure=0 */
621 xfs_btree_firstrec(
622         xfs_btree_cur_t         *cur,   /* btree cursor */
623         int                     level)  /* level to change */
624 {
625         struct xfs_btree_block  *block; /* generic btree block pointer */
626         xfs_buf_t               *bp;    /* buffer containing block */
627
628         /*
629          * Get the block pointer for this level.
630          */
631         block = xfs_btree_get_block(cur, level, &bp);
632         xfs_btree_check_block(cur, block, level, bp);
633         /*
634          * It's empty, there is no such record.
635          */
636         if (!block->bb_numrecs)
637                 return 0;
638         /*
639          * Set the ptr value to 1, that's the first record/key.
640          */
641         cur->bc_ptrs[level] = 1;
642         return 1;
643 }
644
645 /*
646  * Change the cursor to point to the last record in the current block
647  * at the given level.  Other levels are unaffected.
648  */
649 STATIC int                              /* success=1, failure=0 */
650 xfs_btree_lastrec(
651         xfs_btree_cur_t         *cur,   /* btree cursor */
652         int                     level)  /* level to change */
653 {
654         struct xfs_btree_block  *block; /* generic btree block pointer */
655         xfs_buf_t               *bp;    /* buffer containing block */
656
657         /*
658          * Get the block pointer for this level.
659          */
660         block = xfs_btree_get_block(cur, level, &bp);
661         xfs_btree_check_block(cur, block, level, bp);
662         /*
663          * It's empty, there is no such record.
664          */
665         if (!block->bb_numrecs)
666                 return 0;
667         /*
668          * Set the ptr value to numrecs, that's the last record/key.
669          */
670         cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
671         return 1;
672 }
673
674 /*
675  * Compute first and last byte offsets for the fields given.
676  * Interprets the offsets table, which contains struct field offsets.
677  */
678 void
679 xfs_btree_offsets(
680         __int64_t       fields,         /* bitmask of fields */
681         const short     *offsets,       /* table of field offsets */
682         int             nbits,          /* number of bits to inspect */
683         int             *first,         /* output: first byte offset */
684         int             *last)          /* output: last byte offset */
685 {
686         int             i;              /* current bit number */
687         __int64_t       imask;          /* mask for current bit number */
688
689         ASSERT(fields != 0);
690         /*
691          * Find the lowest bit, so the first byte offset.
692          */
693         for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
694                 if (imask & fields) {
695                         *first = offsets[i];
696                         break;
697                 }
698         }
699         /*
700          * Find the highest bit, so the last byte offset.
701          */
702         for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
703                 if (imask & fields) {
704                         *last = offsets[i + 1] - 1;
705                         break;
706                 }
707         }
708 }
709
710 /*
711  * Get a buffer for the block, return it read in.
712  * Long-form addressing.
713  */
714 int
715 xfs_btree_read_bufl(
716         struct xfs_mount        *mp,            /* file system mount point */
717         struct xfs_trans        *tp,            /* transaction pointer */
718         xfs_fsblock_t           fsbno,          /* file system block number */
719         uint                    lock,           /* lock flags for read_buf */
720         struct xfs_buf          **bpp,          /* buffer for fsbno */
721         int                     refval,         /* ref count value for buffer */
722         const struct xfs_buf_ops *ops)
723 {
724         struct xfs_buf          *bp;            /* return value */
725         xfs_daddr_t             d;              /* real disk block address */
726         int                     error;
727
728         ASSERT(fsbno != NULLFSBLOCK);
729         d = XFS_FSB_TO_DADDR(mp, fsbno);
730         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
731                                    mp->m_bsize, lock, &bp, ops);
732         if (error)
733                 return error;
734         if (bp)
735                 xfs_buf_set_ref(bp, refval);
736         *bpp = bp;
737         return 0;
738 }
739
740 /*
741  * Read-ahead the block, don't wait for it, don't return a buffer.
742  * Long-form addressing.
743  */
744 /* ARGSUSED */
745 void
746 xfs_btree_reada_bufl(
747         struct xfs_mount        *mp,            /* file system mount point */
748         xfs_fsblock_t           fsbno,          /* file system block number */
749         xfs_extlen_t            count,          /* count of filesystem blocks */
750         const struct xfs_buf_ops *ops)
751 {
752         xfs_daddr_t             d;
753
754         ASSERT(fsbno != NULLFSBLOCK);
755         d = XFS_FSB_TO_DADDR(mp, fsbno);
756         xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
757 }
758
759 /*
760  * Read-ahead the block, don't wait for it, don't return a buffer.
761  * Short-form addressing.
762  */
763 /* ARGSUSED */
764 void
765 xfs_btree_reada_bufs(
766         struct xfs_mount        *mp,            /* file system mount point */
767         xfs_agnumber_t          agno,           /* allocation group number */
768         xfs_agblock_t           agbno,          /* allocation group block number */
769         xfs_extlen_t            count,          /* count of filesystem blocks */
770         const struct xfs_buf_ops *ops)
771 {
772         xfs_daddr_t             d;
773
774         ASSERT(agno != NULLAGNUMBER);
775         ASSERT(agbno != NULLAGBLOCK);
776         d = XFS_AGB_TO_DADDR(mp, agno, agbno);
777         xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
778 }
779
780 STATIC int
781 xfs_btree_readahead_lblock(
782         struct xfs_btree_cur    *cur,
783         int                     lr,
784         struct xfs_btree_block  *block)
785 {
786         int                     rval = 0;
787         xfs_fsblock_t           left = be64_to_cpu(block->bb_u.l.bb_leftsib);
788         xfs_fsblock_t           right = be64_to_cpu(block->bb_u.l.bb_rightsib);
789
790         if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
791                 xfs_btree_reada_bufl(cur->bc_mp, left, 1,
792                                      cur->bc_ops->buf_ops);
793                 rval++;
794         }
795
796         if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
797                 xfs_btree_reada_bufl(cur->bc_mp, right, 1,
798                                      cur->bc_ops->buf_ops);
799                 rval++;
800         }
801
802         return rval;
803 }
804
805 STATIC int
806 xfs_btree_readahead_sblock(
807         struct xfs_btree_cur    *cur,
808         int                     lr,
809         struct xfs_btree_block *block)
810 {
811         int                     rval = 0;
812         xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
813         xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
814
815
816         if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
817                 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
818                                      left, 1, cur->bc_ops->buf_ops);
819                 rval++;
820         }
821
822         if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
823                 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
824                                      right, 1, cur->bc_ops->buf_ops);
825                 rval++;
826         }
827
828         return rval;
829 }
830
831 /*
832  * Read-ahead btree blocks, at the given level.
833  * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
834  */
835 STATIC int
836 xfs_btree_readahead(
837         struct xfs_btree_cur    *cur,           /* btree cursor */
838         int                     lev,            /* level in btree */
839         int                     lr)             /* left/right bits */
840 {
841         struct xfs_btree_block  *block;
842
843         /*
844          * No readahead needed if we are at the root level and the
845          * btree root is stored in the inode.
846          */
847         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
848             (lev == cur->bc_nlevels - 1))
849                 return 0;
850
851         if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
852                 return 0;
853
854         cur->bc_ra[lev] |= lr;
855         block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
856
857         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
858                 return xfs_btree_readahead_lblock(cur, lr, block);
859         return xfs_btree_readahead_sblock(cur, lr, block);
860 }
861
862 STATIC xfs_daddr_t
863 xfs_btree_ptr_to_daddr(
864         struct xfs_btree_cur    *cur,
865         union xfs_btree_ptr     *ptr)
866 {
867         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
868                 ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK));
869
870                 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
871         } else {
872                 ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
873                 ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
874
875                 return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
876                                         be32_to_cpu(ptr->s));
877         }
878 }
879
880 /*
881  * Readahead @count btree blocks at the given @ptr location.
882  *
883  * We don't need to care about long or short form btrees here as we have a
884  * method of converting the ptr directly to a daddr available to us.
885  */
886 STATIC void
887 xfs_btree_readahead_ptr(
888         struct xfs_btree_cur    *cur,
889         union xfs_btree_ptr     *ptr,
890         xfs_extlen_t            count)
891 {
892         xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
893                           xfs_btree_ptr_to_daddr(cur, ptr),
894                           cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
895 }
896
897 /*
898  * Set the buffer for level "lev" in the cursor to bp, releasing
899  * any previous buffer.
900  */
901 STATIC void
902 xfs_btree_setbuf(
903         xfs_btree_cur_t         *cur,   /* btree cursor */
904         int                     lev,    /* level in btree */
905         xfs_buf_t               *bp)    /* new buffer to set */
906 {
907         struct xfs_btree_block  *b;     /* btree block */
908
909         if (cur->bc_bufs[lev])
910                 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
911         cur->bc_bufs[lev] = bp;
912         cur->bc_ra[lev] = 0;
913
914         b = XFS_BUF_TO_BLOCK(bp);
915         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
916                 if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
917                         cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
918                 if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
919                         cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
920         } else {
921                 if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
922                         cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
923                 if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
924                         cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
925         }
926 }
927
928 STATIC int
929 xfs_btree_ptr_is_null(
930         struct xfs_btree_cur    *cur,
931         union xfs_btree_ptr     *ptr)
932 {
933         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
934                 return ptr->l == cpu_to_be64(NULLFSBLOCK);
935         else
936                 return ptr->s == cpu_to_be32(NULLAGBLOCK);
937 }
938
939 STATIC void
940 xfs_btree_set_ptr_null(
941         struct xfs_btree_cur    *cur,
942         union xfs_btree_ptr     *ptr)
943 {
944         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
945                 ptr->l = cpu_to_be64(NULLFSBLOCK);
946         else
947                 ptr->s = cpu_to_be32(NULLAGBLOCK);
948 }
949
950 /*
951  * Get/set/init sibling pointers
952  */
953 STATIC void
954 xfs_btree_get_sibling(
955         struct xfs_btree_cur    *cur,
956         struct xfs_btree_block  *block,
957         union xfs_btree_ptr     *ptr,
958         int                     lr)
959 {
960         ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
961
962         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
963                 if (lr == XFS_BB_RIGHTSIB)
964                         ptr->l = block->bb_u.l.bb_rightsib;
965                 else
966                         ptr->l = block->bb_u.l.bb_leftsib;
967         } else {
968                 if (lr == XFS_BB_RIGHTSIB)
969                         ptr->s = block->bb_u.s.bb_rightsib;
970                 else
971                         ptr->s = block->bb_u.s.bb_leftsib;
972         }
973 }
974
975 STATIC void
976 xfs_btree_set_sibling(
977         struct xfs_btree_cur    *cur,
978         struct xfs_btree_block  *block,
979         union xfs_btree_ptr     *ptr,
980         int                     lr)
981 {
982         ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
983
984         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
985                 if (lr == XFS_BB_RIGHTSIB)
986                         block->bb_u.l.bb_rightsib = ptr->l;
987                 else
988                         block->bb_u.l.bb_leftsib = ptr->l;
989         } else {
990                 if (lr == XFS_BB_RIGHTSIB)
991                         block->bb_u.s.bb_rightsib = ptr->s;
992                 else
993                         block->bb_u.s.bb_leftsib = ptr->s;
994         }
995 }
996
997 void
998 xfs_btree_init_block_int(
999         struct xfs_mount        *mp,
1000         struct xfs_btree_block  *buf,
1001         xfs_daddr_t             blkno,
1002         __u32                   magic,
1003         __u16                   level,
1004         __u16                   numrecs,
1005         __u64                   owner,
1006         unsigned int            flags)
1007 {
1008         buf->bb_magic = cpu_to_be32(magic);
1009         buf->bb_level = cpu_to_be16(level);
1010         buf->bb_numrecs = cpu_to_be16(numrecs);
1011
1012         if (flags & XFS_BTREE_LONG_PTRS) {
1013                 buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
1014                 buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1015                 if (flags & XFS_BTREE_CRC_BLOCKS) {
1016                         buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1017                         buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1018                         uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
1019                         buf->bb_u.l.bb_pad = 0;
1020                         buf->bb_u.l.bb_lsn = 0;
1021                 }
1022         } else {
1023                 /* owner is a 32 bit value on short blocks */
1024                 __u32 __owner = (__u32)owner;
1025
1026                 buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1027                 buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1028                 if (flags & XFS_BTREE_CRC_BLOCKS) {
1029                         buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1030                         buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1031                         uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
1032                         buf->bb_u.s.bb_lsn = 0;
1033                 }
1034         }
1035 }
1036
1037 void
1038 xfs_btree_init_block(
1039         struct xfs_mount *mp,
1040         struct xfs_buf  *bp,
1041         __u32           magic,
1042         __u16           level,
1043         __u16           numrecs,
1044         __u64           owner,
1045         unsigned int    flags)
1046 {
1047         xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1048                                  magic, level, numrecs, owner, flags);
1049 }
1050
1051 STATIC void
1052 xfs_btree_init_block_cur(
1053         struct xfs_btree_cur    *cur,
1054         struct xfs_buf          *bp,
1055         int                     level,
1056         int                     numrecs)
1057 {
1058         __u64 owner;
1059
1060         /*
1061          * we can pull the owner from the cursor right now as the different
1062          * owners align directly with the pointer size of the btree. This may
1063          * change in future, but is safe for current users of the generic btree
1064          * code.
1065          */
1066         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1067                 owner = cur->bc_private.b.ip->i_ino;
1068         else
1069                 owner = cur->bc_private.a.agno;
1070
1071         xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1072                                  xfs_btree_magic(cur), level, numrecs,
1073                                  owner, cur->bc_flags);
1074 }
1075
1076 /*
1077  * Return true if ptr is the last record in the btree and
1078  * we need to track updates to this record.  The decision
1079  * will be further refined in the update_lastrec method.
1080  */
1081 STATIC int
1082 xfs_btree_is_lastrec(
1083         struct xfs_btree_cur    *cur,
1084         struct xfs_btree_block  *block,
1085         int                     level)
1086 {
1087         union xfs_btree_ptr     ptr;
1088
1089         if (level > 0)
1090                 return 0;
1091         if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1092                 return 0;
1093
1094         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1095         if (!xfs_btree_ptr_is_null(cur, &ptr))
1096                 return 0;
1097         return 1;
1098 }
1099
1100 STATIC void
1101 xfs_btree_buf_to_ptr(
1102         struct xfs_btree_cur    *cur,
1103         struct xfs_buf          *bp,
1104         union xfs_btree_ptr     *ptr)
1105 {
1106         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1107                 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1108                                         XFS_BUF_ADDR(bp)));
1109         else {
1110                 ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1111                                         XFS_BUF_ADDR(bp)));
1112         }
1113 }
1114
1115 STATIC void
1116 xfs_btree_set_refs(
1117         struct xfs_btree_cur    *cur,
1118         struct xfs_buf          *bp)
1119 {
1120         switch (cur->bc_btnum) {
1121         case XFS_BTNUM_BNO:
1122         case XFS_BTNUM_CNT:
1123                 xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1124                 break;
1125         case XFS_BTNUM_INO:
1126         case XFS_BTNUM_FINO:
1127                 xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1128                 break;
1129         case XFS_BTNUM_BMAP:
1130                 xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1131                 break;
1132         default:
1133                 ASSERT(0);
1134         }
1135 }
1136
1137 STATIC int
1138 xfs_btree_get_buf_block(
1139         struct xfs_btree_cur    *cur,
1140         union xfs_btree_ptr     *ptr,
1141         int                     flags,
1142         struct xfs_btree_block  **block,
1143         struct xfs_buf          **bpp)
1144 {
1145         struct xfs_mount        *mp = cur->bc_mp;
1146         xfs_daddr_t             d;
1147
1148         /* need to sort out how callers deal with failures first */
1149         ASSERT(!(flags & XBF_TRYLOCK));
1150
1151         d = xfs_btree_ptr_to_daddr(cur, ptr);
1152         *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
1153                                  mp->m_bsize, flags);
1154
1155         if (!*bpp)
1156                 return -ENOMEM;
1157
1158         (*bpp)->b_ops = cur->bc_ops->buf_ops;
1159         *block = XFS_BUF_TO_BLOCK(*bpp);
1160         return 0;
1161 }
1162
1163 /*
1164  * Read in the buffer at the given ptr and return the buffer and
1165  * the block pointer within the buffer.
1166  */
1167 STATIC int
1168 xfs_btree_read_buf_block(
1169         struct xfs_btree_cur    *cur,
1170         union xfs_btree_ptr     *ptr,
1171         int                     flags,
1172         struct xfs_btree_block  **block,
1173         struct xfs_buf          **bpp)
1174 {
1175         struct xfs_mount        *mp = cur->bc_mp;
1176         xfs_daddr_t             d;
1177         int                     error;
1178
1179         /* need to sort out how callers deal with failures first */
1180         ASSERT(!(flags & XBF_TRYLOCK));
1181
1182         d = xfs_btree_ptr_to_daddr(cur, ptr);
1183         error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1184                                    mp->m_bsize, flags, bpp,
1185                                    cur->bc_ops->buf_ops);
1186         if (error)
1187                 return error;
1188
1189         xfs_btree_set_refs(cur, *bpp);
1190         *block = XFS_BUF_TO_BLOCK(*bpp);
1191         return 0;
1192 }
1193
1194 /*
1195  * Copy keys from one btree block to another.
1196  */
1197 STATIC void
1198 xfs_btree_copy_keys(
1199         struct xfs_btree_cur    *cur,
1200         union xfs_btree_key     *dst_key,
1201         union xfs_btree_key     *src_key,
1202         int                     numkeys)
1203 {
1204         ASSERT(numkeys >= 0);
1205         memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1206 }
1207
1208 /*
1209  * Copy records from one btree block to another.
1210  */
1211 STATIC void
1212 xfs_btree_copy_recs(
1213         struct xfs_btree_cur    *cur,
1214         union xfs_btree_rec     *dst_rec,
1215         union xfs_btree_rec     *src_rec,
1216         int                     numrecs)
1217 {
1218         ASSERT(numrecs >= 0);
1219         memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1220 }
1221
1222 /*
1223  * Copy block pointers from one btree block to another.
1224  */
1225 STATIC void
1226 xfs_btree_copy_ptrs(
1227         struct xfs_btree_cur    *cur,
1228         union xfs_btree_ptr     *dst_ptr,
1229         union xfs_btree_ptr     *src_ptr,
1230         int                     numptrs)
1231 {
1232         ASSERT(numptrs >= 0);
1233         memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1234 }
1235
1236 /*
1237  * Shift keys one index left/right inside a single btree block.
1238  */
1239 STATIC void
1240 xfs_btree_shift_keys(
1241         struct xfs_btree_cur    *cur,
1242         union xfs_btree_key     *key,
1243         int                     dir,
1244         int                     numkeys)
1245 {
1246         char                    *dst_key;
1247
1248         ASSERT(numkeys >= 0);
1249         ASSERT(dir == 1 || dir == -1);
1250
1251         dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1252         memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1253 }
1254
1255 /*
1256  * Shift records one index left/right inside a single btree block.
1257  */
1258 STATIC void
1259 xfs_btree_shift_recs(
1260         struct xfs_btree_cur    *cur,
1261         union xfs_btree_rec     *rec,
1262         int                     dir,
1263         int                     numrecs)
1264 {
1265         char                    *dst_rec;
1266
1267         ASSERT(numrecs >= 0);
1268         ASSERT(dir == 1 || dir == -1);
1269
1270         dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1271         memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1272 }
1273
1274 /*
1275  * Shift block pointers one index left/right inside a single btree block.
1276  */
1277 STATIC void
1278 xfs_btree_shift_ptrs(
1279         struct xfs_btree_cur    *cur,
1280         union xfs_btree_ptr     *ptr,
1281         int                     dir,
1282         int                     numptrs)
1283 {
1284         char                    *dst_ptr;
1285
1286         ASSERT(numptrs >= 0);
1287         ASSERT(dir == 1 || dir == -1);
1288
1289         dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1290         memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1291 }
1292
1293 /*
1294  * Log key values from the btree block.
1295  */
1296 STATIC void
1297 xfs_btree_log_keys(
1298         struct xfs_btree_cur    *cur,
1299         struct xfs_buf          *bp,
1300         int                     first,
1301         int                     last)
1302 {
1303         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1304         XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1305
1306         if (bp) {
1307                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1308                 xfs_trans_log_buf(cur->bc_tp, bp,
1309                                   xfs_btree_key_offset(cur, first),
1310                                   xfs_btree_key_offset(cur, last + 1) - 1);
1311         } else {
1312                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1313                                 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1314         }
1315
1316         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1317 }
1318
1319 /*
1320  * Log record values from the btree block.
1321  */
1322 void
1323 xfs_btree_log_recs(
1324         struct xfs_btree_cur    *cur,
1325         struct xfs_buf          *bp,
1326         int                     first,
1327         int                     last)
1328 {
1329         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1330         XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1331
1332         xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1333         xfs_trans_log_buf(cur->bc_tp, bp,
1334                           xfs_btree_rec_offset(cur, first),
1335                           xfs_btree_rec_offset(cur, last + 1) - 1);
1336
1337         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1338 }
1339
1340 /*
1341  * Log block pointer fields from a btree block (nonleaf).
1342  */
1343 STATIC void
1344 xfs_btree_log_ptrs(
1345         struct xfs_btree_cur    *cur,   /* btree cursor */
1346         struct xfs_buf          *bp,    /* buffer containing btree block */
1347         int                     first,  /* index of first pointer to log */
1348         int                     last)   /* index of last pointer to log */
1349 {
1350         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1351         XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1352
1353         if (bp) {
1354                 struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
1355                 int                     level = xfs_btree_get_level(block);
1356
1357                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1358                 xfs_trans_log_buf(cur->bc_tp, bp,
1359                                 xfs_btree_ptr_offset(cur, first, level),
1360                                 xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1361         } else {
1362                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1363                         xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1364         }
1365
1366         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1367 }
1368
1369 /*
1370  * Log fields from a btree block header.
1371  */
1372 void
1373 xfs_btree_log_block(
1374         struct xfs_btree_cur    *cur,   /* btree cursor */
1375         struct xfs_buf          *bp,    /* buffer containing btree block */
1376         int                     fields) /* mask of fields: XFS_BB_... */
1377 {
1378         int                     first;  /* first byte offset logged */
1379         int                     last;   /* last byte offset logged */
1380         static const short      soffsets[] = {  /* table of offsets (short) */
1381                 offsetof(struct xfs_btree_block, bb_magic),
1382                 offsetof(struct xfs_btree_block, bb_level),
1383                 offsetof(struct xfs_btree_block, bb_numrecs),
1384                 offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1385                 offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1386                 offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1387                 offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1388                 offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1389                 offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1390                 offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1391                 XFS_BTREE_SBLOCK_CRC_LEN
1392         };
1393         static const short      loffsets[] = {  /* table of offsets (long) */
1394                 offsetof(struct xfs_btree_block, bb_magic),
1395                 offsetof(struct xfs_btree_block, bb_level),
1396                 offsetof(struct xfs_btree_block, bb_numrecs),
1397                 offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1398                 offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1399                 offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1400                 offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1401                 offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1402                 offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1403                 offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1404                 offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1405                 XFS_BTREE_LBLOCK_CRC_LEN
1406         };
1407
1408         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1409         XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
1410
1411         if (bp) {
1412                 int nbits;
1413
1414                 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1415                         /*
1416                          * We don't log the CRC when updating a btree
1417                          * block but instead recreate it during log
1418                          * recovery.  As the log buffers have checksums
1419                          * of their own this is safe and avoids logging a crc
1420                          * update in a lot of places.
1421                          */
1422                         if (fields == XFS_BB_ALL_BITS)
1423                                 fields = XFS_BB_ALL_BITS_CRC;
1424                         nbits = XFS_BB_NUM_BITS_CRC;
1425                 } else {
1426                         nbits = XFS_BB_NUM_BITS;
1427                 }
1428                 xfs_btree_offsets(fields,
1429                                   (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1430                                         loffsets : soffsets,
1431                                   nbits, &first, &last);
1432                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1433                 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1434         } else {
1435                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1436                         xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1437         }
1438
1439         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1440 }
1441
1442 /*
1443  * Increment cursor by one record at the level.
1444  * For nonzero levels the leaf-ward information is untouched.
1445  */
1446 int                                             /* error */
1447 xfs_btree_increment(
1448         struct xfs_btree_cur    *cur,
1449         int                     level,
1450         int                     *stat)          /* success/failure */
1451 {
1452         struct xfs_btree_block  *block;
1453         union xfs_btree_ptr     ptr;
1454         struct xfs_buf          *bp;
1455         int                     error;          /* error return value */
1456         int                     lev;
1457
1458         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1459         XFS_BTREE_TRACE_ARGI(cur, level);
1460
1461         ASSERT(level < cur->bc_nlevels);
1462
1463         /* Read-ahead to the right at this level. */
1464         xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1465
1466         /* Get a pointer to the btree block. */
1467         block = xfs_btree_get_block(cur, level, &bp);
1468
1469 #ifdef DEBUG
1470         error = xfs_btree_check_block(cur, block, level, bp);
1471         if (error)
1472                 goto error0;
1473 #endif
1474
1475         /* We're done if we remain in the block after the increment. */
1476         if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1477                 goto out1;
1478
1479         /* Fail if we just went off the right edge of the tree. */
1480         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1481         if (xfs_btree_ptr_is_null(cur, &ptr))
1482                 goto out0;
1483
1484         XFS_BTREE_STATS_INC(cur, increment);
1485
1486         /*
1487          * March up the tree incrementing pointers.
1488          * Stop when we don't go off the right edge of a block.
1489          */
1490         for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1491                 block = xfs_btree_get_block(cur, lev, &bp);
1492
1493 #ifdef DEBUG
1494                 error = xfs_btree_check_block(cur, block, lev, bp);
1495                 if (error)
1496                         goto error0;
1497 #endif
1498
1499                 if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1500                         break;
1501
1502                 /* Read-ahead the right block for the next loop. */
1503                 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1504         }
1505
1506         /*
1507          * If we went off the root then we are either seriously
1508          * confused or have the tree root in an inode.
1509          */
1510         if (lev == cur->bc_nlevels) {
1511                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1512                         goto out0;
1513                 ASSERT(0);
1514                 error = -EFSCORRUPTED;
1515                 goto error0;
1516         }
1517         ASSERT(lev < cur->bc_nlevels);
1518
1519         /*
1520          * Now walk back down the tree, fixing up the cursor's buffer
1521          * pointers and key numbers.
1522          */
1523         for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1524                 union xfs_btree_ptr     *ptrp;
1525
1526                 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1527                 --lev;
1528                 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1529                 if (error)
1530                         goto error0;
1531
1532                 xfs_btree_setbuf(cur, lev, bp);
1533                 cur->bc_ptrs[lev] = 1;
1534         }
1535 out1:
1536         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1537         *stat = 1;
1538         return 0;
1539
1540 out0:
1541         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1542         *stat = 0;
1543         return 0;
1544
1545 error0:
1546         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1547         return error;
1548 }
1549
1550 /*
1551  * Decrement cursor by one record at the level.
1552  * For nonzero levels the leaf-ward information is untouched.
1553  */
1554 int                                             /* error */
1555 xfs_btree_decrement(
1556         struct xfs_btree_cur    *cur,
1557         int                     level,
1558         int                     *stat)          /* success/failure */
1559 {
1560         struct xfs_btree_block  *block;
1561         xfs_buf_t               *bp;
1562         int                     error;          /* error return value */
1563         int                     lev;
1564         union xfs_btree_ptr     ptr;
1565
1566         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1567         XFS_BTREE_TRACE_ARGI(cur, level);
1568
1569         ASSERT(level < cur->bc_nlevels);
1570
1571         /* Read-ahead to the left at this level. */
1572         xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1573
1574         /* We're done if we remain in the block after the decrement. */
1575         if (--cur->bc_ptrs[level] > 0)
1576                 goto out1;
1577
1578         /* Get a pointer to the btree block. */
1579         block = xfs_btree_get_block(cur, level, &bp);
1580
1581 #ifdef DEBUG
1582         error = xfs_btree_check_block(cur, block, level, bp);
1583         if (error)
1584                 goto error0;
1585 #endif
1586
1587         /* Fail if we just went off the left edge of the tree. */
1588         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1589         if (xfs_btree_ptr_is_null(cur, &ptr))
1590                 goto out0;
1591
1592         XFS_BTREE_STATS_INC(cur, decrement);
1593
1594         /*
1595          * March up the tree decrementing pointers.
1596          * Stop when we don't go off the left edge of a block.
1597          */
1598         for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1599                 if (--cur->bc_ptrs[lev] > 0)
1600                         break;
1601                 /* Read-ahead the left block for the next loop. */
1602                 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1603         }
1604
1605         /*
1606          * If we went off the root then we are seriously confused.
1607          * or the root of the tree is in an inode.
1608          */
1609         if (lev == cur->bc_nlevels) {
1610                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1611                         goto out0;
1612                 ASSERT(0);
1613                 error = -EFSCORRUPTED;
1614                 goto error0;
1615         }
1616         ASSERT(lev < cur->bc_nlevels);
1617
1618         /*
1619          * Now walk back down the tree, fixing up the cursor's buffer
1620          * pointers and key numbers.
1621          */
1622         for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1623                 union xfs_btree_ptr     *ptrp;
1624
1625                 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1626                 --lev;
1627                 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1628                 if (error)
1629                         goto error0;
1630                 xfs_btree_setbuf(cur, lev, bp);
1631                 cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1632         }
1633 out1:
1634         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1635         *stat = 1;
1636         return 0;
1637
1638 out0:
1639         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1640         *stat = 0;
1641         return 0;
1642
1643 error0:
1644         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1645         return error;
1646 }
1647
1648 STATIC int
1649 xfs_btree_lookup_get_block(
1650         struct xfs_btree_cur    *cur,   /* btree cursor */
1651         int                     level,  /* level in the btree */
1652         union xfs_btree_ptr     *pp,    /* ptr to btree block */
1653         struct xfs_btree_block  **blkp) /* return btree block */
1654 {
1655         struct xfs_buf          *bp;    /* buffer pointer for btree block */
1656         int                     error = 0;
1657
1658         /* special case the root block if in an inode */
1659         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1660             (level == cur->bc_nlevels - 1)) {
1661                 *blkp = xfs_btree_get_iroot(cur);
1662                 return 0;
1663         }
1664
1665         /*
1666          * If the old buffer at this level for the disk address we are
1667          * looking for re-use it.
1668          *
1669          * Otherwise throw it away and get a new one.
1670          */
1671         bp = cur->bc_bufs[level];
1672         if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
1673                 *blkp = XFS_BUF_TO_BLOCK(bp);
1674                 return 0;
1675         }
1676
1677         error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
1678         if (error)
1679                 return error;
1680
1681         xfs_btree_setbuf(cur, level, bp);
1682         return 0;
1683 }
1684
1685 /*
1686  * Get current search key.  For level 0 we don't actually have a key
1687  * structure so we make one up from the record.  For all other levels
1688  * we just return the right key.
1689  */
1690 STATIC union xfs_btree_key *
1691 xfs_lookup_get_search_key(
1692         struct xfs_btree_cur    *cur,
1693         int                     level,
1694         int                     keyno,
1695         struct xfs_btree_block  *block,
1696         union xfs_btree_key     *kp)
1697 {
1698         if (level == 0) {
1699                 cur->bc_ops->init_key_from_rec(kp,
1700                                 xfs_btree_rec_addr(cur, keyno, block));
1701                 return kp;
1702         }
1703
1704         return xfs_btree_key_addr(cur, keyno, block);
1705 }
1706
1707 /*
1708  * Lookup the record.  The cursor is made to point to it, based on dir.
1709  * stat is set to 0 if can't find any such record, 1 for success.
1710  */
1711 int                                     /* error */
1712 xfs_btree_lookup(
1713         struct xfs_btree_cur    *cur,   /* btree cursor */
1714         xfs_lookup_t            dir,    /* <=, ==, or >= */
1715         int                     *stat)  /* success/failure */
1716 {
1717         struct xfs_btree_block  *block; /* current btree block */
1718         __int64_t               diff;   /* difference for the current key */
1719         int                     error;  /* error return value */
1720         int                     keyno;  /* current key number */
1721         int                     level;  /* level in the btree */
1722         union xfs_btree_ptr     *pp;    /* ptr to btree block */
1723         union xfs_btree_ptr     ptr;    /* ptr to btree block */
1724
1725         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1726         XFS_BTREE_TRACE_ARGI(cur, dir);
1727
1728         XFS_BTREE_STATS_INC(cur, lookup);
1729
1730         block = NULL;
1731         keyno = 0;
1732
1733         /* initialise start pointer from cursor */
1734         cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1735         pp = &ptr;
1736
1737         /*
1738          * Iterate over each level in the btree, starting at the root.
1739          * For each level above the leaves, find the key we need, based
1740          * on the lookup record, then follow the corresponding block
1741          * pointer down to the next level.
1742          */
1743         for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1744                 /* Get the block we need to do the lookup on. */
1745                 error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1746                 if (error)
1747                         goto error0;
1748
1749                 if (diff == 0) {
1750                         /*
1751                          * If we already had a key match at a higher level, we
1752                          * know we need to use the first entry in this block.
1753                          */
1754                         keyno = 1;
1755                 } else {
1756                         /* Otherwise search this block. Do a binary search. */
1757
1758                         int     high;   /* high entry number */
1759                         int     low;    /* low entry number */
1760
1761                         /* Set low and high entry numbers, 1-based. */
1762                         low = 1;
1763                         high = xfs_btree_get_numrecs(block);
1764                         if (!high) {
1765                                 /* Block is empty, must be an empty leaf. */
1766                                 ASSERT(level == 0 && cur->bc_nlevels == 1);
1767
1768                                 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1769                                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1770                                 *stat = 0;
1771                                 return 0;
1772                         }
1773
1774                         /* Binary search the block. */
1775                         while (low <= high) {
1776                                 union xfs_btree_key     key;
1777                                 union xfs_btree_key     *kp;
1778
1779                                 XFS_BTREE_STATS_INC(cur, compare);
1780
1781                                 /* keyno is average of low and high. */
1782                                 keyno = (low + high) >> 1;
1783
1784                                 /* Get current search key */
1785                                 kp = xfs_lookup_get_search_key(cur, level,
1786                                                 keyno, block, &key);
1787
1788                                 /*
1789                                  * Compute difference to get next direction:
1790                                  *  - less than, move right
1791                                  *  - greater than, move left
1792                                  *  - equal, we're done
1793                                  */
1794                                 diff = cur->bc_ops->key_diff(cur, kp);
1795                                 if (diff < 0)
1796                                         low = keyno + 1;
1797                                 else if (diff > 0)
1798                                         high = keyno - 1;
1799                                 else
1800                                         break;
1801                         }
1802                 }
1803
1804                 /*
1805                  * If there are more levels, set up for the next level
1806                  * by getting the block number and filling in the cursor.
1807                  */
1808                 if (level > 0) {
1809                         /*
1810                          * If we moved left, need the previous key number,
1811                          * unless there isn't one.
1812                          */
1813                         if (diff > 0 && --keyno < 1)
1814                                 keyno = 1;
1815                         pp = xfs_btree_ptr_addr(cur, keyno, block);
1816
1817 #ifdef DEBUG
1818                         error = xfs_btree_check_ptr(cur, pp, 0, level);
1819                         if (error)
1820                                 goto error0;
1821 #endif
1822                         cur->bc_ptrs[level] = keyno;
1823                 }
1824         }
1825
1826         /* Done with the search. See if we need to adjust the results. */
1827         if (dir != XFS_LOOKUP_LE && diff < 0) {
1828                 keyno++;
1829                 /*
1830                  * If ge search and we went off the end of the block, but it's
1831                  * not the last block, we're in the wrong block.
1832                  */
1833                 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1834                 if (dir == XFS_LOOKUP_GE &&
1835                     keyno > xfs_btree_get_numrecs(block) &&
1836                     !xfs_btree_ptr_is_null(cur, &ptr)) {
1837                         int     i;
1838
1839                         cur->bc_ptrs[0] = keyno;
1840                         error = xfs_btree_increment(cur, 0, &i);
1841                         if (error)
1842                                 goto error0;
1843                         XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1844                         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1845                         *stat = 1;
1846                         return 0;
1847                 }
1848         } else if (dir == XFS_LOOKUP_LE && diff > 0)
1849                 keyno--;
1850         cur->bc_ptrs[0] = keyno;
1851
1852         /* Return if we succeeded or not. */
1853         if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1854                 *stat = 0;
1855         else if (dir != XFS_LOOKUP_EQ || diff == 0)
1856                 *stat = 1;
1857         else
1858                 *stat = 0;
1859         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1860         return 0;
1861
1862 error0:
1863         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1864         return error;
1865 }
1866
1867 /*
1868  * Update keys at all levels from here to the root along the cursor's path.
1869  */
1870 STATIC int
1871 xfs_btree_updkey(
1872         struct xfs_btree_cur    *cur,
1873         union xfs_btree_key     *keyp,
1874         int                     level)
1875 {
1876         struct xfs_btree_block  *block;
1877         struct xfs_buf          *bp;
1878         union xfs_btree_key     *kp;
1879         int                     ptr;
1880
1881         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1882         XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
1883
1884         ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
1885
1886         /*
1887          * Go up the tree from this level toward the root.
1888          * At each level, update the key value to the value input.
1889          * Stop when we reach a level where the cursor isn't pointing
1890          * at the first entry in the block.
1891          */
1892         for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1893 #ifdef DEBUG
1894                 int             error;
1895 #endif
1896                 block = xfs_btree_get_block(cur, level, &bp);
1897 #ifdef DEBUG
1898                 error = xfs_btree_check_block(cur, block, level, bp);
1899                 if (error) {
1900                         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1901                         return error;
1902                 }
1903 #endif
1904                 ptr = cur->bc_ptrs[level];
1905                 kp = xfs_btree_key_addr(cur, ptr, block);
1906                 xfs_btree_copy_keys(cur, kp, keyp, 1);
1907                 xfs_btree_log_keys(cur, bp, ptr, ptr);
1908         }
1909
1910         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1911         return 0;
1912 }
1913
1914 /*
1915  * Update the record referred to by cur to the value in the
1916  * given record. This either works (return 0) or gets an
1917  * EFSCORRUPTED error.
1918  */
1919 int
1920 xfs_btree_update(
1921         struct xfs_btree_cur    *cur,
1922         union xfs_btree_rec     *rec)
1923 {
1924         struct xfs_btree_block  *block;
1925         struct xfs_buf          *bp;
1926         int                     error;
1927         int                     ptr;
1928         union xfs_btree_rec     *rp;
1929
1930         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1931         XFS_BTREE_TRACE_ARGR(cur, rec);
1932
1933         /* Pick up the current block. */
1934         block = xfs_btree_get_block(cur, 0, &bp);
1935
1936 #ifdef DEBUG
1937         error = xfs_btree_check_block(cur, block, 0, bp);
1938         if (error)
1939                 goto error0;
1940 #endif
1941         /* Get the address of the rec to be updated. */
1942         ptr = cur->bc_ptrs[0];
1943         rp = xfs_btree_rec_addr(cur, ptr, block);
1944
1945         /* Fill in the new contents and log them. */
1946         xfs_btree_copy_recs(cur, rp, rec, 1);
1947         xfs_btree_log_recs(cur, bp, ptr, ptr);
1948
1949         /*
1950          * If we are tracking the last record in the tree and
1951          * we are at the far right edge of the tree, update it.
1952          */
1953         if (xfs_btree_is_lastrec(cur, block, 0)) {
1954                 cur->bc_ops->update_lastrec(cur, block, rec,
1955                                             ptr, LASTREC_UPDATE);
1956         }
1957
1958         /* Updating first rec in leaf. Pass new key value up to our parent. */
1959         if (ptr == 1) {
1960                 union xfs_btree_key     key;
1961
1962                 cur->bc_ops->init_key_from_rec(&key, rec);
1963                 error = xfs_btree_updkey(cur, &key, 1);
1964                 if (error)
1965                         goto error0;
1966         }
1967
1968         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1969         return 0;
1970
1971 error0:
1972         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1973         return error;
1974 }
1975
1976 /*
1977  * Move 1 record left from cur/level if possible.
1978  * Update cur to reflect the new path.
1979  */
1980 STATIC int                                      /* error */
1981 xfs_btree_lshift(
1982         struct xfs_btree_cur    *cur,
1983         int                     level,
1984         int                     *stat)          /* success/failure */
1985 {
1986         union xfs_btree_key     key;            /* btree key */
1987         struct xfs_buf          *lbp;           /* left buffer pointer */
1988         struct xfs_btree_block  *left;          /* left btree block */
1989         int                     lrecs;          /* left record count */
1990         struct xfs_buf          *rbp;           /* right buffer pointer */
1991         struct xfs_btree_block  *right;         /* right btree block */
1992         int                     rrecs;          /* right record count */
1993         union xfs_btree_ptr     lptr;           /* left btree pointer */
1994         union xfs_btree_key     *rkp = NULL;    /* right btree key */
1995         union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
1996         union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
1997         int                     error;          /* error return value */
1998
1999         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2000         XFS_BTREE_TRACE_ARGI(cur, level);
2001
2002         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2003             level == cur->bc_nlevels - 1)
2004                 goto out0;
2005
2006         /* Set up variables for this block as "right". */
2007         right = xfs_btree_get_block(cur, level, &rbp);
2008
2009 #ifdef DEBUG
2010         error = xfs_btree_check_block(cur, right, level, rbp);
2011         if (error)
2012                 goto error0;
2013 #endif
2014
2015         /* If we've got no left sibling then we can't shift an entry left. */
2016         xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2017         if (xfs_btree_ptr_is_null(cur, &lptr))
2018                 goto out0;
2019
2020         /*
2021          * If the cursor entry is the one that would be moved, don't
2022          * do it... it's too complicated.
2023          */
2024         if (cur->bc_ptrs[level] <= 1)
2025                 goto out0;
2026
2027         /* Set up the left neighbor as "left". */
2028         error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2029         if (error)
2030                 goto error0;
2031
2032         /* If it's full, it can't take another entry. */
2033         lrecs = xfs_btree_get_numrecs(left);
2034         if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2035                 goto out0;
2036
2037         rrecs = xfs_btree_get_numrecs(right);
2038
2039         /*
2040          * We add one entry to the left side and remove one for the right side.
2041          * Account for it here, the changes will be updated on disk and logged
2042          * later.
2043          */
2044         lrecs++;
2045         rrecs--;
2046
2047         XFS_BTREE_STATS_INC(cur, lshift);
2048         XFS_BTREE_STATS_ADD(cur, moves, 1);
2049
2050         /*
2051          * If non-leaf, copy a key and a ptr to the left block.
2052          * Log the changes to the left block.
2053          */
2054         if (level > 0) {
2055                 /* It's a non-leaf.  Move keys and pointers. */
2056                 union xfs_btree_key     *lkp;   /* left btree key */
2057                 union xfs_btree_ptr     *lpp;   /* left address pointer */
2058
2059                 lkp = xfs_btree_key_addr(cur, lrecs, left);
2060                 rkp = xfs_btree_key_addr(cur, 1, right);
2061
2062                 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2063                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2064 #ifdef DEBUG
2065                 error = xfs_btree_check_ptr(cur, rpp, 0, level);
2066                 if (error)
2067                         goto error0;
2068 #endif
2069                 xfs_btree_copy_keys(cur, lkp, rkp, 1);
2070                 xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2071
2072                 xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2073                 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2074
2075                 ASSERT(cur->bc_ops->keys_inorder(cur,
2076                         xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2077         } else {
2078                 /* It's a leaf.  Move records.  */
2079                 union xfs_btree_rec     *lrp;   /* left record pointer */
2080
2081                 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2082                 rrp = xfs_btree_rec_addr(cur, 1, right);
2083
2084                 xfs_btree_copy_recs(cur, lrp, rrp, 1);
2085                 xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2086
2087                 ASSERT(cur->bc_ops->recs_inorder(cur,
2088                         xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2089         }
2090
2091         xfs_btree_set_numrecs(left, lrecs);
2092         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2093
2094         xfs_btree_set_numrecs(right, rrecs);
2095         xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2096
2097         /*
2098          * Slide the contents of right down one entry.
2099          */
2100         XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2101         if (level > 0) {
2102                 /* It's a nonleaf. operate on keys and ptrs */
2103 #ifdef DEBUG
2104                 int                     i;              /* loop index */
2105
2106                 for (i = 0; i < rrecs; i++) {
2107                         error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
2108                         if (error)
2109                                 goto error0;
2110                 }
2111 #endif
2112                 xfs_btree_shift_keys(cur,
2113                                 xfs_btree_key_addr(cur, 2, right),
2114                                 -1, rrecs);
2115                 xfs_btree_shift_ptrs(cur,
2116                                 xfs_btree_ptr_addr(cur, 2, right),
2117                                 -1, rrecs);
2118
2119                 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2120                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2121         } else {
2122                 /* It's a leaf. operate on records */
2123                 xfs_btree_shift_recs(cur,
2124                         xfs_btree_rec_addr(cur, 2, right),
2125                         -1, rrecs);
2126                 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2127
2128                 /*
2129                  * If it's the first record in the block, we'll need a key
2130                  * structure to pass up to the next level (updkey).
2131                  */
2132                 cur->bc_ops->init_key_from_rec(&key,
2133                         xfs_btree_rec_addr(cur, 1, right));
2134                 rkp = &key;
2135         }
2136
2137         /* Update the parent key values of right. */
2138         error = xfs_btree_updkey(cur, rkp, level + 1);
2139         if (error)
2140                 goto error0;
2141
2142         /* Slide the cursor value left one. */
2143         cur->bc_ptrs[level]--;
2144
2145         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2146         *stat = 1;
2147         return 0;
2148
2149 out0:
2150         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2151         *stat = 0;
2152         return 0;
2153
2154 error0:
2155         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2156         return error;
2157 }
2158
2159 /*
2160  * Move 1 record right from cur/level if possible.
2161  * Update cur to reflect the new path.
2162  */
2163 STATIC int                                      /* error */
2164 xfs_btree_rshift(
2165         struct xfs_btree_cur    *cur,
2166         int                     level,
2167         int                     *stat)          /* success/failure */
2168 {
2169         union xfs_btree_key     key;            /* btree key */
2170         struct xfs_buf          *lbp;           /* left buffer pointer */
2171         struct xfs_btree_block  *left;          /* left btree block */
2172         struct xfs_buf          *rbp;           /* right buffer pointer */
2173         struct xfs_btree_block  *right;         /* right btree block */
2174         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2175         union xfs_btree_ptr     rptr;           /* right block pointer */
2176         union xfs_btree_key     *rkp;           /* right btree key */
2177         int                     rrecs;          /* right record count */
2178         int                     lrecs;          /* left record count */
2179         int                     error;          /* error return value */
2180         int                     i;              /* loop counter */
2181
2182         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2183         XFS_BTREE_TRACE_ARGI(cur, level);
2184
2185         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2186             (level == cur->bc_nlevels - 1))
2187                 goto out0;
2188
2189         /* Set up variables for this block as "left". */
2190         left = xfs_btree_get_block(cur, level, &lbp);
2191
2192 #ifdef DEBUG
2193         error = xfs_btree_check_block(cur, left, level, lbp);
2194         if (error)
2195                 goto error0;
2196 #endif
2197
2198         /* If we've got no right sibling then we can't shift an entry right. */
2199         xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2200         if (xfs_btree_ptr_is_null(cur, &rptr))
2201                 goto out0;
2202
2203         /*
2204          * If the cursor entry is the one that would be moved, don't
2205          * do it... it's too complicated.
2206          */
2207         lrecs = xfs_btree_get_numrecs(left);
2208         if (cur->bc_ptrs[level] >= lrecs)
2209                 goto out0;
2210
2211         /* Set up the right neighbor as "right". */
2212         error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2213         if (error)
2214                 goto error0;
2215
2216         /* If it's full, it can't take another entry. */
2217         rrecs = xfs_btree_get_numrecs(right);
2218         if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2219                 goto out0;
2220
2221         XFS_BTREE_STATS_INC(cur, rshift);
2222         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2223
2224         /*
2225          * Make a hole at the start of the right neighbor block, then
2226          * copy the last left block entry to the hole.
2227          */
2228         if (level > 0) {
2229                 /* It's a nonleaf. make a hole in the keys and ptrs */
2230                 union xfs_btree_key     *lkp;
2231                 union xfs_btree_ptr     *lpp;
2232                 union xfs_btree_ptr     *rpp;
2233
2234                 lkp = xfs_btree_key_addr(cur, lrecs, left);
2235                 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2236                 rkp = xfs_btree_key_addr(cur, 1, right);
2237                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2238
2239 #ifdef DEBUG
2240                 for (i = rrecs - 1; i >= 0; i--) {
2241                         error = xfs_btree_check_ptr(cur, rpp, i, level);
2242                         if (error)
2243                                 goto error0;
2244                 }
2245 #endif
2246
2247                 xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2248                 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2249
2250 #ifdef DEBUG
2251                 error = xfs_btree_check_ptr(cur, lpp, 0, level);
2252                 if (error)
2253                         goto error0;
2254 #endif
2255
2256                 /* Now put the new data in, and log it. */
2257                 xfs_btree_copy_keys(cur, rkp, lkp, 1);
2258                 xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2259
2260                 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2261                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2262
2263                 ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2264                         xfs_btree_key_addr(cur, 2, right)));
2265         } else {
2266                 /* It's a leaf. make a hole in the records */
2267                 union xfs_btree_rec     *lrp;
2268                 union xfs_btree_rec     *rrp;
2269
2270                 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2271                 rrp = xfs_btree_rec_addr(cur, 1, right);
2272
2273                 xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2274
2275                 /* Now put the new data in, and log it. */
2276                 xfs_btree_copy_recs(cur, rrp, lrp, 1);
2277                 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2278
2279                 cur->bc_ops->init_key_from_rec(&key, rrp);
2280                 rkp = &key;
2281
2282                 ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
2283                         xfs_btree_rec_addr(cur, 2, right)));
2284         }
2285
2286         /*
2287          * Decrement and log left's numrecs, bump and log right's numrecs.
2288          */
2289         xfs_btree_set_numrecs(left, --lrecs);
2290         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2291
2292         xfs_btree_set_numrecs(right, ++rrecs);
2293         xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2294
2295         /*
2296          * Using a temporary cursor, update the parent key values of the
2297          * block on the right.
2298          */
2299         error = xfs_btree_dup_cursor(cur, &tcur);
2300         if (error)
2301                 goto error0;
2302         i = xfs_btree_lastrec(tcur, level);
2303         XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
2304
2305         error = xfs_btree_increment(tcur, level, &i);
2306         if (error)
2307                 goto error1;
2308
2309         error = xfs_btree_updkey(tcur, rkp, level + 1);
2310         if (error)
2311                 goto error1;
2312
2313         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2314
2315         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2316         *stat = 1;
2317         return 0;
2318
2319 out0:
2320         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2321         *stat = 0;
2322         return 0;
2323
2324 error0:
2325         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2326         return error;
2327
2328 error1:
2329         XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
2330         xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2331         return error;
2332 }
2333
2334 /*
2335  * Split cur/level block in half.
2336  * Return new block number and the key to its first
2337  * record (to be inserted into parent).
2338  */
2339 STATIC int                                      /* error */
2340 __xfs_btree_split(
2341         struct xfs_btree_cur    *cur,
2342         int                     level,
2343         union xfs_btree_ptr     *ptrp,
2344         union xfs_btree_key     *key,
2345         struct xfs_btree_cur    **curp,
2346         int                     *stat)          /* success/failure */
2347 {
2348         union xfs_btree_ptr     lptr;           /* left sibling block ptr */
2349         struct xfs_buf          *lbp;           /* left buffer pointer */
2350         struct xfs_btree_block  *left;          /* left btree block */
2351         union xfs_btree_ptr     rptr;           /* right sibling block ptr */
2352         struct xfs_buf          *rbp;           /* right buffer pointer */
2353         struct xfs_btree_block  *right;         /* right btree block */
2354         union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
2355         struct xfs_buf          *rrbp;          /* right-right buffer pointer */
2356         struct xfs_btree_block  *rrblock;       /* right-right btree block */
2357         int                     lrecs;
2358         int                     rrecs;
2359         int                     src_index;
2360         int                     error;          /* error return value */
2361 #ifdef DEBUG
2362         int                     i;
2363 #endif
2364
2365         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2366         XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
2367
2368         XFS_BTREE_STATS_INC(cur, split);
2369
2370         /* Set up left block (current one). */
2371         left = xfs_btree_get_block(cur, level, &lbp);
2372
2373 #ifdef DEBUG
2374         error = xfs_btree_check_block(cur, left, level, lbp);
2375         if (error)
2376                 goto error0;
2377 #endif
2378
2379         xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2380
2381         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2382         error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
2383         if (error)
2384                 goto error0;
2385         if (*stat == 0)
2386                 goto out0;
2387         XFS_BTREE_STATS_INC(cur, alloc);
2388
2389         /* Set up the new block as "right". */
2390         error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
2391         if (error)
2392                 goto error0;
2393
2394         /* Fill in the btree header for the new right block. */
2395         xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2396
2397         /*
2398          * Split the entries between the old and the new block evenly.
2399          * Make sure that if there's an odd number of entries now, that
2400          * each new block will have the same number of entries.
2401          */
2402         lrecs = xfs_btree_get_numrecs(left);
2403         rrecs = lrecs / 2;
2404         if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2405                 rrecs++;
2406         src_index = (lrecs - rrecs + 1);
2407
2408         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2409
2410         /*
2411          * Copy btree block entries from the left block over to the
2412          * new block, the right. Update the right block and log the
2413          * changes.
2414          */
2415         if (level > 0) {
2416                 /* It's a non-leaf.  Move keys and pointers. */
2417                 union xfs_btree_key     *lkp;   /* left btree key */
2418                 union xfs_btree_ptr     *lpp;   /* left address pointer */
2419                 union xfs_btree_key     *rkp;   /* right btree key */
2420                 union xfs_btree_ptr     *rpp;   /* right address pointer */
2421
2422                 lkp = xfs_btree_key_addr(cur, src_index, left);
2423                 lpp = xfs_btree_ptr_addr(cur, src_index, left);
2424                 rkp = xfs_btree_key_addr(cur, 1, right);
2425                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2426
2427 #ifdef DEBUG
2428                 for (i = src_index; i < rrecs; i++) {
2429                         error = xfs_btree_check_ptr(cur, lpp, i, level);
2430                         if (error)
2431                                 goto error0;
2432                 }
2433 #endif
2434
2435                 xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2436                 xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2437
2438                 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2439                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2440
2441                 /* Grab the keys to the entries moved to the right block */
2442                 xfs_btree_copy_keys(cur, key, rkp, 1);
2443         } else {
2444                 /* It's a leaf.  Move records.  */
2445                 union xfs_btree_rec     *lrp;   /* left record pointer */
2446                 union xfs_btree_rec     *rrp;   /* right record pointer */
2447
2448                 lrp = xfs_btree_rec_addr(cur, src_index, left);
2449                 rrp = xfs_btree_rec_addr(cur, 1, right);
2450
2451                 xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2452                 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2453
2454                 cur->bc_ops->init_key_from_rec(key,
2455                         xfs_btree_rec_addr(cur, 1, right));
2456         }
2457
2458
2459         /*
2460          * Find the left block number by looking in the buffer.
2461          * Adjust numrecs, sibling pointers.
2462          */
2463         xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2464         xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2465         xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2466         xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2467
2468         lrecs -= rrecs;
2469         xfs_btree_set_numrecs(left, lrecs);
2470         xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2471
2472         xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2473         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2474
2475         /*
2476          * If there's a block to the new block's right, make that block
2477          * point back to right instead of to left.
2478          */
2479         if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2480                 error = xfs_btree_read_buf_block(cur, &rrptr,
2481                                                         0, &rrblock, &rrbp);
2482                 if (error)
2483                         goto error0;
2484                 xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2485                 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2486         }
2487         /*
2488          * If the cursor is really in the right block, move it there.
2489          * If it's just pointing past the last entry in left, then we'll
2490          * insert there, so don't change anything in that case.
2491          */
2492         if (cur->bc_ptrs[level] > lrecs + 1) {
2493                 xfs_btree_setbuf(cur, level, rbp);
2494                 cur->bc_ptrs[level] -= lrecs;
2495         }
2496         /*
2497          * If there are more levels, we'll need another cursor which refers
2498          * the right block, no matter where this cursor was.
2499          */
2500         if (level + 1 < cur->bc_nlevels) {
2501                 error = xfs_btree_dup_cursor(cur, curp);
2502                 if (error)
2503                         goto error0;
2504                 (*curp)->bc_ptrs[level + 1]++;
2505         }
2506         *ptrp = rptr;
2507         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2508         *stat = 1;
2509         return 0;
2510 out0:
2511         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2512         *stat = 0;
2513         return 0;
2514
2515 error0:
2516         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2517         return error;
2518 }
2519
2520 struct xfs_btree_split_args {
2521         struct xfs_btree_cur    *cur;
2522         int                     level;
2523         union xfs_btree_ptr     *ptrp;
2524         union xfs_btree_key     *key;
2525         struct xfs_btree_cur    **curp;
2526         int                     *stat;          /* success/failure */
2527         int                     result;
2528         bool                    kswapd; /* allocation in kswapd context */
2529         struct completion       *done;
2530         struct work_struct      work;
2531 };
2532
2533 /*
2534  * Stack switching interfaces for allocation
2535  */
2536 static void
2537 xfs_btree_split_worker(
2538         struct work_struct      *work)
2539 {
2540         struct xfs_btree_split_args     *args = container_of(work,
2541                                                 struct xfs_btree_split_args, work);
2542         unsigned long           pflags;
2543         unsigned long           new_pflags = PF_FSTRANS;
2544
2545         /*
2546          * we are in a transaction context here, but may also be doing work
2547          * in kswapd context, and hence we may need to inherit that state
2548          * temporarily to ensure that we don't block waiting for memory reclaim
2549          * in any way.
2550          */
2551         if (args->kswapd)
2552                 new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2553
2554         current_set_flags_nested(&pflags, new_pflags);
2555
2556         args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
2557                                          args->key, args->curp, args->stat);
2558         complete(args->done);
2559
2560         current_restore_flags_nested(&pflags, new_pflags);
2561 }
2562
2563 /*
2564  * BMBT split requests often come in with little stack to work on. Push
2565  * them off to a worker thread so there is lots of stack to use. For the other
2566  * btree types, just call directly to avoid the context switch overhead here.
2567  */
2568 STATIC int                                      /* error */
2569 xfs_btree_split(
2570         struct xfs_btree_cur    *cur,
2571         int                     level,
2572         union xfs_btree_ptr     *ptrp,
2573         union xfs_btree_key     *key,
2574         struct xfs_btree_cur    **curp,
2575         int                     *stat)          /* success/failure */
2576 {
2577         struct xfs_btree_split_args     args;
2578         DECLARE_COMPLETION_ONSTACK(done);
2579
2580         if (cur->bc_btnum != XFS_BTNUM_BMAP)
2581                 return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
2582
2583         args.cur = cur;
2584         args.level = level;
2585         args.ptrp = ptrp;
2586         args.key = key;
2587         args.curp = curp;
2588         args.stat = stat;
2589         args.done = &done;
2590         args.kswapd = current_is_kswapd();
2591         INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
2592         queue_work(xfs_alloc_wq, &args.work);
2593         wait_for_completion(&done);
2594         destroy_work_on_stack(&args.work);
2595         return args.result;
2596 }
2597
2598
2599 /*
2600  * Copy the old inode root contents into a real block and make the
2601  * broot point to it.
2602  */
2603 int                                             /* error */
2604 xfs_btree_new_iroot(
2605         struct xfs_btree_cur    *cur,           /* btree cursor */
2606         int                     *logflags,      /* logging flags for inode */
2607         int                     *stat)          /* return status - 0 fail */
2608 {
2609         struct xfs_buf          *cbp;           /* buffer for cblock */
2610         struct xfs_btree_block  *block;         /* btree block */
2611         struct xfs_btree_block  *cblock;        /* child btree block */
2612         union xfs_btree_key     *ckp;           /* child key pointer */
2613         union xfs_btree_ptr     *cpp;           /* child ptr pointer */
2614         union xfs_btree_key     *kp;            /* pointer to btree key */
2615         union xfs_btree_ptr     *pp;            /* pointer to block addr */
2616         union xfs_btree_ptr     nptr;           /* new block addr */
2617         int                     level;          /* btree level */
2618         int                     error;          /* error return code */
2619 #ifdef DEBUG
2620         int                     i;              /* loop counter */
2621 #endif
2622
2623         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2624         XFS_BTREE_STATS_INC(cur, newroot);
2625
2626         ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2627
2628         level = cur->bc_nlevels - 1;
2629
2630         block = xfs_btree_get_iroot(cur);
2631         pp = xfs_btree_ptr_addr(cur, 1, block);
2632
2633         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2634         error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
2635         if (error)
2636                 goto error0;
2637         if (*stat == 0) {
2638                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2639                 return 0;
2640         }
2641         XFS_BTREE_STATS_INC(cur, alloc);
2642
2643         /* Copy the root into a real block. */
2644         error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
2645         if (error)
2646                 goto error0;
2647
2648         /*
2649          * we can't just memcpy() the root in for CRC enabled btree blocks.
2650          * In that case have to also ensure the blkno remains correct
2651          */
2652         memcpy(cblock, block, xfs_btree_block_len(cur));
2653         if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2654                 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2655                         cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
2656                 else
2657                         cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
2658         }
2659
2660         be16_add_cpu(&block->bb_level, 1);
2661         xfs_btree_set_numrecs(block, 1);
2662         cur->bc_nlevels++;
2663         cur->bc_ptrs[level + 1] = 1;
2664
2665         kp = xfs_btree_key_addr(cur, 1, block);
2666         ckp = xfs_btree_key_addr(cur, 1, cblock);
2667         xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2668
2669         cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2670 #ifdef DEBUG
2671         for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2672                 error = xfs_btree_check_ptr(cur, pp, i, level);
2673                 if (error)
2674                         goto error0;
2675         }
2676 #endif
2677         xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2678
2679 #ifdef DEBUG
2680         error = xfs_btree_check_ptr(cur, &nptr, 0, level);
2681         if (error)
2682                 goto error0;
2683 #endif
2684         xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2685
2686         xfs_iroot_realloc(cur->bc_private.b.ip,
2687                           1 - xfs_btree_get_numrecs(cblock),
2688                           cur->bc_private.b.whichfork);
2689
2690         xfs_btree_setbuf(cur, level, cbp);
2691
2692         /*
2693          * Do all this logging at the end so that
2694          * the root is at the right level.
2695          */
2696         xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
2697         xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2698         xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2699
2700         *logflags |=
2701                 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
2702         *stat = 1;
2703         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2704         return 0;
2705 error0:
2706         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2707         return error;
2708 }
2709
2710 /*
2711  * Allocate a new root block, fill it in.
2712  */
2713 STATIC int                              /* error */
2714 xfs_btree_new_root(
2715         struct xfs_btree_cur    *cur,   /* btree cursor */
2716         int                     *stat)  /* success/failure */
2717 {
2718         struct xfs_btree_block  *block; /* one half of the old root block */
2719         struct xfs_buf          *bp;    /* buffer containing block */
2720         int                     error;  /* error return value */
2721         struct xfs_buf          *lbp;   /* left buffer pointer */
2722         struct xfs_btree_block  *left;  /* left btree block */
2723         struct xfs_buf          *nbp;   /* new (root) buffer */
2724         struct xfs_btree_block  *new;   /* new (root) btree block */
2725         int                     nptr;   /* new value for key index, 1 or 2 */
2726         struct xfs_buf          *rbp;   /* right buffer pointer */
2727         struct xfs_btree_block  *right; /* right btree block */
2728         union xfs_btree_ptr     rptr;
2729         union xfs_btree_ptr     lptr;
2730
2731         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2732         XFS_BTREE_STATS_INC(cur, newroot);
2733
2734         /* initialise our start point from the cursor */
2735         cur->bc_ops->init_ptr_from_cur(cur, &rptr);
2736
2737         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2738         error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
2739         if (error)
2740                 goto error0;
2741         if (*stat == 0)
2742                 goto out0;
2743         XFS_BTREE_STATS_INC(cur, alloc);
2744
2745         /* Set up the new block. */
2746         error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
2747         if (error)
2748                 goto error0;
2749
2750         /* Set the root in the holding structure  increasing the level by 1. */
2751         cur->bc_ops->set_root(cur, &lptr, 1);
2752
2753         /*
2754          * At the previous root level there are now two blocks: the old root,
2755          * and the new block generated when it was split.  We don't know which
2756          * one the cursor is pointing at, so we set up variables "left" and
2757          * "right" for each case.
2758          */
2759         block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
2760
2761 #ifdef DEBUG
2762         error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
2763         if (error)
2764                 goto error0;
2765 #endif
2766
2767         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
2768         if (!xfs_btree_ptr_is_null(cur, &rptr)) {
2769                 /* Our block is left, pick up the right block. */
2770                 lbp = bp;
2771                 xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2772                 left = block;
2773                 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2774                 if (error)
2775                         goto error0;
2776                 bp = rbp;
2777                 nptr = 1;
2778         } else {
2779                 /* Our block is right, pick up the left block. */
2780                 rbp = bp;
2781                 xfs_btree_buf_to_ptr(cur, rbp, &rptr);
2782                 right = block;
2783                 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2784                 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2785                 if (error)
2786                         goto error0;
2787                 bp = lbp;
2788                 nptr = 2;
2789         }
2790         /* Fill in the new block's btree header and log it. */
2791         xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
2792         xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
2793         ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
2794                         !xfs_btree_ptr_is_null(cur, &rptr));
2795
2796         /* Fill in the key data in the new root. */
2797         if (xfs_btree_get_level(left) > 0) {
2798                 xfs_btree_copy_keys(cur,
2799                                 xfs_btree_key_addr(cur, 1, new),
2800                                 xfs_btree_key_addr(cur, 1, left), 1);
2801                 xfs_btree_copy_keys(cur,
2802                                 xfs_btree_key_addr(cur, 2, new),
2803                                 xfs_btree_key_addr(cur, 1, right), 1);
2804         } else {
2805                 cur->bc_ops->init_key_from_rec(
2806                                 xfs_btree_key_addr(cur, 1, new),
2807                                 xfs_btree_rec_addr(cur, 1, left));
2808                 cur->bc_ops->init_key_from_rec(
2809                                 xfs_btree_key_addr(cur, 2, new),
2810                                 xfs_btree_rec_addr(cur, 1, right));
2811         }
2812         xfs_btree_log_keys(cur, nbp, 1, 2);
2813
2814         /* Fill in the pointer data in the new root. */
2815         xfs_btree_copy_ptrs(cur,
2816                 xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
2817         xfs_btree_copy_ptrs(cur,
2818                 xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
2819         xfs_btree_log_ptrs(cur, nbp, 1, 2);
2820
2821         /* Fix up the cursor. */
2822         xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
2823         cur->bc_ptrs[cur->bc_nlevels] = nptr;
2824         cur->bc_nlevels++;
2825         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2826         *stat = 1;
2827         return 0;
2828 error0:
2829         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2830         return error;
2831 out0:
2832         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2833         *stat = 0;
2834         return 0;
2835 }
2836
2837 STATIC int
2838 xfs_btree_make_block_unfull(
2839         struct xfs_btree_cur    *cur,   /* btree cursor */
2840         int                     level,  /* btree level */
2841         int                     numrecs,/* # of recs in block */
2842         int                     *oindex,/* old tree index */
2843         int                     *index, /* new tree index */
2844         union xfs_btree_ptr     *nptr,  /* new btree ptr */
2845         struct xfs_btree_cur    **ncur, /* new btree cursor */
2846         union xfs_btree_rec     *nrec,  /* new record */
2847         int                     *stat)
2848 {
2849         union xfs_btree_key     key;    /* new btree key value */
2850         int                     error = 0;
2851
2852         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2853             level == cur->bc_nlevels - 1) {
2854                 struct xfs_inode *ip = cur->bc_private.b.ip;
2855
2856                 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
2857                         /* A root block that can be made bigger. */
2858                         xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
2859                 } else {
2860                         /* A root block that needs replacing */
2861                         int     logflags = 0;
2862
2863                         error = xfs_btree_new_iroot(cur, &logflags, stat);
2864                         if (error || *stat == 0)
2865                                 return error;
2866
2867                         xfs_trans_log_inode(cur->bc_tp, ip, logflags);
2868                 }
2869
2870                 return 0;
2871         }
2872
2873         /* First, try shifting an entry to the right neighbor. */
2874         error = xfs_btree_rshift(cur, level, stat);
2875         if (error || *stat)
2876                 return error;
2877
2878         /* Next, try shifting an entry to the left neighbor. */
2879         error = xfs_btree_lshift(cur, level, stat);
2880         if (error)
2881                 return error;
2882
2883         if (*stat) {
2884                 *oindex = *index = cur->bc_ptrs[level];
2885                 return 0;
2886         }
2887
2888         /*
2889          * Next, try splitting the current block in half.
2890          *
2891          * If this works we have to re-set our variables because we
2892          * could be in a different block now.
2893          */
2894         error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
2895         if (error || *stat == 0)
2896                 return error;
2897
2898
2899         *index = cur->bc_ptrs[level];
2900         cur->bc_ops->init_rec_from_key(&key, nrec);
2901         return 0;
2902 }
2903
2904 /*
2905  * Insert one record/level.  Return information to the caller
2906  * allowing the next level up to proceed if necessary.
2907  */
2908 STATIC int
2909 xfs_btree_insrec(
2910         struct xfs_btree_cur    *cur,   /* btree cursor */
2911         int                     level,  /* level to insert record at */
2912         union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
2913         union xfs_btree_rec     *recp,  /* i/o: record data inserted */
2914         struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
2915         int                     *stat)  /* success/failure */
2916 {
2917         struct xfs_btree_block  *block; /* btree block */
2918         struct xfs_buf          *bp;    /* buffer for block */
2919         union xfs_btree_key     key;    /* btree key */
2920         union xfs_btree_ptr     nptr;   /* new block ptr */
2921         struct xfs_btree_cur    *ncur;  /* new btree cursor */
2922         union xfs_btree_rec     nrec;   /* new record count */
2923         int                     optr;   /* old key/record index */
2924         int                     ptr;    /* key/record index */
2925         int                     numrecs;/* number of records */
2926         int                     error;  /* error return value */
2927 #ifdef DEBUG
2928         int                     i;
2929 #endif
2930
2931         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2932         XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
2933
2934         ncur = NULL;
2935
2936         /*
2937          * If we have an external root pointer, and we've made it to the
2938          * root level, allocate a new root block and we're done.
2939          */
2940         if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2941             (level >= cur->bc_nlevels)) {
2942                 error = xfs_btree_new_root(cur, stat);
2943                 xfs_btree_set_ptr_null(cur, ptrp);
2944
2945                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2946                 return error;
2947         }
2948
2949         /* If we're off the left edge, return failure. */
2950         ptr = cur->bc_ptrs[level];
2951         if (ptr == 0) {
2952                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2953                 *stat = 0;
2954                 return 0;
2955         }
2956
2957         /* Make a key out of the record data to be inserted, and save it. */
2958         cur->bc_ops->init_key_from_rec(&key, recp);
2959
2960         optr = ptr;
2961
2962         XFS_BTREE_STATS_INC(cur, insrec);
2963
2964         /* Get pointers to the btree buffer and block. */
2965         block = xfs_btree_get_block(cur, level, &bp);
2966         numrecs = xfs_btree_get_numrecs(block);
2967
2968 #ifdef DEBUG
2969         error = xfs_btree_check_block(cur, block, level, bp);
2970         if (error)
2971                 goto error0;
2972
2973         /* Check that the new entry is being inserted in the right place. */
2974         if (ptr <= numrecs) {
2975                 if (level == 0) {
2976                         ASSERT(cur->bc_ops->recs_inorder(cur, recp,
2977                                 xfs_btree_rec_addr(cur, ptr, block)));
2978                 } else {
2979                         ASSERT(cur->bc_ops->keys_inorder(cur, &key,
2980                                 xfs_btree_key_addr(cur, ptr, block)));
2981                 }
2982         }
2983 #endif
2984
2985         /*
2986          * If the block is full, we can't insert the new entry until we
2987          * make the block un-full.
2988          */
2989         xfs_btree_set_ptr_null(cur, &nptr);
2990         if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
2991                 error = xfs_btree_make_block_unfull(cur, level, numrecs,
2992                                         &optr, &ptr, &nptr, &ncur, &nrec, stat);
2993                 if (error || *stat == 0)
2994                         goto error0;
2995         }
2996
2997         /*
2998          * The current block may have changed if the block was
2999          * previously full and we have just made space in it.
3000          */
3001         block = xfs_btree_get_block(cur, level, &bp);
3002         numrecs = xfs_btree_get_numrecs(block);
3003
3004 #ifdef DEBUG
3005         error = xfs_btree_check_block(cur, block, level, bp);
3006         if (error)
3007                 return error;
3008 #endif
3009
3010         /*
3011          * At this point we know there's room for our new entry in the block
3012          * we're pointing at.
3013          */
3014         XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3015
3016         if (level > 0) {
3017                 /* It's a nonleaf. make a hole in the keys and ptrs */
3018                 union xfs_btree_key     *kp;
3019                 union xfs_btree_ptr     *pp;
3020
3021                 kp = xfs_btree_key_addr(cur, ptr, block);
3022                 pp = xfs_btree_ptr_addr(cur, ptr, block);
3023
3024 #ifdef DEBUG
3025                 for (i = numrecs - ptr; i >= 0; i--) {
3026                         error = xfs_btree_check_ptr(cur, pp, i, level);
3027                         if (error)
3028                                 return error;
3029                 }
3030 #endif
3031
3032                 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3033                 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3034
3035 #ifdef DEBUG
3036                 error = xfs_btree_check_ptr(cur, ptrp, 0, level);
3037                 if (error)
3038                         goto error0;
3039 #endif
3040
3041                 /* Now put the new data in, bump numrecs and log it. */
3042                 xfs_btree_copy_keys(cur, kp, &key, 1);
3043                 xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3044                 numrecs++;
3045                 xfs_btree_set_numrecs(block, numrecs);
3046                 xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3047                 xfs_btree_log_keys(cur, bp, ptr, numrecs);
3048 #ifdef DEBUG
3049                 if (ptr < numrecs) {
3050                         ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3051                                 xfs_btree_key_addr(cur, ptr + 1, block)));
3052                 }
3053 #endif
3054         } else {
3055                 /* It's a leaf. make a hole in the records */
3056                 union xfs_btree_rec             *rp;
3057
3058                 rp = xfs_btree_rec_addr(cur, ptr, block);
3059
3060                 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3061
3062                 /* Now put the new data in, bump numrecs and log it. */
3063                 xfs_btree_copy_recs(cur, rp, recp, 1);
3064                 xfs_btree_set_numrecs(block, ++numrecs);
3065                 xfs_btree_log_recs(cur, bp, ptr, numrecs);
3066 #ifdef DEBUG
3067                 if (ptr < numrecs) {
3068                         ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3069                                 xfs_btree_rec_addr(cur, ptr + 1, block)));
3070                 }
3071 #endif
3072         }
3073
3074         /* Log the new number of records in the btree header. */
3075         xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3076
3077         /* If we inserted at the start of a block, update the parents' keys. */
3078         if (optr == 1) {
3079                 error = xfs_btree_updkey(cur, &key, level + 1);
3080                 if (error)
3081                         goto error0;
3082         }
3083
3084         /*
3085          * If we are tracking the last record in the tree and
3086          * we are at the far right edge of the tree, update it.
3087          */
3088         if (xfs_btree_is_lastrec(cur, block, level)) {
3089                 cur->bc_ops->update_lastrec(cur, block, recp,
3090                                             ptr, LASTREC_INSREC);
3091         }
3092
3093         /*
3094          * Return the new block number, if any.
3095          * If there is one, give back a record value and a cursor too.
3096          */
3097         *ptrp = nptr;
3098         if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3099                 *recp = nrec;
3100                 *curp = ncur;
3101         }
3102
3103         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3104         *stat = 1;
3105         return 0;
3106
3107 error0:
3108         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3109         return error;
3110 }
3111
3112 /*
3113  * Insert the record at the point referenced by cur.
3114  *
3115  * A multi-level split of the tree on insert will invalidate the original
3116  * cursor.  All callers of this function should assume that the cursor is
3117  * no longer valid and revalidate it.
3118  */
3119 int
3120 xfs_btree_insert(
3121         struct xfs_btree_cur    *cur,
3122         int                     *stat)
3123 {
3124         int                     error;  /* error return value */
3125         int                     i;      /* result value, 0 for failure */
3126         int                     level;  /* current level number in btree */
3127         union xfs_btree_ptr     nptr;   /* new block number (split result) */
3128         struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
3129         struct xfs_btree_cur    *pcur;  /* previous level's cursor */
3130         union xfs_btree_rec     rec;    /* record to insert */
3131
3132         level = 0;
3133         ncur = NULL;
3134         pcur = cur;
3135
3136         xfs_btree_set_ptr_null(cur, &nptr);
3137         cur->bc_ops->init_rec_from_cur(cur, &rec);
3138
3139         /*
3140          * Loop going up the tree, starting at the leaf level.
3141          * Stop when we don't get a split block, that must mean that
3142          * the insert is finished with this level.
3143          */
3144         do {
3145                 /*
3146                  * Insert nrec/nptr into this level of the tree.
3147                  * Note if we fail, nptr will be null.
3148                  */
3149                 error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
3150                 if (error) {
3151                         if (pcur != cur)
3152                                 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3153                         goto error0;
3154                 }
3155
3156                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3157                 level++;
3158
3159                 /*
3160                  * See if the cursor we just used is trash.
3161                  * Can't trash the caller's cursor, but otherwise we should
3162                  * if ncur is a new cursor or we're about to be done.
3163                  */
3164                 if (pcur != cur &&
3165                     (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3166                         /* Save the state from the cursor before we trash it */
3167                         if (cur->bc_ops->update_cursor)
3168                                 cur->bc_ops->update_cursor(pcur, cur);
3169                         cur->bc_nlevels = pcur->bc_nlevels;
3170                         xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3171                 }
3172                 /* If we got a new cursor, switch to it. */
3173                 if (ncur) {
3174                         pcur = ncur;
3175                         ncur = NULL;
3176                 }
3177         } while (!xfs_btree_ptr_is_null(cur, &nptr));
3178
3179         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3180         *stat = i;
3181         return 0;
3182 error0:
3183         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3184         return error;
3185 }
3186
3187 /*
3188  * Try to merge a non-leaf block back into the inode root.
3189  *
3190  * Note: the killroot names comes from the fact that we're effectively
3191  * killing the old root block.  But because we can't just delete the
3192  * inode we have to copy the single block it was pointing to into the
3193  * inode.
3194  */
3195 STATIC int
3196 xfs_btree_kill_iroot(
3197         struct xfs_btree_cur    *cur)
3198 {
3199         int                     whichfork = cur->bc_private.b.whichfork;
3200         struct xfs_inode        *ip = cur->bc_private.b.ip;
3201         struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
3202         struct xfs_btree_block  *block;
3203         struct xfs_btree_block  *cblock;
3204         union xfs_btree_key     *kp;
3205         union xfs_btree_key     *ckp;
3206         union xfs_btree_ptr     *pp;
3207         union xfs_btree_ptr     *cpp;
3208         struct xfs_buf          *cbp;
3209         int                     level;
3210         int                     index;
3211         int                     numrecs;
3212 #ifdef DEBUG
3213         union xfs_btree_ptr     ptr;
3214         int                     i;
3215 #endif
3216
3217         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3218
3219         ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3220         ASSERT(cur->bc_nlevels > 1);
3221
3222         /*
3223          * Don't deal with the root block needs to be a leaf case.
3224          * We're just going to turn the thing back into extents anyway.
3225          */
3226         level = cur->bc_nlevels - 1;
3227         if (level == 1)
3228                 goto out0;
3229
3230         /*
3231          * Give up if the root has multiple children.
3232          */
3233         block = xfs_btree_get_iroot(cur);
3234         if (xfs_btree_get_numrecs(block) != 1)
3235                 goto out0;
3236
3237         cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3238         numrecs = xfs_btree_get_numrecs(cblock);
3239
3240         /*
3241          * Only do this if the next level will fit.
3242          * Then the data must be copied up to the inode,
3243          * instead of freeing the root you free the next level.
3244          */
3245         if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3246                 goto out0;
3247
3248         XFS_BTREE_STATS_INC(cur, killroot);
3249
3250 #ifdef DEBUG
3251         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3252         ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3253         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3254         ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3255 #endif
3256
3257         index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3258         if (index) {
3259                 xfs_iroot_realloc(cur->bc_private.b.ip, index,
3260                                   cur->bc_private.b.whichfork);
3261                 block = ifp->if_broot;
3262         }
3263
3264         be16_add_cpu(&block->bb_numrecs, index);
3265         ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3266
3267         kp = xfs_btree_key_addr(cur, 1, block);
3268         ckp = xfs_btree_key_addr(cur, 1, cblock);
3269         xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3270
3271         pp = xfs_btree_ptr_addr(cur, 1, block);
3272         cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3273 #ifdef DEBUG
3274         for (i = 0; i < numrecs; i++) {
3275                 int             error;
3276
3277                 error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
3278                 if (error) {
3279                         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3280                         return error;
3281                 }
3282         }
3283 #endif
3284         xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3285
3286         cur->bc_ops->free_block(cur, cbp);
3287         XFS_BTREE_STATS_INC(cur, free);
3288
3289         cur->bc_bufs[level - 1] = NULL;
3290         be16_add_cpu(&block->bb_level, -1);
3291         xfs_trans_log_inode(cur->bc_tp, ip,
3292                 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
3293         cur->bc_nlevels--;
3294 out0:
3295         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3296         return 0;
3297 }
3298
3299 /*
3300  * Kill the current root node, and replace it with it's only child node.
3301  */
3302 STATIC int
3303 xfs_btree_kill_root(
3304         struct xfs_btree_cur    *cur,
3305         struct xfs_buf          *bp,
3306         int                     level,
3307         union xfs_btree_ptr     *newroot)
3308 {
3309         int                     error;
3310
3311         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3312         XFS_BTREE_STATS_INC(cur, killroot);
3313
3314         /*
3315          * Update the root pointer, decreasing the level by 1 and then
3316          * free the old root.
3317          */
3318         cur->bc_ops->set_root(cur, newroot, -1);
3319
3320         error = cur->bc_ops->free_block(cur, bp);
3321         if (error) {
3322                 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3323                 return error;
3324         }
3325
3326         XFS_BTREE_STATS_INC(cur, free);
3327
3328         cur->bc_bufs[level] = NULL;
3329         cur->bc_ra[level] = 0;
3330         cur->bc_nlevels--;
3331
3332         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3333         return 0;
3334 }
3335
3336 STATIC int
3337 xfs_btree_dec_cursor(
3338         struct xfs_btree_cur    *cur,
3339         int                     level,
3340         int                     *stat)
3341 {
3342         int                     error;
3343         int                     i;
3344
3345         if (level > 0) {
3346                 error = xfs_btree_decrement(cur, level, &i);
3347                 if (error)
3348                         return error;
3349         }
3350
3351         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3352         *stat = 1;
3353         return 0;
3354 }
3355
3356 /*
3357  * Single level of the btree record deletion routine.
3358  * Delete record pointed to by cur/level.
3359  * Remove the record from its block then rebalance the tree.
3360  * Return 0 for error, 1 for done, 2 to go on to the next level.
3361  */
3362 STATIC int                                      /* error */
3363 xfs_btree_delrec(
3364         struct xfs_btree_cur    *cur,           /* btree cursor */
3365         int                     level,          /* level removing record from */
3366         int                     *stat)          /* fail/done/go-on */
3367 {
3368         struct xfs_btree_block  *block;         /* btree block */
3369         union xfs_btree_ptr     cptr;           /* current block ptr */
3370         struct xfs_buf          *bp;            /* buffer for block */
3371         int                     error;          /* error return value */
3372         int                     i;              /* loop counter */
3373         union xfs_btree_key     key;            /* storage for keyp */
3374         union xfs_btree_key     *keyp = &key;   /* passed to the next level */
3375         union xfs_btree_ptr     lptr;           /* left sibling block ptr */
3376         struct xfs_buf          *lbp;           /* left buffer pointer */
3377         struct xfs_btree_block  *left;          /* left btree block */
3378         int                     lrecs = 0;      /* left record count */
3379         int                     ptr;            /* key/record index */
3380         union xfs_btree_ptr     rptr;           /* right sibling block ptr */
3381         struct xfs_buf          *rbp;           /* right buffer pointer */
3382         struct xfs_btree_block  *right;         /* right btree block */
3383         struct xfs_btree_block  *rrblock;       /* right-right btree block */
3384         struct xfs_buf          *rrbp;          /* right-right buffer pointer */
3385         int                     rrecs = 0;      /* right record count */
3386         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
3387         int                     numrecs;        /* temporary numrec count */
3388
3389         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3390         XFS_BTREE_TRACE_ARGI(cur, level);
3391
3392         tcur = NULL;
3393
3394         /* Get the index of the entry being deleted, check for nothing there. */
3395         ptr = cur->bc_ptrs[level];
3396         if (ptr == 0) {
3397                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3398                 *stat = 0;
3399                 return 0;
3400         }
3401
3402         /* Get the buffer & block containing the record or key/ptr. */
3403         block = xfs_btree_get_block(cur, level, &bp);
3404         numrecs = xfs_btree_get_numrecs(block);
3405
3406 #ifdef DEBUG
3407         error = xfs_btree_check_block(cur, block, level, bp);
3408         if (error)
3409                 goto error0;
3410 #endif
3411
3412         /* Fail if we're off the end of the block. */
3413         if (ptr > numrecs) {
3414                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3415                 *stat = 0;
3416                 return 0;
3417         }
3418
3419         XFS_BTREE_STATS_INC(cur, delrec);
3420         XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3421
3422         /* Excise the entries being deleted. */
3423         if (level > 0) {
3424                 /* It's a nonleaf. operate on keys and ptrs */
3425                 union xfs_btree_key     *lkp;
3426                 union xfs_btree_ptr     *lpp;
3427
3428                 lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3429                 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3430
3431 #ifdef DEBUG
3432                 for (i = 0; i < numrecs - ptr; i++) {
3433                         error = xfs_btree_check_ptr(cur, lpp, i, level);
3434                         if (error)
3435                                 goto error0;
3436                 }
3437 #endif
3438
3439                 if (ptr < numrecs) {
3440                         xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3441                         xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3442                         xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3443                         xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3444                 }
3445
3446                 /*
3447                  * If it's the first record in the block, we'll need to pass a
3448                  * key up to the next level (updkey).
3449                  */
3450                 if (ptr == 1)
3451                         keyp = xfs_btree_key_addr(cur, 1, block);
3452         } else {
3453                 /* It's a leaf. operate on records */
3454                 if (ptr < numrecs) {
3455                         xfs_btree_shift_recs(cur,
3456                                 xfs_btree_rec_addr(cur, ptr + 1, block),
3457                                 -1, numrecs - ptr);
3458                         xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3459                 }
3460
3461                 /*
3462                  * If it's the first record in the block, we'll need a key
3463                  * structure to pass up to the next level (updkey).
3464                  */
3465                 if (ptr == 1) {
3466                         cur->bc_ops->init_key_from_rec(&key,
3467                                         xfs_btree_rec_addr(cur, 1, block));
3468                         keyp = &key;
3469                 }
3470         }
3471
3472         /*
3473          * Decrement and log the number of entries in the block.
3474          */
3475         xfs_btree_set_numrecs(block, --numrecs);
3476         xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3477
3478         /*
3479          * If we are tracking the last record in the tree and
3480          * we are at the far right edge of the tree, update it.
3481          */
3482         if (xfs_btree_is_lastrec(cur, block, level)) {
3483                 cur->bc_ops->update_lastrec(cur, block, NULL,
3484                                             ptr, LASTREC_DELREC);
3485         }
3486
3487         /*
3488          * We're at the root level.  First, shrink the root block in-memory.
3489          * Try to get rid of the next level down.  If we can't then there's
3490          * nothing left to do.
3491          */
3492         if (level == cur->bc_nlevels - 1) {
3493                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3494                         xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3495                                           cur->bc_private.b.whichfork);
3496
3497                         error = xfs_btree_kill_iroot(cur);
3498                         if (error)
3499                                 goto error0;
3500
3501                         error = xfs_btree_dec_cursor(cur, level, stat);
3502                         if (error)
3503                                 goto error0;
3504                         *stat = 1;
3505                         return 0;
3506                 }
3507
3508                 /*
3509                  * If this is the root level, and there's only one entry left,
3510                  * and it's NOT the leaf level, then we can get rid of this
3511                  * level.
3512                  */
3513                 if (numrecs == 1 && level > 0) {
3514                         union xfs_btree_ptr     *pp;
3515                         /*
3516                          * pp is still set to the first pointer in the block.
3517                          * Make it the new root of the btree.
3518                          */
3519                         pp = xfs_btree_ptr_addr(cur, 1, block);
3520                         error = xfs_btree_kill_root(cur, bp, level, pp);
3521                         if (error)
3522                                 goto error0;
3523                 } else if (level > 0) {
3524                         error = xfs_btree_dec_cursor(cur, level, stat);
3525                         if (error)
3526                                 goto error0;
3527                 }
3528                 *stat = 1;
3529                 return 0;
3530         }
3531
3532         /*
3533          * If we deleted the leftmost entry in the block, update the
3534          * key values above us in the tree.
3535          */
3536         if (ptr == 1) {
3537                 error = xfs_btree_updkey(cur, keyp, level + 1);
3538                 if (error)
3539                         goto error0;
3540         }
3541
3542         /*
3543          * If the number of records remaining in the block is at least
3544          * the minimum, we're done.
3545          */
3546         if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3547                 error = xfs_btree_dec_cursor(cur, level, stat);
3548                 if (error)
3549                         goto error0;
3550                 return 0;
3551         }
3552
3553         /*
3554          * Otherwise, we have to move some records around to keep the
3555          * tree balanced.  Look at the left and right sibling blocks to
3556          * see if we can re-balance by moving only one record.
3557          */
3558         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3559         xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3560
3561         if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3562                 /*
3563                  * One child of root, need to get a chance to copy its contents
3564                  * into the root and delete it. Can't go up to next level,
3565                  * there's nothing to delete there.
3566                  */
3567                 if (xfs_btree_ptr_is_null(cur, &rptr) &&
3568                     xfs_btree_ptr_is_null(cur, &lptr) &&
3569                     level == cur->bc_nlevels - 2) {
3570                         error = xfs_btree_kill_iroot(cur);
3571                         if (!error)
3572                                 error = xfs_btree_dec_cursor(cur, level, stat);
3573                         if (error)
3574                                 goto error0;
3575                         return 0;
3576                 }
3577         }
3578
3579         ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3580                !xfs_btree_ptr_is_null(cur, &lptr));
3581
3582         /*
3583          * Duplicate the cursor so our btree manipulations here won't
3584          * disrupt the next level up.
3585          */
3586         error = xfs_btree_dup_cursor(cur, &tcur);
3587         if (error)
3588                 goto error0;
3589
3590         /*
3591          * If there's a right sibling, see if it's ok to shift an entry
3592          * out of it.
3593          */
3594         if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3595                 /*
3596                  * Move the temp cursor to the last entry in the next block.
3597                  * Actually any entry but the first would suffice.
3598                  */
3599                 i = xfs_btree_lastrec(tcur, level);
3600                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3601
3602                 error = xfs_btree_increment(tcur, level, &i);
3603                 if (error)
3604                         goto error0;
3605                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3606
3607                 i = xfs_btree_lastrec(tcur, level);
3608                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3609
3610                 /* Grab a pointer to the block. */
3611                 right = xfs_btree_get_block(tcur, level, &rbp);
3612 #ifdef DEBUG
3613                 error = xfs_btree_check_block(tcur, right, level, rbp);
3614                 if (error)
3615                         goto error0;
3616 #endif
3617                 /* Grab the current block number, for future use. */
3618                 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3619
3620                 /*
3621                  * If right block is full enough so that removing one entry
3622                  * won't make it too empty, and left-shifting an entry out
3623                  * of right to us works, we're done.
3624                  */
3625                 if (xfs_btree_get_numrecs(right) - 1 >=
3626                     cur->bc_ops->get_minrecs(tcur, level)) {
3627                         error = xfs_btree_lshift(tcur, level, &i);
3628                         if (error)
3629                                 goto error0;
3630                         if (i) {
3631                                 ASSERT(xfs_btree_get_numrecs(block) >=
3632                                        cur->bc_ops->get_minrecs(tcur, level));
3633
3634                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3635                                 tcur = NULL;
3636
3637                                 error = xfs_btree_dec_cursor(cur, level, stat);
3638                                 if (error)
3639                                         goto error0;
3640                                 return 0;
3641                         }
3642                 }
3643
3644                 /*
3645                  * Otherwise, grab the number of records in right for
3646                  * future reference, and fix up the temp cursor to point
3647                  * to our block again (last record).
3648                  */
3649                 rrecs = xfs_btree_get_numrecs(right);
3650                 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3651                         i = xfs_btree_firstrec(tcur, level);
3652                         XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3653
3654                         error = xfs_btree_decrement(tcur, level, &i);
3655                         if (error)
3656                                 goto error0;
3657                         XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3658                 }
3659         }
3660
3661         /*
3662          * If there's a left sibling, see if it's ok to shift an entry
3663          * out of it.
3664          */
3665         if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3666                 /*
3667                  * Move the temp cursor to the first entry in the
3668                  * previous block.
3669                  */
3670                 i = xfs_btree_firstrec(tcur, level);
3671                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3672
3673                 error = xfs_btree_decrement(tcur, level, &i);
3674                 if (error)
3675                         goto error0;
3676                 i = xfs_btree_firstrec(tcur, level);
3677                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3678
3679                 /* Grab a pointer to the block. */
3680                 left = xfs_btree_get_block(tcur, level, &lbp);
3681 #ifdef DEBUG
3682                 error = xfs_btree_check_block(cur, left, level, lbp);
3683                 if (error)
3684                         goto error0;
3685 #endif
3686                 /* Grab the current block number, for future use. */
3687                 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3688
3689                 /*
3690                  * If left block is full enough so that removing one entry
3691                  * won't make it too empty, and right-shifting an entry out
3692                  * of left to us works, we're done.
3693                  */
3694                 if (xfs_btree_get_numrecs(left) - 1 >=
3695                     cur->bc_ops->get_minrecs(tcur, level)) {
3696                         error = xfs_btree_rshift(tcur, level, &i);
3697                         if (error)
3698                                 goto error0;
3699                         if (i) {
3700                                 ASSERT(xfs_btree_get_numrecs(block) >=
3701                                        cur->bc_ops->get_minrecs(tcur, level));
3702                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3703                                 tcur = NULL;
3704                                 if (level == 0)
3705                                         cur->bc_ptrs[0]++;
3706                                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3707                                 *stat = 1;
3708                                 return 0;
3709                         }
3710                 }
3711
3712                 /*
3713                  * Otherwise, grab the number of records in right for
3714                  * future reference.
3715                  */
3716                 lrecs = xfs_btree_get_numrecs(left);
3717         }
3718
3719         /* Delete the temp cursor, we're done with it. */
3720         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3721         tcur = NULL;
3722
3723         /* If here, we need to do a join to keep the tree balanced. */
3724         ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
3725
3726         if (!xfs_btree_ptr_is_null(cur, &lptr) &&
3727             lrecs + xfs_btree_get_numrecs(block) <=
3728                         cur->bc_ops->get_maxrecs(cur, level)) {
3729                 /*
3730                  * Set "right" to be the starting block,
3731                  * "left" to be the left neighbor.
3732                  */
3733                 rptr = cptr;
3734                 right = block;
3735                 rbp = bp;
3736                 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3737                 if (error)
3738                         goto error0;
3739
3740         /*
3741          * If that won't work, see if we can join with the right neighbor block.
3742          */
3743         } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
3744                    rrecs + xfs_btree_get_numrecs(block) <=
3745                         cur->bc_ops->get_maxrecs(cur, level)) {
3746                 /*
3747                  * Set "left" to be the starting block,
3748                  * "right" to be the right neighbor.
3749                  */
3750                 lptr = cptr;
3751                 left = block;
3752                 lbp = bp;
3753                 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
3754                 if (error)
3755                         goto error0;
3756
3757         /*
3758          * Otherwise, we can't fix the imbalance.
3759          * Just return.  This is probably a logic error, but it's not fatal.
3760          */
3761         } else {
3762                 error = xfs_btree_dec_cursor(cur, level, stat);
3763                 if (error)
3764                         goto error0;
3765                 return 0;
3766         }
3767
3768         rrecs = xfs_btree_get_numrecs(right);
3769         lrecs = xfs_btree_get_numrecs(left);
3770
3771         /*
3772          * We're now going to join "left" and "right" by moving all the stuff
3773          * in "right" to "left" and deleting "right".
3774          */
3775         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
3776         if (level > 0) {
3777                 /* It's a non-leaf.  Move keys and pointers. */
3778                 union xfs_btree_key     *lkp;   /* left btree key */
3779                 union xfs_btree_ptr     *lpp;   /* left address pointer */
3780                 union xfs_btree_key     *rkp;   /* right btree key */
3781                 union xfs_btree_ptr     *rpp;   /* right address pointer */
3782
3783                 lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
3784                 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
3785                 rkp = xfs_btree_key_addr(cur, 1, right);
3786                 rpp = xfs_btree_ptr_addr(cur, 1, right);
3787 #ifdef DEBUG
3788                 for (i = 1; i < rrecs; i++) {
3789                         error = xfs_btree_check_ptr(cur, rpp, i, level);
3790                         if (error)
3791                                 goto error0;
3792                 }
3793 #endif
3794                 xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
3795                 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
3796
3797                 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
3798                 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
3799         } else {
3800                 /* It's a leaf.  Move records.  */
3801                 union xfs_btree_rec     *lrp;   /* left record pointer */
3802                 union xfs_btree_rec     *rrp;   /* right record pointer */
3803
3804                 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
3805                 rrp = xfs_btree_rec_addr(cur, 1, right);
3806
3807                 xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
3808                 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
3809         }
3810
3811         XFS_BTREE_STATS_INC(cur, join);
3812
3813         /*
3814          * Fix up the number of records and right block pointer in the
3815          * surviving block, and log it.
3816          */
3817         xfs_btree_set_numrecs(left, lrecs + rrecs);
3818         xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
3819         xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3820         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
3821
3822         /* If there is a right sibling, point it to the remaining block. */
3823         xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3824         if (!xfs_btree_ptr_is_null(cur, &cptr)) {
3825                 error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
3826                 if (error)
3827                         goto error0;
3828                 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
3829                 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
3830         }
3831
3832         /* Free the deleted block. */
3833         error = cur->bc_ops->free_block(cur, rbp);
3834         if (error)
3835                 goto error0;
3836         XFS_BTREE_STATS_INC(cur, free);
3837
3838         /*
3839          * If we joined with the left neighbor, set the buffer in the
3840          * cursor to the left block, and fix up the index.
3841          */
3842         if (bp != lbp) {
3843                 cur->bc_bufs[level] = lbp;
3844                 cur->bc_ptrs[level] += lrecs;
3845                 cur->bc_ra[level] = 0;
3846         }
3847         /*
3848          * If we joined with the right neighbor and there's a level above
3849          * us, increment the cursor at that level.
3850          */
3851         else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
3852                    (level + 1 < cur->bc_nlevels)) {
3853                 error = xfs_btree_increment(cur, level + 1, &i);
3854                 if (error)
3855                         goto error0;
3856         }
3857
3858         /*
3859          * Readjust the ptr at this level if it's not a leaf, since it's
3860          * still pointing at the deletion point, which makes the cursor
3861          * inconsistent.  If this makes the ptr 0, the caller fixes it up.
3862          * We can't use decrement because it would change the next level up.
3863          */
3864         if (level > 0)
3865                 cur->bc_ptrs[level]--;
3866
3867         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3868         /* Return value means the next level up has something to do. */
3869         *stat = 2;
3870         return 0;
3871
3872 error0:
3873         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3874         if (tcur)
3875                 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
3876         return error;
3877 }
3878
3879 /*
3880  * Delete the record pointed to by cur.
3881  * The cursor refers to the place where the record was (could be inserted)
3882  * when the operation returns.
3883  */
3884 int                                     /* error */
3885 xfs_btree_delete(
3886         struct xfs_btree_cur    *cur,
3887         int                     *stat)  /* success/failure */
3888 {
3889         int                     error;  /* error return value */
3890         int                     level;
3891         int                     i;
3892
3893         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3894
3895         /*
3896          * Go up the tree, starting at leaf level.
3897          *
3898          * If 2 is returned then a join was done; go to the next level.
3899          * Otherwise we are done.
3900          */
3901         for (level = 0, i = 2; i == 2; level++) {
3902                 error = xfs_btree_delrec(cur, level, &i);
3903                 if (error)
3904                         goto error0;
3905         }
3906
3907         if (i == 0) {
3908                 for (level = 1; level < cur->bc_nlevels; level++) {
3909                         if (cur->bc_ptrs[level] == 0) {
3910                                 error = xfs_btree_decrement(cur, level, &i);
3911                                 if (error)
3912                                         goto error0;
3913                                 break;
3914                         }
3915                 }
3916         }
3917
3918         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3919         *stat = i;
3920         return 0;
3921 error0:
3922         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3923         return error;
3924 }
3925
3926 /*
3927  * Get the data from the pointed-to record.
3928  */
3929 int                                     /* error */
3930 xfs_btree_get_rec(
3931         struct xfs_btree_cur    *cur,   /* btree cursor */
3932         union xfs_btree_rec     **recp, /* output: btree record */
3933         int                     *stat)  /* output: success/failure */
3934 {
3935         struct xfs_btree_block  *block; /* btree block */
3936         struct xfs_buf          *bp;    /* buffer pointer */
3937         int                     ptr;    /* record number */
3938 #ifdef DEBUG
3939         int                     error;  /* error return value */
3940 #endif
3941
3942         ptr = cur->bc_ptrs[0];
3943         block = xfs_btree_get_block(cur, 0, &bp);
3944
3945 #ifdef DEBUG
3946         error = xfs_btree_check_block(cur, block, 0, bp);
3947         if (error)
3948                 return error;
3949 #endif
3950
3951         /*
3952          * Off the right end or left end, return failure.
3953          */
3954         if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
3955                 *stat = 0;
3956                 return 0;
3957         }
3958
3959         /*
3960          * Point to the record and extract its data.
3961          */
3962         *recp = xfs_btree_rec_addr(cur, ptr, block);
3963         *stat = 1;
3964         return 0;
3965 }
3966
3967 /*
3968  * Change the owner of a btree.
3969  *
3970  * The mechanism we use here is ordered buffer logging. Because we don't know
3971  * how many buffers were are going to need to modify, we don't really want to
3972  * have to make transaction reservations for the worst case of every buffer in a
3973  * full size btree as that may be more space that we can fit in the log....
3974  *
3975  * We do the btree walk in the most optimal manner possible - we have sibling
3976  * pointers so we can just walk all the blocks on each level from left to right
3977  * in a single pass, and then move to the next level and do the same. We can
3978  * also do readahead on the sibling pointers to get IO moving more quickly,
3979  * though for slow disks this is unlikely to make much difference to performance
3980  * as the amount of CPU work we have to do before moving to the next block is
3981  * relatively small.
3982  *
3983  * For each btree block that we load, modify the owner appropriately, set the
3984  * buffer as an ordered buffer and log it appropriately. We need to ensure that
3985  * we mark the region we change dirty so that if the buffer is relogged in
3986  * a subsequent transaction the changes we make here as an ordered buffer are
3987  * correctly relogged in that transaction.  If we are in recovery context, then
3988  * just queue the modified buffer as delayed write buffer so the transaction
3989  * recovery completion writes the changes to disk.
3990  */
3991 static int
3992 xfs_btree_block_change_owner(
3993         struct xfs_btree_cur    *cur,
3994         int                     level,
3995         __uint64_t              new_owner,
3996         struct list_head        *buffer_list)
3997 {
3998         struct xfs_btree_block  *block;
3999         struct xfs_buf          *bp;
4000         union xfs_btree_ptr     rptr;
4001
4002         /* do right sibling readahead */
4003         xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
4004
4005         /* modify the owner */
4006         block = xfs_btree_get_block(cur, level, &bp);
4007         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4008                 block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
4009         else
4010                 block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
4011
4012         /*
4013          * If the block is a root block hosted in an inode, we might not have a
4014          * buffer pointer here and we shouldn't attempt to log the change as the
4015          * information is already held in the inode and discarded when the root
4016          * block is formatted into the on-disk inode fork. We still change it,
4017          * though, so everything is consistent in memory.
4018          */
4019         if (bp) {
4020                 if (cur->bc_tp) {
4021                         xfs_trans_ordered_buf(cur->bc_tp, bp);
4022                         xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4023                 } else {
4024                         xfs_buf_delwri_queue(bp, buffer_list);
4025                 }
4026         } else {
4027                 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
4028                 ASSERT(level == cur->bc_nlevels - 1);
4029         }
4030
4031         /* now read rh sibling block for next iteration */
4032         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4033         if (xfs_btree_ptr_is_null(cur, &rptr))
4034                 return -ENOENT;
4035
4036         return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4037 }
4038
4039 int
4040 xfs_btree_change_owner(
4041         struct xfs_btree_cur    *cur,
4042         __uint64_t              new_owner,
4043         struct list_head        *buffer_list)
4044 {
4045         union xfs_btree_ptr     lptr;
4046         int                     level;
4047         struct xfs_btree_block  *block = NULL;
4048         int                     error = 0;
4049
4050         cur->bc_ops->init_ptr_from_cur(cur, &lptr);
4051
4052         /* for each level */
4053         for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4054                 /* grab the left hand block */
4055                 error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4056                 if (error)
4057                         return error;
4058
4059                 /* readahead the left most block for the next level down */
4060                 if (level > 0) {
4061                         union xfs_btree_ptr     *ptr;
4062
4063                         ptr = xfs_btree_ptr_addr(cur, 1, block);
4064                         xfs_btree_readahead_ptr(cur, ptr, 1);
4065
4066                         /* save for the next iteration of the loop */
4067                         lptr = *ptr;
4068                 }
4069
4070                 /* for each buffer in the level */
4071                 do {
4072                         error = xfs_btree_block_change_owner(cur, level,
4073                                                              new_owner,
4074                                                              buffer_list);
4075                 } while (!error);
4076
4077                 if (error != -ENOENT)
4078                         return error;
4079         }
4080
4081         return 0;
4082 }