Add the rt linux 4.1.3-rt3 as base
[kvmfornfv.git] / kernel / fs / jfs / jfs_xtree.c
diff --git a/kernel/fs/jfs/jfs_xtree.c b/kernel/fs/jfs/jfs_xtree.c
new file mode 100644 (file)
index 0000000..5ad7748
--- /dev/null
@@ -0,0 +1,3903 @@
+/*
+ *   Copyright (C) International Business Machines Corp., 2000-2005
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ *     jfs_xtree.c: extent allocation descriptor B+-tree manager
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/quotaops.h>
+#include <linux/seq_file.h>
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_metapage.h"
+#include "jfs_dmap.h"
+#include "jfs_dinode.h"
+#include "jfs_superblock.h"
+#include "jfs_debug.h"
+
+/*
+ * xtree local flag
+ */
+#define XT_INSERT      0x00000001
+
+/*
+ *     xtree key/entry comparison: extent offset
+ *
+ * return:
+ *     -1: k < start of extent
+ *      0: start_of_extent <= k <= end_of_extent
+ *      1: k > end_of_extent
+ */
+#define XT_CMP(CMP, K, X, OFFSET64)\
+{\
+       OFFSET64 = offsetXAD(X);\
+       (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\
+               ((K) < OFFSET64) ? -1 : 0;\
+}
+
+/* write a xad entry */
+#define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\
+{\
+       (XAD)->flag = (FLAG);\
+       XADoffset((XAD), (OFF));\
+       XADlength((XAD), (LEN));\
+       XADaddress((XAD), (ADDR));\
+}
+
+#define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot)
+
+/* get page buffer for specified block address */
+/* ToDo: Replace this ugly macro with a function */
+#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)                            \
+do {                                                                   \
+       BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot);        \
+       if (!(RC)) {                                                    \
+               if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) || \
+                   (le16_to_cpu((P)->header.nextindex) >               \
+                    le16_to_cpu((P)->header.maxentry)) ||              \
+                   (le16_to_cpu((P)->header.maxentry) >                \
+                    (((BN) == 0) ? XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) { \
+                       jfs_error((IP)->i_sb,                           \
+                                 "XT_GETPAGE: xtree page corrupt\n");  \
+                       BT_PUTPAGE(MP);                                 \
+                       MP = NULL;                                      \
+                       RC = -EIO;                                      \
+               }                                                       \
+       }                                                               \
+} while (0)
+
+/* for consistency */
+#define XT_PUTPAGE(MP) BT_PUTPAGE(MP)
+
+#define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \
+       BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot)
+/* xtree entry parameter descriptor */
+struct xtsplit {
+       struct metapage *mp;
+       s16 index;
+       u8 flag;
+       s64 off;
+       s64 addr;
+       int len;
+       struct pxdlist *pxdlist;
+};
+
+
+/*
+ *     statistics
+ */
+#ifdef CONFIG_JFS_STATISTICS
+static struct {
+       uint search;
+       uint fastSearch;
+       uint split;
+} xtStat;
+#endif
+
+
+/*
+ * forward references
+ */
+static int xtSearch(struct inode *ip, s64 xoff, s64 *next, int *cmpp,
+                   struct btstack * btstack, int flag);
+
+static int xtSplitUp(tid_t tid,
+                    struct inode *ip,
+                    struct xtsplit * split, struct btstack * btstack);
+
+static int xtSplitPage(tid_t tid, struct inode *ip, struct xtsplit * split,
+                      struct metapage ** rmpp, s64 * rbnp);
+
+static int xtSplitRoot(tid_t tid, struct inode *ip,
+                      struct xtsplit * split, struct metapage ** rmpp);
+
+#ifdef _STILL_TO_PORT
+static int xtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp,
+                     xtpage_t * fp, struct btstack * btstack);
+
+static int xtSearchNode(struct inode *ip,
+                       xad_t * xad,
+                       int *cmpp, struct btstack * btstack, int flag);
+
+static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp);
+#endif                         /*  _STILL_TO_PORT */
+
+/*
+ *     xtLookup()
+ *
+ * function: map a single page into a physical extent;
+ */
+int xtLookup(struct inode *ip, s64 lstart,
+            s64 llen, int *pflag, s64 * paddr, s32 * plen, int no_check)
+{
+       int rc = 0;
+       struct btstack btstack;
+       int cmp;
+       s64 bn;
+       struct metapage *mp;
+       xtpage_t *p;
+       int index;
+       xad_t *xad;
+       s64 next, size, xoff, xend;
+       int xlen;
+       s64 xaddr;
+
+       *paddr = 0;
+       *plen = llen;
+
+       if (!no_check) {
+               /* is lookup offset beyond eof ? */
+               size = ((u64) ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
+                   JFS_SBI(ip->i_sb)->l2bsize;
+               if (lstart >= size)
+                       return 0;
+       }
+
+       /*
+        * search for the xad entry covering the logical extent
+        */
+//search:
+       if ((rc = xtSearch(ip, lstart, &next, &cmp, &btstack, 0))) {
+               jfs_err("xtLookup: xtSearch returned %d", rc);
+               return rc;
+       }
+
+       /*
+        *      compute the physical extent covering logical extent
+        *
+        * N.B. search may have failed (e.g., hole in sparse file),
+        * and returned the index of the next entry.
+        */
+       /* retrieve search result */
+       XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+
+       /* is xad found covering start of logical extent ?
+        * lstart is a page start address,
+        * i.e., lstart cannot start in a hole;
+        */
+       if (cmp) {
+               if (next)
+                       *plen = min(next - lstart, llen);
+               goto out;
+       }
+
+       /*
+        * lxd covered by xad
+        */
+       xad = &p->xad[index];
+       xoff = offsetXAD(xad);
+       xlen = lengthXAD(xad);
+       xend = xoff + xlen;
+       xaddr = addressXAD(xad);
+
+       /* initialize new pxd */
+       *pflag = xad->flag;
+       *paddr = xaddr + (lstart - xoff);
+       /* a page must be fully covered by an xad */
+       *plen = min(xend - lstart, llen);
+
+      out:
+       XT_PUTPAGE(mp);
+
+       return rc;
+}
+
+/*
+ *     xtSearch()
+ *
+ * function:   search for the xad entry covering specified offset.
+ *
+ * parameters:
+ *     ip      - file object;
+ *     xoff    - extent offset;
+ *     nextp   - address of next extent (if any) for search miss
+ *     cmpp    - comparison result:
+ *     btstack - traverse stack;
+ *     flag    - search process flag (XT_INSERT);
+ *
+ * returns:
+ *     btstack contains (bn, index) of search path traversed to the entry.
+ *     *cmpp is set to result of comparison with the entry returned.
+ *     the page containing the entry is pinned at exit.
+ */
+static int xtSearch(struct inode *ip, s64 xoff,        s64 *nextp,
+                   int *cmpp, struct btstack * btstack, int flag)
+{
+       struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+       int rc = 0;
+       int cmp = 1;            /* init for empty page */
+       s64 bn;                 /* block number */
+       struct metapage *mp;    /* page buffer */
+       xtpage_t *p;            /* page */
+       xad_t *xad;
+       int base, index, lim, btindex;
+       struct btframe *btsp;
+       int nsplit = 0;         /* number of pages to split */
+       s64 t64;
+       s64 next = 0;
+
+       INCREMENT(xtStat.search);
+
+       BT_CLR(btstack);
+
+       btstack->nsplit = 0;
+
+       /*
+        *      search down tree from root:
+        *
+        * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
+        * internal page, child page Pi contains entry with k, Ki <= K < Kj.
+        *
+        * if entry with search key K is not found
+        * internal page search find the entry with largest key Ki
+        * less than K which point to the child page to search;
+        * leaf page search find the entry with smallest key Kj
+        * greater than K so that the returned index is the position of
+        * the entry to be shifted right for insertion of new entry.
+        * for empty tree, search key is greater than any key of the tree.
+        *
+        * by convention, root bn = 0.
+        */
+       for (bn = 0;;) {
+               /* get/pin the page to search */
+               XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+               if (rc)
+                       return rc;
+
+               /* try sequential access heuristics with the previous
+                * access entry in target leaf page:
+                * once search narrowed down into the target leaf,
+                * key must either match an entry in the leaf or
+                * key entry does not exist in the tree;
+                */
+//fastSearch:
+               if ((jfs_ip->btorder & BT_SEQUENTIAL) &&
+                   (p->header.flag & BT_LEAF) &&
+                   (index = jfs_ip->btindex) <
+                   le16_to_cpu(p->header.nextindex)) {
+                       xad = &p->xad[index];
+                       t64 = offsetXAD(xad);
+                       if (xoff < t64 + lengthXAD(xad)) {
+                               if (xoff >= t64) {
+                                       *cmpp = 0;
+                                       goto out;
+                               }
+
+                               /* stop sequential access heuristics */
+                               goto binarySearch;
+                       } else {        /* (t64 + lengthXAD(xad)) <= xoff */
+
+                               /* try next sequential entry */
+                               index++;
+                               if (index <
+                                   le16_to_cpu(p->header.nextindex)) {
+                                       xad++;
+                                       t64 = offsetXAD(xad);
+                                       if (xoff < t64 + lengthXAD(xad)) {
+                                               if (xoff >= t64) {
+                                                       *cmpp = 0;
+                                                       goto out;
+                                               }
+
+                                               /* miss: key falls between
+                                                * previous and this entry
+                                                */
+                                               *cmpp = 1;
+                                               next = t64;
+                                               goto out;
+                                       }
+
+                                       /* (xoff >= t64 + lengthXAD(xad));
+                                        * matching entry may be further out:
+                                        * stop heuristic search
+                                        */
+                                       /* stop sequential access heuristics */
+                                       goto binarySearch;
+                               }
+
+                               /* (index == p->header.nextindex);
+                                * miss: key entry does not exist in
+                                * the target leaf/tree
+                                */
+                               *cmpp = 1;
+                               goto out;
+                       }
+
+                       /*
+                        * if hit, return index of the entry found, and
+                        * if miss, where new entry with search key is
+                        * to be inserted;
+                        */
+                     out:
+                       /* compute number of pages to split */
+                       if (flag & XT_INSERT) {
+                               if (p->header.nextindex ==      /* little-endian */
+                                   p->header.maxentry)
+                                       nsplit++;
+                               else
+                                       nsplit = 0;
+                               btstack->nsplit = nsplit;
+                       }
+
+                       /* save search result */
+                       btsp = btstack->top;
+                       btsp->bn = bn;
+                       btsp->index = index;
+                       btsp->mp = mp;
+
+                       /* update sequential access heuristics */
+                       jfs_ip->btindex = index;
+
+                       if (nextp)
+                               *nextp = next;
+
+                       INCREMENT(xtStat.fastSearch);
+                       return 0;
+               }
+
+               /* well, ... full search now */
+             binarySearch:
+               lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART;
+
+               /*
+                * binary search with search key K on the current page
+                */
+               for (base = XTENTRYSTART; lim; lim >>= 1) {
+                       index = base + (lim >> 1);
+
+                       XT_CMP(cmp, xoff, &p->xad[index], t64);
+                       if (cmp == 0) {
+                               /*
+                                *      search hit
+                                */
+                               /* search hit - leaf page:
+                                * return the entry found
+                                */
+                               if (p->header.flag & BT_LEAF) {
+                                       *cmpp = cmp;
+
+                                       /* compute number of pages to split */
+                                       if (flag & XT_INSERT) {
+                                               if (p->header.nextindex ==
+                                                   p->header.maxentry)
+                                                       nsplit++;
+                                               else
+                                                       nsplit = 0;
+                                               btstack->nsplit = nsplit;
+                                       }
+
+                                       /* save search result */
+                                       btsp = btstack->top;
+                                       btsp->bn = bn;
+                                       btsp->index = index;
+                                       btsp->mp = mp;
+
+                                       /* init sequential access heuristics */
+                                       btindex = jfs_ip->btindex;
+                                       if (index == btindex ||
+                                           index == btindex + 1)
+                                               jfs_ip->btorder = BT_SEQUENTIAL;
+                                       else
+                                               jfs_ip->btorder = BT_RANDOM;
+                                       jfs_ip->btindex = index;
+
+                                       return 0;
+                               }
+                               /* search hit - internal page:
+                                * descend/search its child page
+                                */
+                               if (index < le16_to_cpu(p->header.nextindex)-1)
+                                       next = offsetXAD(&p->xad[index + 1]);
+                               goto next;
+                       }
+
+                       if (cmp > 0) {
+                               base = index + 1;
+                               --lim;
+                       }
+               }
+
+               /*
+                *      search miss
+                *
+                * base is the smallest index with key (Kj) greater than
+                * search key (K) and may be zero or maxentry index.
+                */
+               if (base < le16_to_cpu(p->header.nextindex))
+                       next = offsetXAD(&p->xad[base]);
+               /*
+                * search miss - leaf page:
+                *
+                * return location of entry (base) where new entry with
+                * search key K is to be inserted.
+                */
+               if (p->header.flag & BT_LEAF) {
+                       *cmpp = cmp;
+
+                       /* compute number of pages to split */
+                       if (flag & XT_INSERT) {
+                               if (p->header.nextindex ==
+                                   p->header.maxentry)
+                                       nsplit++;
+                               else
+                                       nsplit = 0;
+                               btstack->nsplit = nsplit;
+                       }
+
+                       /* save search result */
+                       btsp = btstack->top;
+                       btsp->bn = bn;
+                       btsp->index = base;
+                       btsp->mp = mp;
+
+                       /* init sequential access heuristics */
+                       btindex = jfs_ip->btindex;
+                       if (base == btindex || base == btindex + 1)
+                               jfs_ip->btorder = BT_SEQUENTIAL;
+                       else
+                               jfs_ip->btorder = BT_RANDOM;
+                       jfs_ip->btindex = base;
+
+                       if (nextp)
+                               *nextp = next;
+
+                       return 0;
+               }
+
+               /*
+                * search miss - non-leaf page:
+                *
+                * if base is non-zero, decrement base by one to get the parent
+                * entry of the child page to search.
+                */
+               index = base ? base - 1 : base;
+
+               /*
+                * go down to child page
+                */
+             next:
+               /* update number of pages to split */
+               if (p->header.nextindex == p->header.maxentry)
+                       nsplit++;
+               else
+                       nsplit = 0;
+
+               /* push (bn, index) of the parent page/entry */
+               if (BT_STACK_FULL(btstack)) {
+                       jfs_error(ip->i_sb, "stack overrun!\n");
+                       XT_PUTPAGE(mp);
+                       return -EIO;
+               }
+               BT_PUSH(btstack, bn, index);
+
+               /* get the child page block number */
+               bn = addressXAD(&p->xad[index]);
+
+               /* unpin the parent page */
+               XT_PUTPAGE(mp);
+       }
+}
+
+/*
+ *     xtInsert()
+ *
+ * function:
+ *
+ * parameter:
+ *     tid     - transaction id;
+ *     ip      - file object;
+ *     xflag   - extent flag (XAD_NOTRECORDED):
+ *     xoff    - extent offset;
+ *     xlen    - extent length;
+ *     xaddrp  - extent address pointer (in/out):
+ *             if (*xaddrp)
+ *                     caller allocated data extent at *xaddrp;
+ *             else
+ *                     allocate data extent and return its xaddr;
+ *     flag    -
+ *
+ * return:
+ */
+int xtInsert(tid_t tid,                /* transaction id */
+            struct inode *ip, int xflag, s64 xoff, s32 xlen, s64 * xaddrp,
+            int flag)
+{
+       int rc = 0;
+       s64 xaddr, hint;
+       struct metapage *mp;    /* meta-page buffer */
+       xtpage_t *p;            /* base B+-tree index page */
+       s64 bn;
+       int index, nextindex;
+       struct btstack btstack; /* traverse stack */
+       struct xtsplit split;   /* split information */
+       xad_t *xad;
+       int cmp;
+       s64 next;
+       struct tlock *tlck;
+       struct xtlock *xtlck;
+
+       jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen);
+
+       /*
+        *      search for the entry location at which to insert:
+        *
+        * xtFastSearch() and xtSearch() both returns (leaf page
+        * pinned, index at which to insert).
+        * n.b. xtSearch() may return index of maxentry of
+        * the full page.
+        */
+       if ((rc = xtSearch(ip, xoff, &next, &cmp, &btstack, XT_INSERT)))
+               return rc;
+
+       /* retrieve search result */
+       XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+
+       /* This test must follow XT_GETSEARCH since mp must be valid if
+        * we branch to out: */
+       if ((cmp == 0) || (next && (xlen > next - xoff))) {
+               rc = -EEXIST;
+               goto out;
+       }
+
+       /*
+        * allocate data extent requested
+        *
+        * allocation hint: last xad
+        */
+       if ((xaddr = *xaddrp) == 0) {
+               if (index > XTENTRYSTART) {
+                       xad = &p->xad[index - 1];
+                       hint = addressXAD(xad) + lengthXAD(xad) - 1;
+               } else
+                       hint = 0;
+               if ((rc = dquot_alloc_block(ip, xlen)))
+                       goto out;
+               if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) {
+                       dquot_free_block(ip, xlen);
+                       goto out;
+               }
+       }
+
+       /*
+        *      insert entry for new extent
+        */
+       xflag |= XAD_NEW;
+
+       /*
+        *      if the leaf page is full, split the page and
+        *      propagate up the router entry for the new page from split
+        *
+        * The xtSplitUp() will insert the entry and unpin the leaf page.
+        */
+       nextindex = le16_to_cpu(p->header.nextindex);
+       if (nextindex == le16_to_cpu(p->header.maxentry)) {
+               split.mp = mp;
+               split.index = index;
+               split.flag = xflag;
+               split.off = xoff;
+               split.len = xlen;
+               split.addr = xaddr;
+               split.pxdlist = NULL;
+               if ((rc = xtSplitUp(tid, ip, &split, &btstack))) {
+                       /* undo data extent allocation */
+                       if (*xaddrp == 0) {
+                               dbFree(ip, xaddr, (s64) xlen);
+                               dquot_free_block(ip, xlen);
+                       }
+                       return rc;
+               }
+
+               *xaddrp = xaddr;
+               return 0;
+       }
+
+       /*
+        *      insert the new entry into the leaf page
+        */
+       /*
+        * acquire a transaction lock on the leaf page;
+        *
+        * action: xad insertion/extension;
+        */
+       BT_MARK_DIRTY(mp, ip);
+
+       /* if insert into middle, shift right remaining entries. */
+       if (index < nextindex)
+               memmove(&p->xad[index + 1], &p->xad[index],
+                       (nextindex - index) * sizeof(xad_t));
+
+       /* insert the new entry: mark the entry NEW */
+       xad = &p->xad[index];
+       XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
+
+       /* advance next available entry index */
+       le16_add_cpu(&p->header.nextindex, 1);
+
+       /* Don't log it if there are no links to the file */
+       if (!test_cflag(COMMIT_Nolink, ip)) {
+               tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
+               xtlck = (struct xtlock *) & tlck->lock;
+               xtlck->lwm.offset =
+                   (xtlck->lwm.offset) ? min(index,
+                                             (int)xtlck->lwm.offset) : index;
+               xtlck->lwm.length =
+                   le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
+       }
+
+       *xaddrp = xaddr;
+
+      out:
+       /* unpin the leaf page */
+       XT_PUTPAGE(mp);
+
+       return rc;
+}
+
+
+/*
+ *     xtSplitUp()
+ *
+ * function:
+ *     split full pages as propagating insertion up the tree
+ *
+ * parameter:
+ *     tid     - transaction id;
+ *     ip      - file object;
+ *     split   - entry parameter descriptor;
+ *     btstack - traverse stack from xtSearch()
+ *
+ * return:
+ */
+static int
+xtSplitUp(tid_t tid,
+         struct inode *ip, struct xtsplit * split, struct btstack * btstack)
+{
+       int rc = 0;
+       struct metapage *smp;
+       xtpage_t *sp;           /* split page */
+       struct metapage *rmp;
+       s64 rbn;                /* new right page block number */
+       struct metapage *rcmp;
+       xtpage_t *rcp;          /* right child page */
+       s64 rcbn;               /* right child page block number */
+       int skip;               /* index of entry of insertion */
+       int nextindex;          /* next available entry index of p */
+       struct btframe *parent; /* parent page entry on traverse stack */
+       xad_t *xad;
+       s64 xaddr;
+       int xlen;
+       int nsplit;             /* number of pages split */
+       struct pxdlist pxdlist;
+       pxd_t *pxd;
+       struct tlock *tlck;
+       struct xtlock *xtlck;
+
+       smp = split->mp;
+       sp = XT_PAGE(ip, smp);
+
+       /* is inode xtree root extension/inline EA area free ? */
+       if ((sp->header.flag & BT_ROOT) && (!S_ISDIR(ip->i_mode)) &&
+           (le16_to_cpu(sp->header.maxentry) < XTROOTMAXSLOT) &&
+           (JFS_IP(ip)->mode2 & INLINEEA)) {
+               sp->header.maxentry = cpu_to_le16(XTROOTMAXSLOT);
+               JFS_IP(ip)->mode2 &= ~INLINEEA;
+
+               BT_MARK_DIRTY(smp, ip);
+               /*
+                * acquire a transaction lock on the leaf page;
+                *
+                * action: xad insertion/extension;
+                */
+
+               /* if insert into middle, shift right remaining entries. */
+               skip = split->index;
+               nextindex = le16_to_cpu(sp->header.nextindex);
+               if (skip < nextindex)
+                       memmove(&sp->xad[skip + 1], &sp->xad[skip],
+                               (nextindex - skip) * sizeof(xad_t));
+
+               /* insert the new entry: mark the entry NEW */
+               xad = &sp->xad[skip];
+               XT_PUTENTRY(xad, split->flag, split->off, split->len,
+                           split->addr);
+
+               /* advance next available entry index */
+               le16_add_cpu(&sp->header.nextindex, 1);
+
+               /* Don't log it if there are no links to the file */
+               if (!test_cflag(COMMIT_Nolink, ip)) {
+                       tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW);
+                       xtlck = (struct xtlock *) & tlck->lock;
+                       xtlck->lwm.offset = (xtlck->lwm.offset) ?
+                           min(skip, (int)xtlck->lwm.offset) : skip;
+                       xtlck->lwm.length =
+                           le16_to_cpu(sp->header.nextindex) -
+                           xtlck->lwm.offset;
+               }
+
+               return 0;
+       }
+
+       /*
+        * allocate new index blocks to cover index page split(s)
+        *
+        * allocation hint: ?
+        */
+       if (split->pxdlist == NULL) {
+               nsplit = btstack->nsplit;
+               split->pxdlist = &pxdlist;
+               pxdlist.maxnpxd = pxdlist.npxd = 0;
+               pxd = &pxdlist.pxd[0];
+               xlen = JFS_SBI(ip->i_sb)->nbperpage;
+               for (; nsplit > 0; nsplit--, pxd++) {
+                       if ((rc = dbAlloc(ip, (s64) 0, (s64) xlen, &xaddr))
+                           == 0) {
+                               PXDaddress(pxd, xaddr);
+                               PXDlength(pxd, xlen);
+
+                               pxdlist.maxnpxd++;
+
+                               continue;
+                       }
+
+                       /* undo allocation */
+
+                       XT_PUTPAGE(smp);
+                       return rc;
+               }
+       }
+
+       /*
+        * Split leaf page <sp> into <sp> and a new right page <rp>.
+        *
+        * The split routines insert the new entry into the leaf page,
+        * and acquire txLock as appropriate.
+        * return <rp> pinned and its block number <rpbn>.
+        */
+       rc = (sp->header.flag & BT_ROOT) ?
+           xtSplitRoot(tid, ip, split, &rmp) :
+           xtSplitPage(tid, ip, split, &rmp, &rbn);
+
+       XT_PUTPAGE(smp);
+
+       if (rc)
+               return -EIO;
+       /*
+        * propagate up the router entry for the leaf page just split
+        *
+        * insert a router entry for the new page into the parent page,
+        * propagate the insert/split up the tree by walking back the stack
+        * of (bn of parent page, index of child page entry in parent page)
+        * that were traversed during the search for the page that split.
+        *
+        * the propagation of insert/split up the tree stops if the root
+        * splits or the page inserted into doesn't have to split to hold
+        * the new entry.
+        *
+        * the parent entry for the split page remains the same, and
+        * a new entry is inserted at its right with the first key and
+        * block number of the new right page.
+        *
+        * There are a maximum of 3 pages pinned at any time:
+        * right child, left parent and right parent (when the parent splits)
+        * to keep the child page pinned while working on the parent.
+        * make sure that all pins are released at exit.
+        */
+       while ((parent = BT_POP(btstack)) != NULL) {
+               /* parent page specified by stack frame <parent> */
+
+               /* keep current child pages <rcp> pinned */
+               rcmp = rmp;
+               rcbn = rbn;
+               rcp = XT_PAGE(ip, rcmp);
+
+               /*
+                * insert router entry in parent for new right child page <rp>
+                */
+               /* get/pin the parent page <sp> */
+               XT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc);
+               if (rc) {
+                       XT_PUTPAGE(rcmp);
+                       return rc;
+               }
+
+               /*
+                * The new key entry goes ONE AFTER the index of parent entry,
+                * because the split was to the right.
+                */
+               skip = parent->index + 1;
+
+               /*
+                * split or shift right remaining entries of the parent page
+                */
+               nextindex = le16_to_cpu(sp->header.nextindex);
+               /*
+                * parent page is full - split the parent page
+                */
+               if (nextindex == le16_to_cpu(sp->header.maxentry)) {
+                       /* init for parent page split */
+                       split->mp = smp;
+                       split->index = skip;    /* index at insert */
+                       split->flag = XAD_NEW;
+                       split->off = offsetXAD(&rcp->xad[XTENTRYSTART]);
+                       split->len = JFS_SBI(ip->i_sb)->nbperpage;
+                       split->addr = rcbn;
+
+                       /* unpin previous right child page */
+                       XT_PUTPAGE(rcmp);
+
+                       /* The split routines insert the new entry,
+                        * and acquire txLock as appropriate.
+                        * return <rp> pinned and its block number <rpbn>.
+                        */
+                       rc = (sp->header.flag & BT_ROOT) ?
+                           xtSplitRoot(tid, ip, split, &rmp) :
+                           xtSplitPage(tid, ip, split, &rmp, &rbn);
+                       if (rc) {
+                               XT_PUTPAGE(smp);
+                               return rc;
+                       }
+
+                       XT_PUTPAGE(smp);
+                       /* keep new child page <rp> pinned */
+               }
+               /*
+                * parent page is not full - insert in parent page
+                */
+               else {
+                       /*
+                        * insert router entry in parent for the right child
+                        * page from the first entry of the right child page:
+                        */
+                       /*
+                        * acquire a transaction lock on the parent page;
+                        *
+                        * action: router xad insertion;
+                        */
+                       BT_MARK_DIRTY(smp, ip);
+
+                       /*
+                        * if insert into middle, shift right remaining entries
+                        */
+                       if (skip < nextindex)
+                               memmove(&sp->xad[skip + 1], &sp->xad[skip],
+                                       (nextindex -
+                                        skip) << L2XTSLOTSIZE);
+
+                       /* insert the router entry */
+                       xad = &sp->xad[skip];
+                       XT_PUTENTRY(xad, XAD_NEW,
+                                   offsetXAD(&rcp->xad[XTENTRYSTART]),
+                                   JFS_SBI(ip->i_sb)->nbperpage, rcbn);
+
+                       /* advance next available entry index. */
+                       le16_add_cpu(&sp->header.nextindex, 1);
+
+                       /* Don't log it if there are no links to the file */
+                       if (!test_cflag(COMMIT_Nolink, ip)) {
+                               tlck = txLock(tid, ip, smp,
+                                             tlckXTREE | tlckGROW);
+                               xtlck = (struct xtlock *) & tlck->lock;
+                               xtlck->lwm.offset = (xtlck->lwm.offset) ?
+                                   min(skip, (int)xtlck->lwm.offset) : skip;
+                               xtlck->lwm.length =
+                                   le16_to_cpu(sp->header.nextindex) -
+                                   xtlck->lwm.offset;
+                       }
+
+                       /* unpin parent page */
+                       XT_PUTPAGE(smp);
+
+                       /* exit propagate up */
+                       break;
+               }
+       }
+
+       /* unpin current right page */
+       XT_PUTPAGE(rmp);
+
+       return 0;
+}
+
+
+/*
+ *     xtSplitPage()
+ *
+ * function:
+ *     split a full non-root page into
+ *     original/split/left page and new right page
+ *     i.e., the original/split page remains as left page.
+ *
+ * parameter:
+ *     int             tid,
+ *     struct inode    *ip,
+ *     struct xtsplit  *split,
+ *     struct metapage **rmpp,
+ *     u64             *rbnp,
+ *
+ * return:
+ *     Pointer to page in which to insert or NULL on error.
+ */
+static int
+xtSplitPage(tid_t tid, struct inode *ip,
+           struct xtsplit * split, struct metapage ** rmpp, s64 * rbnp)
+{
+       int rc = 0;
+       struct metapage *smp;
+       xtpage_t *sp;
+       struct metapage *rmp;
+       xtpage_t *rp;           /* new right page allocated */
+       s64 rbn;                /* new right page block number */
+       struct metapage *mp;
+       xtpage_t *p;
+       s64 nextbn;
+       int skip, maxentry, middle, righthalf, n;
+       xad_t *xad;
+       struct pxdlist *pxdlist;
+       pxd_t *pxd;
+       struct tlock *tlck;
+       struct xtlock *sxtlck = NULL, *rxtlck = NULL;
+       int quota_allocation = 0;
+
+       smp = split->mp;
+       sp = XT_PAGE(ip, smp);
+
+       INCREMENT(xtStat.split);
+
+       pxdlist = split->pxdlist;
+       pxd = &pxdlist->pxd[pxdlist->npxd];
+       pxdlist->npxd++;
+       rbn = addressPXD(pxd);
+
+       /* Allocate blocks to quota. */
+       rc = dquot_alloc_block(ip, lengthPXD(pxd));
+       if (rc)
+               goto clean_up;
+
+       quota_allocation += lengthPXD(pxd);
+
+       /*
+        * allocate the new right page for the split
+        */
+       rmp = get_metapage(ip, rbn, PSIZE, 1);
+       if (rmp == NULL) {
+               rc = -EIO;
+               goto clean_up;
+       }
+
+       jfs_info("xtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp);
+
+       BT_MARK_DIRTY(rmp, ip);
+       /*
+        * action: new page;
+        */
+
+       rp = (xtpage_t *) rmp->data;
+       rp->header.self = *pxd;
+       rp->header.flag = sp->header.flag & BT_TYPE;
+       rp->header.maxentry = sp->header.maxentry;      /* little-endian */
+       rp->header.nextindex = cpu_to_le16(XTENTRYSTART);
+
+       BT_MARK_DIRTY(smp, ip);
+       /* Don't log it if there are no links to the file */
+       if (!test_cflag(COMMIT_Nolink, ip)) {
+               /*
+                * acquire a transaction lock on the new right page;
+                */
+               tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW);
+               rxtlck = (struct xtlock *) & tlck->lock;
+               rxtlck->lwm.offset = XTENTRYSTART;
+               /*
+                * acquire a transaction lock on the split page
+                */
+               tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW);
+               sxtlck = (struct xtlock *) & tlck->lock;
+       }
+
+       /*
+        * initialize/update sibling pointers of <sp> and <rp>
+        */
+       nextbn = le64_to_cpu(sp->header.next);
+       rp->header.next = cpu_to_le64(nextbn);
+       rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self));
+       sp->header.next = cpu_to_le64(rbn);
+
+       skip = split->index;
+
+       /*
+        *      sequential append at tail (after last entry of last page)
+        *
+        * if splitting the last page on a level because of appending
+        * a entry to it (skip is maxentry), it's likely that the access is
+        * sequential. adding an empty page on the side of the level is less
+        * work and can push the fill factor much higher than normal.
+        * if we're wrong it's no big deal -  we will do the split the right
+        * way next time.
+        * (it may look like it's equally easy to do a similar hack for
+        * reverse sorted data, that is, split the tree left, but it's not.
+        * Be my guest.)
+        */
+       if (nextbn == 0 && skip == le16_to_cpu(sp->header.maxentry)) {
+               /*
+                * acquire a transaction lock on the new/right page;
+                *
+                * action: xad insertion;
+                */
+               /* insert entry at the first entry of the new right page */
+               xad = &rp->xad[XTENTRYSTART];
+               XT_PUTENTRY(xad, split->flag, split->off, split->len,
+                           split->addr);
+
+               rp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1);
+
+               if (!test_cflag(COMMIT_Nolink, ip)) {
+                       /* rxtlck->lwm.offset = XTENTRYSTART; */
+                       rxtlck->lwm.length = 1;
+               }
+
+               *rmpp = rmp;
+               *rbnp = rbn;
+
+               jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp);
+               return 0;
+       }
+
+       /*
+        *      non-sequential insert (at possibly middle page)
+        */
+
+       /*
+        * update previous pointer of old next/right page of <sp>
+        */
+       if (nextbn != 0) {
+               XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
+               if (rc) {
+                       XT_PUTPAGE(rmp);
+                       goto clean_up;
+               }
+
+               BT_MARK_DIRTY(mp, ip);
+               /*
+                * acquire a transaction lock on the next page;
+                *
+                * action:sibling pointer update;
+                */
+               if (!test_cflag(COMMIT_Nolink, ip))
+                       tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
+
+               p->header.prev = cpu_to_le64(rbn);
+
+               /* sibling page may have been updated previously, or
+                * it may be updated later;
+                */
+
+               XT_PUTPAGE(mp);
+       }
+
+       /*
+        * split the data between the split and new/right pages
+        */
+       maxentry = le16_to_cpu(sp->header.maxentry);
+       middle = maxentry >> 1;
+       righthalf = maxentry - middle;
+
+       /*
+        * skip index in old split/left page - insert into left page:
+        */
+       if (skip <= middle) {
+               /* move right half of split page to the new right page */
+               memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle],
+                       righthalf << L2XTSLOTSIZE);
+
+               /* shift right tail of left half to make room for new entry */
+               if (skip < middle)
+                       memmove(&sp->xad[skip + 1], &sp->xad[skip],
+                               (middle - skip) << L2XTSLOTSIZE);
+
+               /* insert new entry */
+               xad = &sp->xad[skip];
+               XT_PUTENTRY(xad, split->flag, split->off, split->len,
+                           split->addr);
+
+               /* update page header */
+               sp->header.nextindex = cpu_to_le16(middle + 1);
+               if (!test_cflag(COMMIT_Nolink, ip)) {
+                       sxtlck->lwm.offset = (sxtlck->lwm.offset) ?
+                           min(skip, (int)sxtlck->lwm.offset) : skip;
+               }
+
+               rp->header.nextindex =
+                   cpu_to_le16(XTENTRYSTART + righthalf);
+       }
+       /*
+        * skip index in new right page - insert into right page:
+        */
+       else {
+               /* move left head of right half to right page */
+               n = skip - middle;
+               memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle],
+                       n << L2XTSLOTSIZE);
+
+               /* insert new entry */
+               n += XTENTRYSTART;
+               xad = &rp->xad[n];
+               XT_PUTENTRY(xad, split->flag, split->off, split->len,
+                           split->addr);
+
+               /* move right tail of right half to right page */
+               if (skip < maxentry)
+                       memmove(&rp->xad[n + 1], &sp->xad[skip],
+                               (maxentry - skip) << L2XTSLOTSIZE);
+
+               /* update page header */
+               sp->header.nextindex = cpu_to_le16(middle);
+               if (!test_cflag(COMMIT_Nolink, ip)) {
+                       sxtlck->lwm.offset = (sxtlck->lwm.offset) ?
+                           min(middle, (int)sxtlck->lwm.offset) : middle;
+               }
+
+               rp->header.nextindex = cpu_to_le16(XTENTRYSTART +
+                                                  righthalf + 1);
+       }
+
+       if (!test_cflag(COMMIT_Nolink, ip)) {
+               sxtlck->lwm.length = le16_to_cpu(sp->header.nextindex) -
+                   sxtlck->lwm.offset;
+
+               /* rxtlck->lwm.offset = XTENTRYSTART; */
+               rxtlck->lwm.length = le16_to_cpu(rp->header.nextindex) -
+                   XTENTRYSTART;
+       }
+
+       *rmpp = rmp;
+       *rbnp = rbn;
+
+       jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp);
+       return rc;
+
+      clean_up:
+
+       /* Rollback quota allocation. */
+       if (quota_allocation)
+               dquot_free_block(ip, quota_allocation);
+
+       return (rc);
+}
+
+
+/*
+ *     xtSplitRoot()
+ *
+ * function:
+ *     split the full root page into original/root/split page and new
+ *     right page
+ *     i.e., root remains fixed in tree anchor (inode) and the root is
+ *     copied to a single new right child page since root page <<
+ *     non-root page, and the split root page contains a single entry
+ *     for the new right child page.
+ *
+ * parameter:
+ *     int             tid,
+ *     struct inode    *ip,
+ *     struct xtsplit  *split,
+ *     struct metapage **rmpp)
+ *
+ * return:
+ *     Pointer to page in which to insert or NULL on error.
+ */
+static int
+xtSplitRoot(tid_t tid,
+           struct inode *ip, struct xtsplit * split, struct metapage ** rmpp)
+{
+       xtpage_t *sp;
+       struct metapage *rmp;
+       xtpage_t *rp;
+       s64 rbn;
+       int skip, nextindex;
+       xad_t *xad;
+       pxd_t *pxd;
+       struct pxdlist *pxdlist;
+       struct tlock *tlck;
+       struct xtlock *xtlck;
+       int rc;
+
+       sp = &JFS_IP(ip)->i_xtroot;
+
+       INCREMENT(xtStat.split);
+
+       /*
+        *      allocate a single (right) child page
+        */
+       pxdlist = split->pxdlist;
+       pxd = &pxdlist->pxd[pxdlist->npxd];
+       pxdlist->npxd++;
+       rbn = addressPXD(pxd);
+       rmp = get_metapage(ip, rbn, PSIZE, 1);
+       if (rmp == NULL)
+               return -EIO;
+
+       /* Allocate blocks to quota. */
+       rc = dquot_alloc_block(ip, lengthPXD(pxd));
+       if (rc) {
+               release_metapage(rmp);
+               return rc;
+       }
+
+       jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp);
+
+       /*
+        * acquire a transaction lock on the new right page;
+        *
+        * action: new page;
+        */
+       BT_MARK_DIRTY(rmp, ip);
+
+       rp = (xtpage_t *) rmp->data;
+       rp->header.flag =
+           (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL;
+       rp->header.self = *pxd;
+       rp->header.nextindex = cpu_to_le16(XTENTRYSTART);
+       rp->header.maxentry = cpu_to_le16(PSIZE >> L2XTSLOTSIZE);
+
+       /* initialize sibling pointers */
+       rp->header.next = 0;
+       rp->header.prev = 0;
+
+       /*
+        * copy the in-line root page into new right page extent
+        */
+       nextindex = le16_to_cpu(sp->header.maxentry);
+       memmove(&rp->xad[XTENTRYSTART], &sp->xad[XTENTRYSTART],
+               (nextindex - XTENTRYSTART) << L2XTSLOTSIZE);
+
+       /*
+        * insert the new entry into the new right/child page
+        * (skip index in the new right page will not change)
+        */
+       skip = split->index;
+       /* if insert into middle, shift right remaining entries */
+       if (skip != nextindex)
+               memmove(&rp->xad[skip + 1], &rp->xad[skip],
+                       (nextindex - skip) * sizeof(xad_t));
+
+       xad = &rp->xad[skip];
+       XT_PUTENTRY(xad, split->flag, split->off, split->len, split->addr);
+
+       /* update page header */
+       rp->header.nextindex = cpu_to_le16(nextindex + 1);
+
+       if (!test_cflag(COMMIT_Nolink, ip)) {
+               tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW);
+               xtlck = (struct xtlock *) & tlck->lock;
+               xtlck->lwm.offset = XTENTRYSTART;
+               xtlck->lwm.length = le16_to_cpu(rp->header.nextindex) -
+                   XTENTRYSTART;
+       }
+
+       /*
+        *      reset the root
+        *
+        * init root with the single entry for the new right page
+        * set the 1st entry offset to 0, which force the left-most key
+        * at any level of the tree to be less than any search key.
+        */
+       /*
+        * acquire a transaction lock on the root page (in-memory inode);
+        *
+        * action: root split;
+        */
+       BT_MARK_DIRTY(split->mp, ip);
+
+       xad = &sp->xad[XTENTRYSTART];
+       XT_PUTENTRY(xad, XAD_NEW, 0, JFS_SBI(ip->i_sb)->nbperpage, rbn);
+
+       /* update page header of root */
+       sp->header.flag &= ~BT_LEAF;
+       sp->header.flag |= BT_INTERNAL;
+
+       sp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1);
+
+       if (!test_cflag(COMMIT_Nolink, ip)) {
+               tlck = txLock(tid, ip, split->mp, tlckXTREE | tlckGROW);
+               xtlck = (struct xtlock *) & tlck->lock;
+               xtlck->lwm.offset = XTENTRYSTART;
+               xtlck->lwm.length = 1;
+       }
+
+       *rmpp = rmp;
+
+       jfs_info("xtSplitRoot: sp:0x%p rp:0x%p", sp, rp);
+       return 0;
+}
+
+
+/*
+ *     xtExtend()
+ *
+ * function: extend in-place;
+ *
+ * note: existing extent may or may not have been committed.
+ * caller is responsible for pager buffer cache update, and
+ * working block allocation map update;
+ * update pmap: alloc whole extended extent;
+ */
+int xtExtend(tid_t tid,                /* transaction id */
+            struct inode *ip, s64 xoff,        /* delta extent offset */
+            s32 xlen,          /* delta extent length */
+            int flag)
+{
+       int rc = 0;
+       int cmp;
+       struct metapage *mp;    /* meta-page buffer */
+       xtpage_t *p;            /* base B+-tree index page */
+       s64 bn;
+       int index, nextindex, len;
+       struct btstack btstack; /* traverse stack */
+       struct xtsplit split;   /* split information */
+       xad_t *xad;
+       s64 xaddr;
+       struct tlock *tlck;
+       struct xtlock *xtlck = NULL;
+
+       jfs_info("xtExtend: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen);
+
+       /* there must exist extent to be extended */
+       if ((rc = xtSearch(ip, xoff - 1, NULL, &cmp, &btstack, XT_INSERT)))
+               return rc;
+
+       /* retrieve search result */
+       XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+
+       if (cmp != 0) {
+               XT_PUTPAGE(mp);
+               jfs_error(ip->i_sb, "xtSearch did not find extent\n");
+               return -EIO;
+       }
+
+       /* extension must be contiguous */
+       xad = &p->xad[index];
+       if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) {
+               XT_PUTPAGE(mp);
+               jfs_error(ip->i_sb, "extension is not contiguous\n");
+               return -EIO;
+       }
+
+       /*
+        * acquire a transaction lock on the leaf page;
+        *
+        * action: xad insertion/extension;
+        */
+       BT_MARK_DIRTY(mp, ip);
+       if (!test_cflag(COMMIT_Nolink, ip)) {
+               tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
+               xtlck = (struct xtlock *) & tlck->lock;
+       }
+
+       /* extend will overflow extent ? */
+       xlen = lengthXAD(xad) + xlen;
+       if ((len = xlen - MAXXLEN) <= 0)
+               goto extendOld;
+
+       /*
+        *      extent overflow: insert entry for new extent
+        */
+//insertNew:
+       xoff = offsetXAD(xad) + MAXXLEN;
+       xaddr = addressXAD(xad) + MAXXLEN;
+       nextindex = le16_to_cpu(p->header.nextindex);
+
+       /*
+        *      if the leaf page is full, insert the new entry and
+        *      propagate up the router entry for the new page from split
+        *
+        * The xtSplitUp() will insert the entry and unpin the leaf page.
+        */
+       if (nextindex == le16_to_cpu(p->header.maxentry)) {
+               /* xtSpliUp() unpins leaf pages */
+               split.mp = mp;
+               split.index = index + 1;
+               split.flag = XAD_NEW;
+               split.off = xoff;       /* split offset */
+               split.len = len;
+               split.addr = xaddr;
+               split.pxdlist = NULL;
+               if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
+                       return rc;
+
+               /* get back old page */
+               XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+               if (rc)
+                       return rc;
+               /*
+                * if leaf root has been split, original root has been
+                * copied to new child page, i.e., original entry now
+                * resides on the new child page;
+                */
+               if (p->header.flag & BT_INTERNAL) {
+                       ASSERT(p->header.nextindex ==
+                              cpu_to_le16(XTENTRYSTART + 1));
+                       xad = &p->xad[XTENTRYSTART];
+                       bn = addressXAD(xad);
+                       XT_PUTPAGE(mp);
+
+                       /* get new child page */
+                       XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                       if (rc)
+                               return rc;
+
+                       BT_MARK_DIRTY(mp, ip);
+                       if (!test_cflag(COMMIT_Nolink, ip)) {
+                               tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
+                               xtlck = (struct xtlock *) & tlck->lock;
+                       }
+               }
+       }
+       /*
+        *      insert the new entry into the leaf page
+        */
+       else {
+               /* insert the new entry: mark the entry NEW */
+               xad = &p->xad[index + 1];
+               XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr);
+
+               /* advance next available entry index */
+               le16_add_cpu(&p->header.nextindex, 1);
+       }
+
+       /* get back old entry */
+       xad = &p->xad[index];
+       xlen = MAXXLEN;
+
+       /*
+        * extend old extent
+        */
+      extendOld:
+       XADlength(xad, xlen);
+       if (!(xad->flag & XAD_NEW))
+               xad->flag |= XAD_EXTENDED;
+
+       if (!test_cflag(COMMIT_Nolink, ip)) {
+               xtlck->lwm.offset =
+                   (xtlck->lwm.offset) ? min(index,
+                                             (int)xtlck->lwm.offset) : index;
+               xtlck->lwm.length =
+                   le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
+       }
+
+       /* unpin the leaf page */
+       XT_PUTPAGE(mp);
+
+       return rc;
+}
+
+#ifdef _NOTYET
+/*
+ *     xtTailgate()
+ *
+ * function: split existing 'tail' extent
+ *     (split offset >= start offset of tail extent), and
+ *     relocate and extend the split tail half;
+ *
+ * note: existing extent may or may not have been committed.
+ * caller is responsible for pager buffer cache update, and
+ * working block allocation map update;
+ * update pmap: free old split tail extent, alloc new extent;
+ */
+int xtTailgate(tid_t tid,              /* transaction id */
+              struct inode *ip, s64 xoff,      /* split/new extent offset */
+              s32 xlen,        /* new extent length */
+              s64 xaddr,       /* new extent address */
+              int flag)
+{
+       int rc = 0;
+       int cmp;
+       struct metapage *mp;    /* meta-page buffer */
+       xtpage_t *p;            /* base B+-tree index page */
+       s64 bn;
+       int index, nextindex, llen, rlen;
+       struct btstack btstack; /* traverse stack */
+       struct xtsplit split;   /* split information */
+       xad_t *xad;
+       struct tlock *tlck;
+       struct xtlock *xtlck = 0;
+       struct tlock *mtlck;
+       struct maplock *pxdlock;
+
+/*
+printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
+       (ulong)xoff, xlen, (ulong)xaddr);
+*/
+
+       /* there must exist extent to be tailgated */
+       if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, XT_INSERT)))
+               return rc;
+
+       /* retrieve search result */
+       XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+
+       if (cmp != 0) {
+               XT_PUTPAGE(mp);
+               jfs_error(ip->i_sb, "couldn't find extent\n");
+               return -EIO;
+       }
+
+       /* entry found must be last entry */
+       nextindex = le16_to_cpu(p->header.nextindex);
+       if (index != nextindex - 1) {
+               XT_PUTPAGE(mp);
+               jfs_error(ip->i_sb, "the entry found is not the last entry\n");
+               return -EIO;
+       }
+
+       BT_MARK_DIRTY(mp, ip);
+       /*
+        * acquire tlock of the leaf page containing original entry
+        */
+       if (!test_cflag(COMMIT_Nolink, ip)) {
+               tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
+               xtlck = (struct xtlock *) & tlck->lock;
+       }
+
+       /* completely replace extent ? */
+       xad = &p->xad[index];
+/*
+printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
+       (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad));
+*/
+       if ((llen = xoff - offsetXAD(xad)) == 0)
+               goto updateOld;
+
+       /*
+        *      partially replace extent: insert entry for new extent
+        */
+//insertNew:
+       /*
+        *      if the leaf page is full, insert the new entry and
+        *      propagate up the router entry for the new page from split
+        *
+        * The xtSplitUp() will insert the entry and unpin the leaf page.
+        */
+       if (nextindex == le16_to_cpu(p->header.maxentry)) {
+               /* xtSpliUp() unpins leaf pages */
+               split.mp = mp;
+               split.index = index + 1;
+               split.flag = XAD_NEW;
+               split.off = xoff;       /* split offset */
+               split.len = xlen;
+               split.addr = xaddr;
+               split.pxdlist = NULL;
+               if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
+                       return rc;
+
+               /* get back old page */
+               XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+               if (rc)
+                       return rc;
+               /*
+                * if leaf root has been split, original root has been
+                * copied to new child page, i.e., original entry now
+                * resides on the new child page;
+                */
+               if (p->header.flag & BT_INTERNAL) {
+                       ASSERT(p->header.nextindex ==
+                              cpu_to_le16(XTENTRYSTART + 1));
+                       xad = &p->xad[XTENTRYSTART];
+                       bn = addressXAD(xad);
+                       XT_PUTPAGE(mp);
+
+                       /* get new child page */
+                       XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                       if (rc)
+                               return rc;
+
+                       BT_MARK_DIRTY(mp, ip);
+                       if (!test_cflag(COMMIT_Nolink, ip)) {
+                               tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
+                               xtlck = (struct xtlock *) & tlck->lock;
+                       }
+               }
+       }
+       /*
+        *      insert the new entry into the leaf page
+        */
+       else {
+               /* insert the new entry: mark the entry NEW */
+               xad = &p->xad[index + 1];
+               XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);
+
+               /* advance next available entry index */
+               le16_add_cpu(&p->header.nextindex, 1);
+       }
+
+       /* get back old XAD */
+       xad = &p->xad[index];
+
+       /*
+        * truncate/relocate old extent at split offset
+        */
+      updateOld:
+       /* update dmap for old/committed/truncated extent */
+       rlen = lengthXAD(xad) - llen;
+       if (!(xad->flag & XAD_NEW)) {
+               /* free from PWMAP at commit */
+               if (!test_cflag(COMMIT_Nolink, ip)) {
+                       mtlck = txMaplock(tid, ip, tlckMAP);
+                       pxdlock = (struct maplock *) & mtlck->lock;
+                       pxdlock->flag = mlckFREEPXD;
+                       PXDaddress(&pxdlock->pxd, addressXAD(xad) + llen);
+                       PXDlength(&pxdlock->pxd, rlen);
+                       pxdlock->index = 1;
+               }
+       } else
+               /* free from WMAP */
+               dbFree(ip, addressXAD(xad) + llen, (s64) rlen);
+
+       if (llen)
+               /* truncate */
+               XADlength(xad, llen);
+       else
+               /* replace */
+               XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);
+
+       if (!test_cflag(COMMIT_Nolink, ip)) {
+               xtlck->lwm.offset = (xtlck->lwm.offset) ?
+                   min(index, (int)xtlck->lwm.offset) : index;
+               xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
+                   xtlck->lwm.offset;
+       }
+
+       /* unpin the leaf page */
+       XT_PUTPAGE(mp);
+
+       return rc;
+}
+#endif /* _NOTYET */
+
+/*
+ *     xtUpdate()
+ *
+ * function: update XAD;
+ *
+ *     update extent for allocated_but_not_recorded or
+ *     compressed extent;
+ *
+ * parameter:
+ *     nxad    - new XAD;
+ *             logical extent of the specified XAD must be completely
+ *             contained by an existing XAD;
+ */
+int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
+{                              /* new XAD */
+       int rc = 0;
+       int cmp;
+       struct metapage *mp;    /* meta-page buffer */
+       xtpage_t *p;            /* base B+-tree index page */
+       s64 bn;
+       int index0, index, newindex, nextindex;
+       struct btstack btstack; /* traverse stack */
+       struct xtsplit split;   /* split information */
+       xad_t *xad, *lxad, *rxad;
+       int xflag;
+       s64 nxoff, xoff;
+       int nxlen, xlen, lxlen, rxlen;
+       s64 nxaddr, xaddr;
+       struct tlock *tlck;
+       struct xtlock *xtlck = NULL;
+       int newpage = 0;
+
+       /* there must exist extent to be tailgated */
+       nxoff = offsetXAD(nxad);
+       nxlen = lengthXAD(nxad);
+       nxaddr = addressXAD(nxad);
+
+       if ((rc = xtSearch(ip, nxoff, NULL, &cmp, &btstack, XT_INSERT)))
+               return rc;
+
+       /* retrieve search result */
+       XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0);
+
+       if (cmp != 0) {
+               XT_PUTPAGE(mp);
+               jfs_error(ip->i_sb, "Could not find extent\n");
+               return -EIO;
+       }
+
+       BT_MARK_DIRTY(mp, ip);
+       /*
+        * acquire tlock of the leaf page containing original entry
+        */
+       if (!test_cflag(COMMIT_Nolink, ip)) {
+               tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
+               xtlck = (struct xtlock *) & tlck->lock;
+       }
+
+       xad = &p->xad[index0];
+       xflag = xad->flag;
+       xoff = offsetXAD(xad);
+       xlen = lengthXAD(xad);
+       xaddr = addressXAD(xad);
+
+       /* nXAD must be completely contained within XAD */
+       if ((xoff > nxoff) ||
+           (nxoff + nxlen > xoff + xlen)) {
+               XT_PUTPAGE(mp);
+               jfs_error(ip->i_sb,
+                         "nXAD in not completely contained within XAD\n");
+               return -EIO;
+       }
+
+       index = index0;
+       newindex = index + 1;
+       nextindex = le16_to_cpu(p->header.nextindex);
+
+#ifdef  _JFS_WIP_NOCOALESCE
+       if (xoff < nxoff)
+               goto updateRight;
+
+       /*
+        * replace XAD with nXAD
+        */
+      replace:                 /* (nxoff == xoff) */
+       if (nxlen == xlen) {
+               /* replace XAD with nXAD:recorded */
+               *xad = *nxad;
+               xad->flag = xflag & ~XAD_NOTRECORDED;
+
+               goto out;
+       } else                  /* (nxlen < xlen) */
+               goto updateLeft;
+#endif                         /* _JFS_WIP_NOCOALESCE */
+
+/* #ifdef _JFS_WIP_COALESCE */
+       if (xoff < nxoff)
+               goto coalesceRight;
+
+       /*
+        * coalesce with left XAD
+        */
+//coalesceLeft: /* (xoff == nxoff) */
+       /* is XAD first entry of page ? */
+       if (index == XTENTRYSTART)
+               goto replace;
+
+       /* is nXAD logically and physically contiguous with lXAD ? */
+       lxad = &p->xad[index - 1];
+       lxlen = lengthXAD(lxad);
+       if (!(lxad->flag & XAD_NOTRECORDED) &&
+           (nxoff == offsetXAD(lxad) + lxlen) &&
+           (nxaddr == addressXAD(lxad) + lxlen) &&
+           (lxlen + nxlen < MAXXLEN)) {
+               /* extend right lXAD */
+               index0 = index - 1;
+               XADlength(lxad, lxlen + nxlen);
+
+               /* If we just merged two extents together, need to make sure the
+                * right extent gets logged.  If the left one is marked XAD_NEW,
+                * then we know it will be logged.  Otherwise, mark as
+                * XAD_EXTENDED
+                */
+               if (!(lxad->flag & XAD_NEW))
+                       lxad->flag |= XAD_EXTENDED;
+
+               if (xlen > nxlen) {
+                       /* truncate XAD */
+                       XADoffset(xad, xoff + nxlen);
+                       XADlength(xad, xlen - nxlen);
+                       XADaddress(xad, xaddr + nxlen);
+                       goto out;
+               } else {        /* (xlen == nxlen) */
+
+                       /* remove XAD */
+                       if (index < nextindex - 1)
+                               memmove(&p->xad[index], &p->xad[index + 1],
+                                       (nextindex - index -
+                                        1) << L2XTSLOTSIZE);
+
+                       p->header.nextindex =
+                           cpu_to_le16(le16_to_cpu(p->header.nextindex) -
+                                       1);
+
+                       index = index0;
+                       newindex = index + 1;
+                       nextindex = le16_to_cpu(p->header.nextindex);
+                       xoff = nxoff = offsetXAD(lxad);
+                       xlen = nxlen = lxlen + nxlen;
+                       xaddr = nxaddr = addressXAD(lxad);
+                       goto coalesceRight;
+               }
+       }
+
+       /*
+        * replace XAD with nXAD
+        */
+      replace:                 /* (nxoff == xoff) */
+       if (nxlen == xlen) {
+               /* replace XAD with nXAD:recorded */
+               *xad = *nxad;
+               xad->flag = xflag & ~XAD_NOTRECORDED;
+
+               goto coalesceRight;
+       } else                  /* (nxlen < xlen) */
+               goto updateLeft;
+
+       /*
+        * coalesce with right XAD
+        */
+      coalesceRight:           /* (xoff <= nxoff) */
+       /* is XAD last entry of page ? */
+       if (newindex == nextindex) {
+               if (xoff == nxoff)
+                       goto out;
+               goto updateRight;
+       }
+
+       /* is nXAD logically and physically contiguous with rXAD ? */
+       rxad = &p->xad[index + 1];
+       rxlen = lengthXAD(rxad);
+       if (!(rxad->flag & XAD_NOTRECORDED) &&
+           (nxoff + nxlen == offsetXAD(rxad)) &&
+           (nxaddr + nxlen == addressXAD(rxad)) &&
+           (rxlen + nxlen < MAXXLEN)) {
+               /* extend left rXAD */
+               XADoffset(rxad, nxoff);
+               XADlength(rxad, rxlen + nxlen);
+               XADaddress(rxad, nxaddr);
+
+               /* If we just merged two extents together, need to make sure
+                * the left extent gets logged.  If the right one is marked
+                * XAD_NEW, then we know it will be logged.  Otherwise, mark as
+                * XAD_EXTENDED
+                */
+               if (!(rxad->flag & XAD_NEW))
+                       rxad->flag |= XAD_EXTENDED;
+
+               if (xlen > nxlen)
+                       /* truncate XAD */
+                       XADlength(xad, xlen - nxlen);
+               else {          /* (xlen == nxlen) */
+
+                       /* remove XAD */
+                       memmove(&p->xad[index], &p->xad[index + 1],
+                               (nextindex - index - 1) << L2XTSLOTSIZE);
+
+                       p->header.nextindex =
+                           cpu_to_le16(le16_to_cpu(p->header.nextindex) -
+                                       1);
+               }
+
+               goto out;
+       } else if (xoff == nxoff)
+               goto out;
+
+       if (xoff >= nxoff) {
+               XT_PUTPAGE(mp);
+               jfs_error(ip->i_sb, "xoff >= nxoff\n");
+               return -EIO;
+       }
+/* #endif _JFS_WIP_COALESCE */
+
+       /*
+        * split XAD into (lXAD, nXAD):
+        *
+        *          |---nXAD--->
+        * --|----------XAD----------|--
+        *   |-lXAD-|
+        */
+      updateRight:             /* (xoff < nxoff) */
+       /* truncate old XAD as lXAD:not_recorded */
+       xad = &p->xad[index];
+       XADlength(xad, nxoff - xoff);
+
+       /* insert nXAD:recorded */
+       if (nextindex == le16_to_cpu(p->header.maxentry)) {
+
+               /* xtSpliUp() unpins leaf pages */
+               split.mp = mp;
+               split.index = newindex;
+               split.flag = xflag & ~XAD_NOTRECORDED;
+               split.off = nxoff;
+               split.len = nxlen;
+               split.addr = nxaddr;
+               split.pxdlist = NULL;
+               if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
+                       return rc;
+
+               /* get back old page */
+               XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+               if (rc)
+                       return rc;
+               /*
+                * if leaf root has been split, original root has been
+                * copied to new child page, i.e., original entry now
+                * resides on the new child page;
+                */
+               if (p->header.flag & BT_INTERNAL) {
+                       ASSERT(p->header.nextindex ==
+                              cpu_to_le16(XTENTRYSTART + 1));
+                       xad = &p->xad[XTENTRYSTART];
+                       bn = addressXAD(xad);
+                       XT_PUTPAGE(mp);
+
+                       /* get new child page */
+                       XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                       if (rc)
+                               return rc;
+
+                       BT_MARK_DIRTY(mp, ip);
+                       if (!test_cflag(COMMIT_Nolink, ip)) {
+                               tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
+                               xtlck = (struct xtlock *) & tlck->lock;
+                       }
+               } else {
+                       /* is nXAD on new page ? */
+                       if (newindex >
+                           (le16_to_cpu(p->header.maxentry) >> 1)) {
+                               newindex =
+                                   newindex -
+                                   le16_to_cpu(p->header.nextindex) +
+                                   XTENTRYSTART;
+                               newpage = 1;
+                       }
+               }
+       } else {
+               /* if insert into middle, shift right remaining entries */
+               if (newindex < nextindex)
+                       memmove(&p->xad[newindex + 1], &p->xad[newindex],
+                               (nextindex - newindex) << L2XTSLOTSIZE);
+
+               /* insert the entry */
+               xad = &p->xad[newindex];
+               *xad = *nxad;
+               xad->flag = xflag & ~XAD_NOTRECORDED;
+
+               /* advance next available entry index. */
+               p->header.nextindex =
+                   cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
+       }
+
+       /*
+        * does nXAD force 3-way split ?
+        *
+        *          |---nXAD--->|
+        * --|----------XAD-------------|--
+        *   |-lXAD-|           |-rXAD -|
+        */
+       if (nxoff + nxlen == xoff + xlen)
+               goto out;
+
+       /* reorient nXAD as XAD for further split XAD into (nXAD, rXAD) */
+       if (newpage) {
+               /* close out old page */
+               if (!test_cflag(COMMIT_Nolink, ip)) {
+                       xtlck->lwm.offset = (xtlck->lwm.offset) ?
+                           min(index0, (int)xtlck->lwm.offset) : index0;
+                       xtlck->lwm.length =
+                           le16_to_cpu(p->header.nextindex) -
+                           xtlck->lwm.offset;
+               }
+
+               bn = le64_to_cpu(p->header.next);
+               XT_PUTPAGE(mp);
+
+               /* get new right page */
+               XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+               if (rc)
+                       return rc;
+
+               BT_MARK_DIRTY(mp, ip);
+               if (!test_cflag(COMMIT_Nolink, ip)) {
+                       tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
+                       xtlck = (struct xtlock *) & tlck->lock;
+               }
+
+               index0 = index = newindex;
+       } else
+               index++;
+
+       newindex = index + 1;
+       nextindex = le16_to_cpu(p->header.nextindex);
+       xlen = xlen - (nxoff - xoff);
+       xoff = nxoff;
+       xaddr = nxaddr;
+
+       /* recompute split pages */
+       if (nextindex == le16_to_cpu(p->header.maxentry)) {
+               XT_PUTPAGE(mp);
+
+               if ((rc = xtSearch(ip, nxoff, NULL, &cmp, &btstack, XT_INSERT)))
+                       return rc;
+
+               /* retrieve search result */
+               XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0);
+
+               if (cmp != 0) {
+                       XT_PUTPAGE(mp);
+                       jfs_error(ip->i_sb, "xtSearch failed\n");
+                       return -EIO;
+               }
+
+               if (index0 != index) {
+                       XT_PUTPAGE(mp);
+                       jfs_error(ip->i_sb, "unexpected value of index\n");
+                       return -EIO;
+               }
+       }
+
+       /*
+        * split XAD into (nXAD, rXAD)
+        *
+        *          ---nXAD---|
+        * --|----------XAD----------|--
+        *                    |-rXAD-|
+        */
+      updateLeft:              /* (nxoff == xoff) && (nxlen < xlen) */
+       /* update old XAD with nXAD:recorded */
+       xad = &p->xad[index];
+       *xad = *nxad;
+       xad->flag = xflag & ~XAD_NOTRECORDED;
+
+       /* insert rXAD:not_recorded */
+       xoff = xoff + nxlen;
+       xlen = xlen - nxlen;
+       xaddr = xaddr + nxlen;
+       if (nextindex == le16_to_cpu(p->header.maxentry)) {
+/*
+printf("xtUpdate.updateLeft.split p:0x%p\n", p);
+*/
+               /* xtSpliUp() unpins leaf pages */
+               split.mp = mp;
+               split.index = newindex;
+               split.flag = xflag;
+               split.off = xoff;
+               split.len = xlen;
+               split.addr = xaddr;
+               split.pxdlist = NULL;
+               if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
+                       return rc;
+
+               /* get back old page */
+               XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+               if (rc)
+                       return rc;
+
+               /*
+                * if leaf root has been split, original root has been
+                * copied to new child page, i.e., original entry now
+                * resides on the new child page;
+                */
+               if (p->header.flag & BT_INTERNAL) {
+                       ASSERT(p->header.nextindex ==
+                              cpu_to_le16(XTENTRYSTART + 1));
+                       xad = &p->xad[XTENTRYSTART];
+                       bn = addressXAD(xad);
+                       XT_PUTPAGE(mp);
+
+                       /* get new child page */
+                       XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+                       if (rc)
+                               return rc;
+
+                       BT_MARK_DIRTY(mp, ip);
+                       if (!test_cflag(COMMIT_Nolink, ip)) {
+                               tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
+                               xtlck = (struct xtlock *) & tlck->lock;
+                       }
+               }
+       } else {
+               /* if insert into middle, shift right remaining entries */
+               if (newindex < nextindex)
+                       memmove(&p->xad[newindex + 1], &p->xad[newindex],
+                               (nextindex - newindex) << L2XTSLOTSIZE);
+
+               /* insert the entry */
+               xad = &p->xad[newindex];
+               XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
+
+               /* advance next available entry index. */
+               p->header.nextindex =
+                   cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
+       }
+
+      out:
+       if (!test_cflag(COMMIT_Nolink, ip)) {
+               xtlck->lwm.offset = (xtlck->lwm.offset) ?
+                   min(index0, (int)xtlck->lwm.offset) : index0;
+               xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
+                   xtlck->lwm.offset;
+       }
+
+       /* unpin the leaf page */
+       XT_PUTPAGE(mp);
+
+       return rc;
+}
+
+
+/*
+ *     xtAppend()
+ *
+ * function: grow in append mode from contiguous region specified ;
+ *
+ * parameter:
+ *     tid             - transaction id;
+ *     ip              - file object;
+ *     xflag           - extent flag:
+ *     xoff            - extent offset;
+ *     maxblocks       - max extent length;
+ *     xlen            - extent length (in/out);
+ *     xaddrp          - extent address pointer (in/out):
+ *     flag            -
+ *
+ * return:
+ */
+int xtAppend(tid_t tid,                /* transaction id */
+            struct inode *ip, int xflag, s64 xoff, s32 maxblocks,
+            s32 * xlenp,       /* (in/out) */
+            s64 * xaddrp,      /* (in/out) */
+            int flag)
+{
+       int rc = 0;
+       struct metapage *mp;    /* meta-page buffer */
+       xtpage_t *p;            /* base B+-tree index page */
+       s64 bn, xaddr;
+       int index, nextindex;
+       struct btstack btstack; /* traverse stack */
+       struct xtsplit split;   /* split information */
+       xad_t *xad;
+       int cmp;
+       struct tlock *tlck;
+       struct xtlock *xtlck;
+       int nsplit, nblocks, xlen;
+       struct pxdlist pxdlist;
+       pxd_t *pxd;
+       s64 next;
+
+       xaddr = *xaddrp;
+       xlen = *xlenp;
+       jfs_info("xtAppend: xoff:0x%lx maxblocks:%d xlen:%d xaddr:0x%lx",
+                (ulong) xoff, maxblocks, xlen, (ulong) xaddr);
+
+       /*
+        *      search for the entry location at which to insert:
+        *
+        * xtFastSearch() and xtSearch() both returns (leaf page
+        * pinned, index at which to insert).
+        * n.b. xtSearch() may return index of maxentry of
+        * the full page.
+        */
+       if ((rc = xtSearch(ip, xoff, &next, &cmp, &btstack, XT_INSERT)))
+               return rc;
+
+       /* retrieve search result */
+       XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+
+       if (cmp == 0) {
+               rc = -EEXIST;
+               goto out;
+       }
+
+       if (next)
+               xlen = min(xlen, (int)(next - xoff));
+//insert:
+       /*
+        *      insert entry for new extent
+        */
+       xflag |= XAD_NEW;
+
+       /*
+        *      if the leaf page is full, split the page and
+        *      propagate up the router entry for the new page from split
+        *
+        * The xtSplitUp() will insert the entry and unpin the leaf page.
+        */
+       nextindex = le16_to_cpu(p->header.nextindex);
+       if (nextindex < le16_to_cpu(p->header.maxentry))
+               goto insertLeaf;
+
+       /*
+        * allocate new index blocks to cover index page split(s)
+        */
+       nsplit = btstack.nsplit;
+       split.pxdlist = &pxdlist;
+       pxdlist.maxnpxd = pxdlist.npxd = 0;
+       pxd = &pxdlist.pxd[0];
+       nblocks = JFS_SBI(ip->i_sb)->nbperpage;
+       for (; nsplit > 0; nsplit--, pxd++, xaddr += nblocks, maxblocks -= nblocks) {
+               if ((rc = dbAllocBottomUp(ip, xaddr, (s64) nblocks)) == 0) {
+                       PXDaddress(pxd, xaddr);
+                       PXDlength(pxd, nblocks);
+
+                       pxdlist.maxnpxd++;
+
+                       continue;
+               }
+
+               /* undo allocation */
+
+               goto out;
+       }
+
+       xlen = min(xlen, maxblocks);
+
+       /*
+        * allocate data extent requested
+        */
+       if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen)))
+               goto out;
+
+       split.mp = mp;
+       split.index = index;
+       split.flag = xflag;
+       split.off = xoff;
+       split.len = xlen;
+       split.addr = xaddr;
+       if ((rc = xtSplitUp(tid, ip, &split, &btstack))) {
+               /* undo data extent allocation */
+               dbFree(ip, *xaddrp, (s64) * xlenp);
+
+               return rc;
+       }
+
+       *xaddrp = xaddr;
+       *xlenp = xlen;
+       return 0;
+
+       /*
+        *      insert the new entry into the leaf page
+        */
+      insertLeaf:
+       /*
+        * allocate data extent requested
+        */
+       if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen)))
+               goto out;
+
+       BT_MARK_DIRTY(mp, ip);
+       /*
+        * acquire a transaction lock on the leaf page;
+        *
+        * action: xad insertion/extension;
+        */
+       tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
+       xtlck = (struct xtlock *) & tlck->lock;
+
+       /* insert the new entry: mark the entry NEW */
+       xad = &p->xad[index];
+       XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
+
+       /* advance next available entry index */
+       le16_add_cpu(&p->header.nextindex, 1);
+
+       xtlck->lwm.offset =
+           (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index;
+       xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
+           xtlck->lwm.offset;
+
+       *xaddrp = xaddr;
+       *xlenp = xlen;
+
+      out:
+       /* unpin the leaf page */
+       XT_PUTPAGE(mp);
+
+       return rc;
+}
+#ifdef _STILL_TO_PORT
+
+/* - TBD for defragmentaion/reorganization -
+ *
+ *     xtDelete()
+ *
+ * function:
+ *     delete the entry with the specified key.
+ *
+ *     N.B.: whole extent of the entry is assumed to be deleted.
+ *
+ * parameter:
+ *
+ * return:
+ *     ENOENT: if the entry is not found.
+ *
+ * exception:
+ */
+int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag)
+{
+       int rc = 0;
+       struct btstack btstack;
+       int cmp;
+       s64 bn;
+       struct metapage *mp;
+       xtpage_t *p;
+       int index, nextindex;
+       struct tlock *tlck;
+       struct xtlock *xtlck;
+
+       /*
+        * find the matching entry; xtSearch() pins the page
+        */
+       if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0)))
+               return rc;
+
+       XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+       if (cmp) {
+               /* unpin the leaf page */
+               XT_PUTPAGE(mp);
+               return -ENOENT;
+       }
+
+       /*
+        * delete the entry from the leaf page
+        */
+       nextindex = le16_to_cpu(p->header.nextindex);
+       le16_add_cpu(&p->header.nextindex, -1);
+
+       /*
+        * if the leaf page bocome empty, free the page
+        */
+       if (p->header.nextindex == cpu_to_le16(XTENTRYSTART))
+               return (xtDeleteUp(tid, ip, mp, p, &btstack));
+
+       BT_MARK_DIRTY(mp, ip);
+       /*
+        * acquire a transaction lock on the leaf page;
+        *
+        * action:xad deletion;
+        */
+       tlck = txLock(tid, ip, mp, tlckXTREE);
+       xtlck = (struct xtlock *) & tlck->lock;
+       xtlck->lwm.offset =
+           (xtlck->lwm.offset) ? min(index, xtlck->lwm.offset) : index;
+
+       /* if delete from middle, shift left/compact the remaining entries */
+       if (index < nextindex - 1)
+               memmove(&p->xad[index], &p->xad[index + 1],
+                       (nextindex - index - 1) * sizeof(xad_t));
+
+       XT_PUTPAGE(mp);
+
+       return 0;
+}
+
+
+/* - TBD for defragmentaion/reorganization -
+ *
+ *     xtDeleteUp()
+ *
+ * function:
+ *     free empty pages as propagating deletion up the tree
+ *
+ * parameter:
+ *
+ * return:
+ */
+static int
+xtDeleteUp(tid_t tid, struct inode *ip,
+          struct metapage * fmp, xtpage_t * fp, struct btstack * btstack)
+{
+       int rc = 0;
+       struct metapage *mp;
+       xtpage_t *p;
+       int index, nextindex;
+       s64 xaddr;
+       int xlen;
+       struct btframe *parent;
+       struct tlock *tlck;
+       struct xtlock *xtlck;
+
+       /*
+        * keep root leaf page which has become empty
+        */
+       if (fp->header.flag & BT_ROOT) {
+               /* keep the root page */
+               fp->header.flag &= ~BT_INTERNAL;
+               fp->header.flag |= BT_LEAF;
+               fp->header.nextindex = cpu_to_le16(XTENTRYSTART);
+
+               /* XT_PUTPAGE(fmp); */
+
+               return 0;
+       }
+
+       /*
+        * free non-root leaf page
+        */
+       if ((rc = xtRelink(tid, ip, fp))) {
+               XT_PUTPAGE(fmp);
+               return rc;
+       }
+
+       xaddr = addressPXD(&fp->header.self);
+       xlen = lengthPXD(&fp->header.self);
+       /* free the page extent */
+       dbFree(ip, xaddr, (s64) xlen);
+
+       /* free the buffer page */
+       discard_metapage(fmp);
+
+       /*
+        * propagate page deletion up the index tree
+        *
+        * If the delete from the parent page makes it empty,
+        * continue all the way up the tree.
+        * stop if the root page is reached (which is never deleted) or
+        * if the entry deletion does not empty the page.
+        */
+       while ((parent = BT_POP(btstack)) != NULL) {
+               /* get/pin the parent page <sp> */
+               XT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc);
+               if (rc)
+                       return rc;
+
+               index = parent->index;
+
+               /* delete the entry for the freed child page from parent.
+                */
+               nextindex = le16_to_cpu(p->header.nextindex);
+
+               /*
+                * the parent has the single entry being deleted:
+                * free the parent page which has become empty.
+                */
+               if (nextindex == 1) {
+                       if (p->header.flag & BT_ROOT) {
+                               /* keep the root page */
+                               p->header.flag &= ~BT_INTERNAL;
+                               p->header.flag |= BT_LEAF;
+                               p->header.nextindex =
+                                   cpu_to_le16(XTENTRYSTART);
+
+                               /* XT_PUTPAGE(mp); */
+
+                               break;
+                       } else {
+                               /* free the parent page */
+                               if ((rc = xtRelink(tid, ip, p)))
+                                       return rc;
+
+                               xaddr = addressPXD(&p->header.self);
+                               /* free the page extent */
+                               dbFree(ip, xaddr,
+                                      (s64) JFS_SBI(ip->i_sb)->nbperpage);
+
+                               /* unpin/free the buffer page */
+                               discard_metapage(mp);
+
+                               /* propagate up */
+                               continue;
+                       }
+               }
+               /*
+                * the parent has other entries remaining:
+                * delete the router entry from the parent page.
+                */
+               else {
+                       BT_MARK_DIRTY(mp, ip);
+                       /*
+                        * acquire a transaction lock on the leaf page;
+                        *
+                        * action:xad deletion;
+                        */
+                       tlck = txLock(tid, ip, mp, tlckXTREE);
+                       xtlck = (struct xtlock *) & tlck->lock;
+                       xtlck->lwm.offset =
+                           (xtlck->lwm.offset) ? min(index,
+                                                     xtlck->lwm.
+                                                     offset) : index;
+
+                       /* if delete from middle,
+                        * shift left/compact the remaining entries in the page
+                        */
+                       if (index < nextindex - 1)
+                               memmove(&p->xad[index], &p->xad[index + 1],
+                                       (nextindex - index -
+                                        1) << L2XTSLOTSIZE);
+
+                       le16_add_cpu(&p->header.nextindex, -1);
+                       jfs_info("xtDeleteUp(entry): 0x%lx[%d]",
+                                (ulong) parent->bn, index);
+               }
+
+               /* unpin the parent page */
+               XT_PUTPAGE(mp);
+
+               /* exit propagation up */
+               break;
+       }
+
+       return 0;
+}
+
+
+/*
+ * NAME:       xtRelocate()
+ *
+ * FUNCTION:   relocate xtpage or data extent of regular file;
+ *             This function is mainly used by defragfs utility.
+ *
+ * NOTE:       This routine does not have the logic to handle
+ *             uncommitted allocated extent. The caller should call
+ *             txCommit() to commit all the allocation before call
+ *             this routine.
+ */
+int
+xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
+          s64 nxaddr,          /* new xaddr */
+          int xtype)
+{                              /* extent type: XTPAGE or DATAEXT */
+       int rc = 0;
+       struct tblock *tblk;
+       struct tlock *tlck;
+       struct xtlock *xtlck;
+       struct metapage *mp, *pmp, *lmp, *rmp;  /* meta-page buffer */
+       xtpage_t *p, *pp, *rp, *lp;     /* base B+-tree index page */
+       xad_t *xad;
+       pxd_t *pxd;
+       s64 xoff, xsize;
+       int xlen;
+       s64 oxaddr, sxaddr, dxaddr, nextbn, prevbn;
+       cbuf_t *cp;
+       s64 offset, nbytes, nbrd, pno;
+       int nb, npages, nblks;
+       s64 bn;
+       int cmp;
+       int index;
+       struct pxd_lock *pxdlock;
+       struct btstack btstack; /* traverse stack */
+
+       xtype = xtype & EXTENT_TYPE;
+
+       xoff = offsetXAD(oxad);
+       oxaddr = addressXAD(oxad);
+       xlen = lengthXAD(oxad);
+
+       /* validate extent offset */
+       offset = xoff << JFS_SBI(ip->i_sb)->l2bsize;
+       if (offset >= ip->i_size)
+               return -ESTALE; /* stale extent */
+
+       jfs_info("xtRelocate: xtype:%d xoff:0x%lx xlen:0x%x xaddr:0x%lx:0x%lx",
+                xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr);
+
+       /*
+        *      1. get and validate the parent xtpage/xad entry
+        *      covering the source extent to be relocated;
+        */
+       if (xtype == DATAEXT) {
+               /* search in leaf entry */
+               rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0);
+               if (rc)
+                       return rc;
+
+               /* retrieve search result */
+               XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
+
+               if (cmp) {
+                       XT_PUTPAGE(pmp);
+                       return -ESTALE;
+               }
+
+               /* validate for exact match with a single entry */
+               xad = &pp->xad[index];
+               if (addressXAD(xad) != oxaddr || lengthXAD(xad) != xlen) {
+                       XT_PUTPAGE(pmp);
+                       return -ESTALE;
+               }
+       } else {                /* (xtype == XTPAGE) */
+
+               /* search in internal entry */
+               rc = xtSearchNode(ip, oxad, &cmp, &btstack, 0);
+               if (rc)
+                       return rc;
+
+               /* retrieve search result */
+               XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
+
+               if (cmp) {
+                       XT_PUTPAGE(pmp);
+                       return -ESTALE;
+               }
+
+               /* xtSearchNode() validated for exact match with a single entry
+                */
+               xad = &pp->xad[index];
+       }
+       jfs_info("xtRelocate: parent xad entry validated.");
+
+       /*
+        *      2. relocate the extent
+        */
+       if (xtype == DATAEXT) {
+               /* if the extent is allocated-but-not-recorded
+                * there is no real data to be moved in this extent,
+                */
+               if (xad->flag & XAD_NOTRECORDED)
+                       goto out;
+               else
+                       /* release xtpage for cmRead()/xtLookup() */
+                       XT_PUTPAGE(pmp);
+
+               /*
+                *      cmRelocate()
+                *
+                * copy target data pages to be relocated;
+                *
+                * data extent must start at page boundary and
+                * multiple of page size (except the last data extent);
+                * read in each page of the source data extent into cbuf,
+                * update the cbuf extent descriptor of the page to be
+                * homeward bound to new dst data extent
+                * copy the data from the old extent to new extent.
+                * copy is essential for compressed files to avoid problems
+                * that can arise if there was a change in compression
+                * algorithms.
+                * it is a good strategy because it may disrupt cache
+                * policy to keep the pages in memory afterwards.
+                */
+               offset = xoff << JFS_SBI(ip->i_sb)->l2bsize;
+               assert((offset & CM_OFFSET) == 0);
+               nbytes = xlen << JFS_SBI(ip->i_sb)->l2bsize;
+               pno = offset >> CM_L2BSIZE;
+               npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE;
+/*
+               npages = ((offset + nbytes - 1) >> CM_L2BSIZE) -
+                         (offset >> CM_L2BSIZE) + 1;
+*/
+               sxaddr = oxaddr;
+               dxaddr = nxaddr;
+
+               /* process the request one cache buffer at a time */
+               for (nbrd = 0; nbrd < nbytes; nbrd += nb,
+                    offset += nb, pno++, npages--) {
+                       /* compute page size */
+                       nb = min(nbytes - nbrd, CM_BSIZE);
+
+                       /* get the cache buffer of the page */
+                       if (rc = cmRead(ip, offset, npages, &cp))
+                               break;
+
+                       assert(addressPXD(&cp->cm_pxd) == sxaddr);
+                       assert(!cp->cm_modified);
+
+                       /* bind buffer with the new extent address */
+                       nblks = nb >> JFS_IP(ip->i_sb)->l2bsize;
+                       cmSetXD(ip, cp, pno, dxaddr, nblks);
+
+                       /* release the cbuf, mark it as modified */
+                       cmPut(cp, true);
+
+                       dxaddr += nblks;
+                       sxaddr += nblks;
+               }
+
+               /* get back parent page */
+               if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0)))
+                       return rc;
+
+               XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
+               jfs_info("xtRelocate: target data extent relocated.");
+       } else {                /* (xtype == XTPAGE) */
+
+               /*
+                * read in the target xtpage from the source extent;
+                */
+               XT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
+               if (rc) {
+                       XT_PUTPAGE(pmp);
+                       return rc;
+               }
+
+               /*
+                * read in sibling pages if any to update sibling pointers;
+                */
+               rmp = NULL;
+               if (p->header.next) {
+                       nextbn = le64_to_cpu(p->header.next);
+                       XT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc);
+                       if (rc) {
+                               XT_PUTPAGE(pmp);
+                               XT_PUTPAGE(mp);
+                               return (rc);
+                       }
+               }
+
+               lmp = NULL;
+               if (p->header.prev) {
+                       prevbn = le64_to_cpu(p->header.prev);
+                       XT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc);
+                       if (rc) {
+                               XT_PUTPAGE(pmp);
+                               XT_PUTPAGE(mp);
+                               if (rmp)
+                                       XT_PUTPAGE(rmp);
+                               return (rc);
+                       }
+               }
+
+               /* at this point, all xtpages to be updated are in memory */
+
+               /*
+                * update sibling pointers of sibling xtpages if any;
+                */
+               if (lmp) {
+                       BT_MARK_DIRTY(lmp, ip);
+                       tlck = txLock(tid, ip, lmp, tlckXTREE | tlckRELINK);
+                       lp->header.next = cpu_to_le64(nxaddr);
+                       XT_PUTPAGE(lmp);
+               }
+
+               if (rmp) {
+                       BT_MARK_DIRTY(rmp, ip);
+                       tlck = txLock(tid, ip, rmp, tlckXTREE | tlckRELINK);
+                       rp->header.prev = cpu_to_le64(nxaddr);
+                       XT_PUTPAGE(rmp);
+               }
+
+               /*
+                * update the target xtpage to be relocated
+                *
+                * update the self address of the target page
+                * and write to destination extent;
+                * redo image covers the whole xtpage since it is new page
+                * to the destination extent;
+                * update of bmap for the free of source extent
+                * of the target xtpage itself:
+                * update of bmap for the allocation of destination extent
+                * of the target xtpage itself:
+                * update of bmap for the extents covered by xad entries in
+                * the target xtpage is not necessary since they are not
+                * updated;
+                * if not committed before this relocation,
+                * target page may contain XAD_NEW entries which must
+                * be scanned for bmap update (logredo() always
+                * scan xtpage REDOPAGE image for bmap update);
+                * if committed before this relocation (tlckRELOCATE),
+                * scan may be skipped by commit() and logredo();
+                */
+               BT_MARK_DIRTY(mp, ip);
+               /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */
+               tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW);
+               xtlck = (struct xtlock *) & tlck->lock;
+
+               /* update the self address in the xtpage header */
+               pxd = &p->header.self;
+               PXDaddress(pxd, nxaddr);
+
+               /* linelock for the after image of the whole page */
+               xtlck->lwm.length =
+                   le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
+
+               /* update the buffer extent descriptor of target xtpage */
+               xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
+               bmSetXD(mp, nxaddr, xsize);
+
+               /* unpin the target page to new homeward bound */
+               XT_PUTPAGE(mp);
+               jfs_info("xtRelocate: target xtpage relocated.");
+       }
+
+       /*
+        *      3. acquire maplock for the source extent to be freed;
+        *
+        * acquire a maplock saving the src relocated extent address;
+        * to free of the extent at commit time;
+        */
+      out:
+       /* if DATAEXT relocation, write a LOG_UPDATEMAP record for
+        * free PXD of the source data extent (logredo() will update
+        * bmap for free of source data extent), and update bmap for
+        * free of the source data extent;
+        */
+       if (xtype == DATAEXT)
+               tlck = txMaplock(tid, ip, tlckMAP);
+       /* if XTPAGE relocation, write a LOG_NOREDOPAGE record
+        * for the source xtpage (logredo() will init NoRedoPage
+        * filter and will also update bmap for free of the source
+        * xtpage), and update bmap for free of the source xtpage;
+        * N.B. We use tlckMAP instead of tlkcXTREE because there
+        *      is no buffer associated with this lock since the buffer
+        *      has been redirected to the target location.
+        */
+       else                    /* (xtype == XTPAGE) */
+               tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE);
+
+       pxdlock = (struct pxd_lock *) & tlck->lock;
+       pxdlock->flag = mlckFREEPXD;
+       PXDaddress(&pxdlock->pxd, oxaddr);
+       PXDlength(&pxdlock->pxd, xlen);
+       pxdlock->index = 1;
+
+       /*
+        *      4. update the parent xad entry for relocation;
+        *
+        * acquire tlck for the parent entry with XAD_NEW as entry
+        * update which will write LOG_REDOPAGE and update bmap for
+        * allocation of XAD_NEW destination extent;
+        */
+       jfs_info("xtRelocate: update parent xad entry.");
+       BT_MARK_DIRTY(pmp, ip);
+       tlck = txLock(tid, ip, pmp, tlckXTREE | tlckGROW);
+       xtlck = (struct xtlock *) & tlck->lock;
+
+       /* update the XAD with the new destination extent; */
+       xad = &pp->xad[index];
+       xad->flag |= XAD_NEW;
+       XADaddress(xad, nxaddr);
+
+       xtlck->lwm.offset = min(index, xtlck->lwm.offset);
+       xtlck->lwm.length = le16_to_cpu(pp->header.nextindex) -
+           xtlck->lwm.offset;
+
+       /* unpin the parent xtpage */
+       XT_PUTPAGE(pmp);
+
+       return rc;
+}
+
+
+/*
+ *     xtSearchNode()
+ *
+ * function:   search for the internal xad entry covering specified extent.
+ *             This function is mainly used by defragfs utility.
+ *
+ * parameters:
+ *     ip      - file object;
+ *     xad     - extent to find;
+ *     cmpp    - comparison result:
+ *     btstack - traverse stack;
+ *     flag    - search process flag;
+ *
+ * returns:
+ *     btstack contains (bn, index) of search path traversed to the entry.
+ *     *cmpp is set to result of comparison with the entry returned.
+ *     the page containing the entry is pinned at exit.
+ */
+static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */
+                       int *cmpp, struct btstack * btstack, int flag)
+{
+       int rc = 0;
+       s64 xoff, xaddr;
+       int xlen;
+       int cmp = 1;            /* init for empty page */
+       s64 bn;                 /* block number */
+       struct metapage *mp;    /* meta-page buffer */
+       xtpage_t *p;            /* page */
+       int base, index, lim;
+       struct btframe *btsp;
+       s64 t64;
+
+       BT_CLR(btstack);
+
+       xoff = offsetXAD(xad);
+       xlen = lengthXAD(xad);
+       xaddr = addressXAD(xad);
+
+       /*
+        *      search down tree from root:
+        *
+        * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
+        * internal page, child page Pi contains entry with k, Ki <= K < Kj.
+        *
+        * if entry with search key K is not found
+        * internal page search find the entry with largest key Ki
+        * less than K which point to the child page to search;
+        * leaf page search find the entry with smallest key Kj
+        * greater than K so that the returned index is the position of
+        * the entry to be shifted right for insertion of new entry.
+        * for empty tree, search key is greater than any key of the tree.
+        *
+        * by convention, root bn = 0.
+        */
+       for (bn = 0;;) {
+               /* get/pin the page to search */
+               XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+               if (rc)
+                       return rc;
+               if (p->header.flag & BT_LEAF) {
+                       XT_PUTPAGE(mp);
+                       return -ESTALE;
+               }
+
+               lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART;
+
+               /*
+                * binary search with search key K on the current page
+                */
+               for (base = XTENTRYSTART; lim; lim >>= 1) {
+                       index = base + (lim >> 1);
+
+                       XT_CMP(cmp, xoff, &p->xad[index], t64);
+                       if (cmp == 0) {
+                               /*
+                                *      search hit
+                                *
+                                * verify for exact match;
+                                */
+                               if (xaddr == addressXAD(&p->xad[index]) &&
+                                   xoff == offsetXAD(&p->xad[index])) {
+                                       *cmpp = cmp;
+
+                                       /* save search result */
+                                       btsp = btstack->top;
+                                       btsp->bn = bn;
+                                       btsp->index = index;
+                                       btsp->mp = mp;
+
+                                       return 0;
+                               }
+
+                               /* descend/search its child page */
+                               goto next;
+                       }
+
+                       if (cmp > 0) {
+                               base = index + 1;
+                               --lim;
+                       }
+               }
+
+               /*
+                *      search miss - non-leaf page:
+                *
+                * base is the smallest index with key (Kj) greater than
+                * search key (K) and may be zero or maxentry index.
+                * if base is non-zero, decrement base by one to get the parent
+                * entry of the child page to search.
+                */
+               index = base ? base - 1 : base;
+
+               /*
+                * go down to child page
+                */
+             next:
+               /* get the child page block number */
+               bn = addressXAD(&p->xad[index]);
+
+               /* unpin the parent page */
+               XT_PUTPAGE(mp);
+       }
+}
+
+
+/*
+ *     xtRelink()
+ *
+ * function:
+ *     link around a freed page.
+ *
+ * Parameter:
+ *     int             tid,
+ *     struct inode    *ip,
+ *     xtpage_t        *p)
+ *
+ * returns:
+ */
+static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p)
+{
+       int rc = 0;
+       struct metapage *mp;
+       s64 nextbn, prevbn;
+       struct tlock *tlck;
+
+       nextbn = le64_to_cpu(p->header.next);
+       prevbn = le64_to_cpu(p->header.prev);
+
+       /* update prev pointer of the next page */
+       if (nextbn != 0) {
+               XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
+               if (rc)
+                       return rc;
+
+               /*
+                * acquire a transaction lock on the page;
+                *
+                * action: update prev pointer;
+                */
+               BT_MARK_DIRTY(mp, ip);
+               tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
+
+               /* the page may already have been tlock'd */
+
+               p->header.prev = cpu_to_le64(prevbn);
+
+               XT_PUTPAGE(mp);
+       }
+
+       /* update next pointer of the previous page */
+       if (prevbn != 0) {
+               XT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc);
+               if (rc)
+                       return rc;
+
+               /*
+                * acquire a transaction lock on the page;
+                *
+                * action: update next pointer;
+                */
+               BT_MARK_DIRTY(mp, ip);
+               tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
+
+               /* the page may already have been tlock'd */
+
+               p->header.next = le64_to_cpu(nextbn);
+
+               XT_PUTPAGE(mp);
+       }
+
+       return 0;
+}
+#endif                         /*  _STILL_TO_PORT */
+
+
+/*
+ *     xtInitRoot()
+ *
+ * initialize file root (inline in inode)
+ */
+void xtInitRoot(tid_t tid, struct inode *ip)
+{
+       xtpage_t *p;
+
+       /*
+        * acquire a transaction lock on the root
+        *
+        * action:
+        */
+       txLock(tid, ip, (struct metapage *) &JFS_IP(ip)->bxflag,
+                     tlckXTREE | tlckNEW);
+       p = &JFS_IP(ip)->i_xtroot;
+
+       p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF;
+       p->header.nextindex = cpu_to_le16(XTENTRYSTART);
+
+       if (S_ISDIR(ip->i_mode))
+               p->header.maxentry = cpu_to_le16(XTROOTINITSLOT_DIR);
+       else {
+               p->header.maxentry = cpu_to_le16(XTROOTINITSLOT);
+               ip->i_size = 0;
+       }
+
+
+       return;
+}
+
+
+/*
+ * We can run into a deadlock truncating a file with a large number of
+ * xtree pages (large fragmented file).  A robust fix would entail a
+ * reservation system where we would reserve a number of metadata pages
+ * and tlocks which we would be guaranteed without a deadlock.  Without
+ * this, a partial fix is to limit number of metadata pages we will lock
+ * in a single transaction.  Currently we will truncate the file so that
+ * no more than 50 leaf pages will be locked.  The caller of xtTruncate
+ * will be responsible for ensuring that the current transaction gets
+ * committed, and that subsequent transactions are created to truncate
+ * the file further if needed.
+ */
+#define MAX_TRUNCATE_LEAVES 50
+
+/*
+ *     xtTruncate()
+ *
+ * function:
+ *     traverse for truncation logging backward bottom up;
+ *     terminate at the last extent entry at the current subtree
+ *     root page covering new down size.
+ *     truncation may occur within the last extent entry.
+ *
+ * parameter:
+ *     int             tid,
+ *     struct inode    *ip,
+ *     s64             newsize,
+ *     int             type)   {PWMAP, PMAP, WMAP; DELETE, TRUNCATE}
+ *
+ * return:
+ *
+ * note:
+ *     PWMAP:
+ *      1. truncate (non-COMMIT_NOLINK file)
+ *         by jfs_truncate() or jfs_open(O_TRUNC):
+ *         xtree is updated;
+ *      2. truncate index table of directory when last entry removed
+ *     map update via tlock at commit time;
+ *     PMAP:
+ *      Call xtTruncate_pmap instead
+ *     WMAP:
+ *      1. remove (free zero link count) on last reference release
+ *         (pmap has been freed at commit zero link count);
+ *      2. truncate (COMMIT_NOLINK file, i.e., tmp file):
+ *         xtree is updated;
+ *      map update directly at truncation time;
+ *
+ *     if (DELETE)
+ *             no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient);
+ *     else if (TRUNCATE)
+ *             must write LOG_NOREDOPAGE for deleted index page;
+ *
+ * pages may already have been tlocked by anonymous transactions
+ * during file growth (i.e., write) before truncation;
+ *
+ * except last truncated entry, deleted entries remains as is
+ * in the page (nextindex is updated) for other use
+ * (e.g., log/update allocation map): this avoid copying the page
+ * info but delay free of pages;
+ *
+ */
+s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
+{
+       int rc = 0;
+       s64 teof;
+       struct metapage *mp;
+       xtpage_t *p;
+       s64 bn;
+       int index, nextindex;
+       xad_t *xad;
+       s64 xoff, xaddr;
+       int xlen, len, freexlen;
+       struct btstack btstack;
+       struct btframe *parent;
+       struct tblock *tblk = NULL;
+       struct tlock *tlck = NULL;
+       struct xtlock *xtlck = NULL;
+       struct xdlistlock xadlock;      /* maplock for COMMIT_WMAP */
+       struct pxd_lock *pxdlock;               /* maplock for COMMIT_WMAP */
+       s64 nfreed;
+       int freed, log;
+       int locked_leaves = 0;
+
+       /* save object truncation type */
+       if (tid) {
+               tblk = tid_to_tblock(tid);
+               tblk->xflag |= flag;
+       }
+
+       nfreed = 0;
+
+       flag &= COMMIT_MAP;
+       assert(flag != COMMIT_PMAP);
+
+       if (flag == COMMIT_PWMAP)
+               log = 1;
+       else {
+               log = 0;
+               xadlock.flag = mlckFREEXADLIST;
+               xadlock.index = 1;
+       }
+
+       /*
+        * if the newsize is not an integral number of pages,
+        * the file between newsize and next page boundary will
+        * be cleared.
+        * if truncating into a file hole, it will cause
+        * a full block to be allocated for the logical block.
+        */
+
+       /*
+        * release page blocks of truncated region <teof, eof>
+        *
+        * free the data blocks from the leaf index blocks.
+        * delete the parent index entries corresponding to
+        * the freed child data/index blocks.
+        * free the index blocks themselves which aren't needed
+        * in new sized file.
+        *
+        * index blocks are updated only if the blocks are to be
+        * retained in the new sized file.
+        * if type is PMAP, the data and index pages are NOT
+        * freed, and the data and index blocks are NOT freed
+        * from working map.
+        * (this will allow continued access of data/index of
+        * temporary file (zerolink count file truncated to zero-length)).
+        */
+       teof = (newsize + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
+           JFS_SBI(ip->i_sb)->l2bsize;
+
+       /* clear stack */
+       BT_CLR(&btstack);
+
+       /*
+        * start with root
+        *
+        * root resides in the inode
+        */
+       bn = 0;
+
+       /*
+        * first access of each page:
+        */
+      getPage:
+       XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+       if (rc)
+               return rc;
+
+       /* process entries backward from last index */
+       index = le16_to_cpu(p->header.nextindex) - 1;
+
+
+       /* Since this is the rightmost page at this level, and we may have
+        * already freed a page that was formerly to the right, let's make
+        * sure that the next pointer is zero.
+        */
+       if (p->header.next) {
+               if (log)
+                       /*
+                        * Make sure this change to the header is logged.
+                        * If we really truncate this leaf, the flag
+                        * will be changed to tlckTRUNCATE
+                        */
+                       tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
+               BT_MARK_DIRTY(mp, ip);
+               p->header.next = 0;
+       }
+
+       if (p->header.flag & BT_INTERNAL)
+               goto getChild;
+
+       /*
+        *      leaf page
+        */
+       freed = 0;
+
+       /* does region covered by leaf page precede Teof ? */
+       xad = &p->xad[index];
+       xoff = offsetXAD(xad);
+       xlen = lengthXAD(xad);
+       if (teof >= xoff + xlen) {
+               XT_PUTPAGE(mp);
+               goto getParent;
+       }
+
+       /* (re)acquire tlock of the leaf page */
+       if (log) {
+               if (++locked_leaves > MAX_TRUNCATE_LEAVES) {
+                       /*
+                        * We need to limit the size of the transaction
+                        * to avoid exhausting pagecache & tlocks
+                        */
+                       XT_PUTPAGE(mp);
+                       newsize = (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize;
+                       goto getParent;
+               }
+               tlck = txLock(tid, ip, mp, tlckXTREE);
+               tlck->type = tlckXTREE | tlckTRUNCATE;
+               xtlck = (struct xtlock *) & tlck->lock;
+               xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1;
+       }
+       BT_MARK_DIRTY(mp, ip);
+
+       /*
+        * scan backward leaf page entries
+        */
+       for (; index >= XTENTRYSTART; index--) {
+               xad = &p->xad[index];
+               xoff = offsetXAD(xad);
+               xlen = lengthXAD(xad);
+               xaddr = addressXAD(xad);
+
+               /*
+                * The "data" for a directory is indexed by the block
+                * device's address space.  This metadata must be invalidated
+                * here
+                */
+               if (S_ISDIR(ip->i_mode) && (teof == 0))
+                       invalidate_xad_metapages(ip, *xad);
+               /*
+                * entry beyond eof: continue scan of current page
+                *          xad
+                * ---|---=======------->
+                *   eof
+                */
+               if (teof < xoff) {
+                       nfreed += xlen;
+                       continue;
+               }
+
+               /*
+                * (xoff <= teof): last entry to be deleted from page;
+                * If other entries remain in page: keep and update the page.
+                */
+
+               /*
+                * eof == entry_start: delete the entry
+                *           xad
+                * -------|=======------->
+                *       eof
+                *
+                */
+               if (teof == xoff) {
+                       nfreed += xlen;
+
+                       if (index == XTENTRYSTART)
+                               break;
+
+                       nextindex = index;
+               }
+               /*
+                * eof within the entry: truncate the entry.
+                *          xad
+                * -------===|===------->
+                *          eof
+                */
+               else if (teof < xoff + xlen) {
+                       /* update truncated entry */
+                       len = teof - xoff;
+                       freexlen = xlen - len;
+                       XADlength(xad, len);
+
+                       /* save pxd of truncated extent in tlck */
+                       xaddr += len;
+                       if (log) {      /* COMMIT_PWMAP */
+                               xtlck->lwm.offset = (xtlck->lwm.offset) ?
+                                   min(index, (int)xtlck->lwm.offset) : index;
+                               xtlck->lwm.length = index + 1 -
+                                   xtlck->lwm.offset;
+                               xtlck->twm.offset = index;
+                               pxdlock = (struct pxd_lock *) & xtlck->pxdlock;
+                               pxdlock->flag = mlckFREEPXD;
+                               PXDaddress(&pxdlock->pxd, xaddr);
+                               PXDlength(&pxdlock->pxd, freexlen);
+                       }
+                       /* free truncated extent */
+                       else {  /* COMMIT_WMAP */
+
+                               pxdlock = (struct pxd_lock *) & xadlock;
+                               pxdlock->flag = mlckFREEPXD;
+                               PXDaddress(&pxdlock->pxd, xaddr);
+                               PXDlength(&pxdlock->pxd, freexlen);
+                               txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP);
+
+                               /* reset map lock */
+                               xadlock.flag = mlckFREEXADLIST;
+                       }
+
+                       /* current entry is new last entry; */
+                       nextindex = index + 1;
+
+                       nfreed += freexlen;
+               }
+               /*
+                * eof beyond the entry:
+                *          xad
+                * -------=======---|--->
+                *                 eof
+                */
+               else {          /* (xoff + xlen < teof) */
+
+                       nextindex = index + 1;
+               }
+
+               if (nextindex < le16_to_cpu(p->header.nextindex)) {
+                       if (!log) {     /* COMMIT_WAMP */
+                               xadlock.xdlist = &p->xad[nextindex];
+                               xadlock.count =
+                                   le16_to_cpu(p->header.nextindex) -
+                                   nextindex;
+                               txFreeMap(ip, (struct maplock *) & xadlock,
+                                         NULL, COMMIT_WMAP);
+                       }
+                       p->header.nextindex = cpu_to_le16(nextindex);
+               }
+
+               XT_PUTPAGE(mp);
+
+               /* assert(freed == 0); */
+               goto getParent;
+       }                       /* end scan of leaf page entries */
+
+       freed = 1;
+
+       /*
+        * leaf page become empty: free the page if type != PMAP
+        */
+       if (log) {              /* COMMIT_PWMAP */
+               /* txCommit() with tlckFREE:
+                * free data extents covered by leaf [XTENTRYSTART:hwm);
+                * invalidate leaf if COMMIT_PWMAP;
+                * if (TRUNCATE), will write LOG_NOREDOPAGE;
+                */
+               tlck->type = tlckXTREE | tlckFREE;
+       } else {                /* COMMIT_WAMP */
+
+               /* free data extents covered by leaf */
+               xadlock.xdlist = &p->xad[XTENTRYSTART];
+               xadlock.count =
+                   le16_to_cpu(p->header.nextindex) - XTENTRYSTART;
+               txFreeMap(ip, (struct maplock *) & xadlock, NULL, COMMIT_WMAP);
+       }
+
+       if (p->header.flag & BT_ROOT) {
+               p->header.flag &= ~BT_INTERNAL;
+               p->header.flag |= BT_LEAF;
+               p->header.nextindex = cpu_to_le16(XTENTRYSTART);
+
+               XT_PUTPAGE(mp); /* debug */
+               goto out;
+       } else {
+               if (log) {      /* COMMIT_PWMAP */
+                       /* page will be invalidated at tx completion
+                        */
+                       XT_PUTPAGE(mp);
+               } else {        /* COMMIT_WMAP */
+
+                       if (mp->lid)
+                               lid_to_tlock(mp->lid)->flag |= tlckFREELOCK;
+
+                       /* invalidate empty leaf page */
+                       discard_metapage(mp);
+               }
+       }
+
+       /*
+        * the leaf page become empty: delete the parent entry
+        * for the leaf page if the parent page is to be kept
+        * in the new sized file.
+        */
+
+       /*
+        * go back up to the parent page
+        */
+      getParent:
+       /* pop/restore parent entry for the current child page */
+       if ((parent = BT_POP(&btstack)) == NULL)
+               /* current page must have been root */
+               goto out;
+
+       /* get back the parent page */
+       bn = parent->bn;
+       XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+       if (rc)
+               return rc;
+
+       index = parent->index;
+
+       /*
+        * child page was not empty:
+        */
+       if (freed == 0) {
+               /* has any entry deleted from parent ? */
+               if (index < le16_to_cpu(p->header.nextindex) - 1) {
+                       /* (re)acquire tlock on the parent page */
+                       if (log) {      /* COMMIT_PWMAP */
+                               /* txCommit() with tlckTRUNCATE:
+                                * free child extents covered by parent [);
+                                */
+                               tlck = txLock(tid, ip, mp, tlckXTREE);
+                               xtlck = (struct xtlock *) & tlck->lock;
+                               if (!(tlck->type & tlckTRUNCATE)) {
+                                       xtlck->hwm.offset =
+                                           le16_to_cpu(p->header.
+                                                       nextindex) - 1;
+                                       tlck->type =
+                                           tlckXTREE | tlckTRUNCATE;
+                               }
+                       } else {        /* COMMIT_WMAP */
+
+                               /* free child extents covered by parent */
+                               xadlock.xdlist = &p->xad[index + 1];
+                               xadlock.count =
+                                   le16_to_cpu(p->header.nextindex) -
+                                   index - 1;
+                               txFreeMap(ip, (struct maplock *) & xadlock,
+                                         NULL, COMMIT_WMAP);
+                       }
+                       BT_MARK_DIRTY(mp, ip);
+
+                       p->header.nextindex = cpu_to_le16(index + 1);
+               }
+               XT_PUTPAGE(mp);
+               goto getParent;
+       }
+
+       /*
+        * child page was empty:
+        */
+       nfreed += lengthXAD(&p->xad[index]);
+
+       /*
+        * During working map update, child page's tlock must be handled
+        * before parent's.  This is because the parent's tlock will cause
+        * the child's disk space to be marked available in the wmap, so
+        * it's important that the child page be released by that time.
+        *
+        * ToDo:  tlocks should be on doubly-linked list, so we can
+        * quickly remove it and add it to the end.
+        */
+
+       /*
+        * Move parent page's tlock to the end of the tid's tlock list
+        */
+       if (log && mp->lid && (tblk->last != mp->lid) &&
+           lid_to_tlock(mp->lid)->tid) {
+               lid_t lid = mp->lid;
+               struct tlock *prev;
+
+               tlck = lid_to_tlock(lid);
+
+               if (tblk->next == lid)
+                       tblk->next = tlck->next;
+               else {
+                       for (prev = lid_to_tlock(tblk->next);
+                            prev->next != lid;
+                            prev = lid_to_tlock(prev->next)) {
+                               assert(prev->next);
+                       }
+                       prev->next = tlck->next;
+               }
+               lid_to_tlock(tblk->last)->next = lid;
+               tlck->next = 0;
+               tblk->last = lid;
+       }
+
+       /*
+        * parent page become empty: free the page
+        */
+       if (index == XTENTRYSTART) {
+               if (log) {      /* COMMIT_PWMAP */
+                       /* txCommit() with tlckFREE:
+                        * free child extents covered by parent;
+                        * invalidate parent if COMMIT_PWMAP;
+                        */
+                       tlck = txLock(tid, ip, mp, tlckXTREE);
+                       xtlck = (struct xtlock *) & tlck->lock;
+                       xtlck->hwm.offset =
+                           le16_to_cpu(p->header.nextindex) - 1;
+                       tlck->type = tlckXTREE | tlckFREE;
+               } else {        /* COMMIT_WMAP */
+
+                       /* free child extents covered by parent */
+                       xadlock.xdlist = &p->xad[XTENTRYSTART];
+                       xadlock.count =
+                           le16_to_cpu(p->header.nextindex) -
+                           XTENTRYSTART;
+                       txFreeMap(ip, (struct maplock *) & xadlock, NULL,
+                                 COMMIT_WMAP);
+               }
+               BT_MARK_DIRTY(mp, ip);
+
+               if (p->header.flag & BT_ROOT) {
+                       p->header.flag &= ~BT_INTERNAL;
+                       p->header.flag |= BT_LEAF;
+                       p->header.nextindex = cpu_to_le16(XTENTRYSTART);
+                       if (le16_to_cpu(p->header.maxentry) == XTROOTMAXSLOT) {
+                               /*
+                                * Shrink root down to allow inline
+                                * EA (otherwise fsck complains)
+                                */
+                               p->header.maxentry =
+                                   cpu_to_le16(XTROOTINITSLOT);
+                               JFS_IP(ip)->mode2 |= INLINEEA;
+                       }
+
+                       XT_PUTPAGE(mp); /* debug */
+                       goto out;
+               } else {
+                       if (log) {      /* COMMIT_PWMAP */
+                               /* page will be invalidated at tx completion
+                                */
+                               XT_PUTPAGE(mp);
+                       } else {        /* COMMIT_WMAP */
+
+                               if (mp->lid)
+                                       lid_to_tlock(mp->lid)->flag |=
+                                               tlckFREELOCK;
+
+                               /* invalidate parent page */
+                               discard_metapage(mp);
+                       }
+
+                       /* parent has become empty and freed:
+                        * go back up to its parent page
+                        */
+                       /* freed = 1; */
+                       goto getParent;
+               }
+       }
+       /*
+        * parent page still has entries for front region;
+        */
+       else {
+               /* try truncate region covered by preceding entry
+                * (process backward)
+                */
+               index--;
+
+               /* go back down to the child page corresponding
+                * to the entry
+                */
+               goto getChild;
+       }
+
+       /*
+        *      internal page: go down to child page of current entry
+        */
+      getChild:
+       /* save current parent entry for the child page */
+       if (BT_STACK_FULL(&btstack)) {
+               jfs_error(ip->i_sb, "stack overrun!\n");
+               XT_PUTPAGE(mp);
+               return -EIO;
+       }
+       BT_PUSH(&btstack, bn, index);
+
+       /* get child page */
+       xad = &p->xad[index];
+       bn = addressXAD(xad);
+
+       /*
+        * first access of each internal entry:
+        */
+       /* release parent page */
+       XT_PUTPAGE(mp);
+
+       /* process the child page */
+       goto getPage;
+
+      out:
+       /*
+        * update file resource stat
+        */
+       /* set size
+        */
+       if (S_ISDIR(ip->i_mode) && !newsize)
+               ip->i_size = 1; /* fsck hates zero-length directories */
+       else
+               ip->i_size = newsize;
+
+       /* update quota allocation to reflect freed blocks */
+       dquot_free_block(ip, nfreed);
+
+       /*
+        * free tlock of invalidated pages
+        */
+       if (flag == COMMIT_WMAP)
+               txFreelock(ip);
+
+       return newsize;
+}
+
+
+/*
+ *     xtTruncate_pmap()
+ *
+ * function:
+ *     Perform truncate to zero length for deleted file, leaving the
+ *     the xtree and working map untouched.  This allows the file to
+ *     be accessed via open file handles, while the delete of the file
+ *     is committed to disk.
+ *
+ * parameter:
+ *     tid_t           tid,
+ *     struct inode    *ip,
+ *     s64             committed_size)
+ *
+ * return: new committed size
+ *
+ * note:
+ *
+ *     To avoid deadlock by holding too many transaction locks, the
+ *     truncation may be broken up into multiple transactions.
+ *     The committed_size keeps track of part of the file has been
+ *     freed from the pmaps.
+ */
+s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
+{
+       s64 bn;
+       struct btstack btstack;
+       int cmp;
+       int index;
+       int locked_leaves = 0;
+       struct metapage *mp;
+       xtpage_t *p;
+       struct btframe *parent;
+       int rc;
+       struct tblock *tblk;
+       struct tlock *tlck = NULL;
+       xad_t *xad;
+       int xlen;
+       s64 xoff;
+       struct xtlock *xtlck = NULL;
+
+       /* save object truncation type */
+       tblk = tid_to_tblock(tid);
+       tblk->xflag |= COMMIT_PMAP;
+
+       /* clear stack */
+       BT_CLR(&btstack);
+
+       if (committed_size) {
+               xoff = (committed_size >> JFS_SBI(ip->i_sb)->l2bsize) - 1;
+               rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0);
+               if (rc)
+                       return rc;
+
+               XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
+
+               if (cmp != 0) {
+                       XT_PUTPAGE(mp);
+                       jfs_error(ip->i_sb, "did not find extent\n");
+                       return -EIO;
+               }
+       } else {
+               /*
+                * start with root
+                *
+                * root resides in the inode
+                */
+               bn = 0;
+
+               /*
+                * first access of each page:
+                */
+      getPage:
+               XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+               if (rc)
+                       return rc;
+
+               /* process entries backward from last index */
+               index = le16_to_cpu(p->header.nextindex) - 1;
+
+               if (p->header.flag & BT_INTERNAL)
+                       goto getChild;
+       }
+
+       /*
+        *      leaf page
+        */
+
+       if (++locked_leaves > MAX_TRUNCATE_LEAVES) {
+               /*
+                * We need to limit the size of the transaction
+                * to avoid exhausting pagecache & tlocks
+                */
+               xad = &p->xad[index];
+               xoff = offsetXAD(xad);
+               xlen = lengthXAD(xad);
+               XT_PUTPAGE(mp);
+               return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize;
+       }
+       tlck = txLock(tid, ip, mp, tlckXTREE);
+       tlck->type = tlckXTREE | tlckFREE;
+       xtlck = (struct xtlock *) & tlck->lock;
+       xtlck->hwm.offset = index;
+
+
+       XT_PUTPAGE(mp);
+
+       /*
+        * go back up to the parent page
+        */
+      getParent:
+       /* pop/restore parent entry for the current child page */
+       if ((parent = BT_POP(&btstack)) == NULL)
+               /* current page must have been root */
+               goto out;
+
+       /* get back the parent page */
+       bn = parent->bn;
+       XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
+       if (rc)
+               return rc;
+
+       index = parent->index;
+
+       /*
+        * parent page become empty: free the page
+        */
+       if (index == XTENTRYSTART) {
+               /* txCommit() with tlckFREE:
+                * free child extents covered by parent;
+                * invalidate parent if COMMIT_PWMAP;
+                */
+               tlck = txLock(tid, ip, mp, tlckXTREE);
+               xtlck = (struct xtlock *) & tlck->lock;
+               xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1;
+               tlck->type = tlckXTREE | tlckFREE;
+
+               XT_PUTPAGE(mp);
+
+               if (p->header.flag & BT_ROOT) {
+
+                       goto out;
+               } else {
+                       goto getParent;
+               }
+       }
+       /*
+        * parent page still has entries for front region;
+        */
+       else
+               index--;
+       /*
+        *      internal page: go down to child page of current entry
+        */
+      getChild:
+       /* save current parent entry for the child page */
+       if (BT_STACK_FULL(&btstack)) {
+               jfs_error(ip->i_sb, "stack overrun!\n");
+               XT_PUTPAGE(mp);
+               return -EIO;
+       }
+       BT_PUSH(&btstack, bn, index);
+
+       /* get child page */
+       xad = &p->xad[index];
+       bn = addressXAD(xad);
+
+       /*
+        * first access of each internal entry:
+        */
+       /* release parent page */
+       XT_PUTPAGE(mp);
+
+       /* process the child page */
+       goto getPage;
+
+      out:
+
+       return 0;
+}
+
+#ifdef CONFIG_JFS_STATISTICS
+static int jfs_xtstat_proc_show(struct seq_file *m, void *v)
+{
+       seq_printf(m,
+                      "JFS Xtree statistics\n"
+                      "====================\n"
+                      "searches = %d\n"
+                      "fast searches = %d\n"
+                      "splits = %d\n",
+                      xtStat.search,
+                      xtStat.fastSearch,
+                      xtStat.split);
+       return 0;
+}
+
+static int jfs_xtstat_proc_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, jfs_xtstat_proc_show, NULL);
+}
+
+const struct file_operations jfs_xtstat_proc_fops = {
+       .owner          = THIS_MODULE,
+       .open           = jfs_xtstat_proc_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+#endif