kernel/fs/xfs/xfs_bmap_util.c

   1 /*
   2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3  * Copyright (c) 2012 Red Hat, Inc.
   4  * All Rights Reserved.
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public License as
   8  * published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it would be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write the Free Software Foundation,
  17  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  18  */
  19 #include "xfs.h"
  20 #include "xfs_fs.h"
  21 #include "xfs_shared.h"
  22 #include "xfs_format.h"
  23 #include "xfs_log_format.h"
  24 #include "xfs_trans_resv.h"
  25 #include "xfs_bit.h"
  26 #include "xfs_mount.h"
  27 #include "xfs_da_format.h"
  28 #include "xfs_inode.h"
  29 #include "xfs_btree.h"
  30 #include "xfs_trans.h"
  31 #include "xfs_extfree_item.h"
  32 #include "xfs_alloc.h"
  33 #include "xfs_bmap.h"
  34 #include "xfs_bmap_util.h"
  35 #include "xfs_bmap_btree.h"
  36 #include "xfs_rtalloc.h"
  37 #include "xfs_error.h"
  38 #include "xfs_quota.h"
  39 #include "xfs_trans_space.h"
  40 #include "xfs_trace.h"
  41 #include "xfs_icache.h"
  42 #include "xfs_log.h"
  43
  44 /* Kernel only BMAP related definitions and functions */
  45
  46 /*
  47  * Convert the given file system block to a disk block.  We have to treat it
  48  * differently based on whether the file is a real time file or not, because the
  49  * bmap code does.
  50  */
  51 xfs_daddr_t
  52 xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
  53 {
  54         return (XFS_IS_REALTIME_INODE(ip) ? \
  55                  (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
  56                  XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
  57 }
  58
  59 /*
  60  * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
  61  * caller.  Frees all the extents that need freeing, which must be done
  62  * last due to locking considerations.  We never free any extents in
  63  * the first transaction.
  64  *
  65  * Return 1 if the given transaction was committed and a new one
  66  * started, and 0 otherwise in the committed parameter.
  67  */
  68 int                                             /* error */
  69 xfs_bmap_finish(
  70         xfs_trans_t             **tp,           /* transaction pointer addr */
  71         xfs_bmap_free_t         *flist,         /* i/o: list extents to free */
  72         int                     *committed)     /* xact committed or not */
  73 {
  74         xfs_efd_log_item_t      *efd;           /* extent free data */
  75         xfs_efi_log_item_t      *efi;           /* extent free intention */
  76         int                     error;          /* error return value */
  77         xfs_bmap_free_item_t    *free;          /* free extent item */
  78         struct xfs_trans_res    tres;           /* new log reservation */
  79         xfs_mount_t             *mp;            /* filesystem mount structure */
  80         xfs_bmap_free_item_t    *next;          /* next item on free list */
  81         xfs_trans_t             *ntp;           /* new transaction pointer */
  82
  83         ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
  84         if (flist->xbf_count == 0) {
  85                 *committed = 0;
  86                 return 0;
  87         }
  88         ntp = *tp;
  89         efi = xfs_trans_get_efi(ntp, flist->xbf_count);
  90         for (free = flist->xbf_first; free; free = free->xbfi_next)
  91                 xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
  92                         free->xbfi_blockcount);
  93
  94         tres.tr_logres = ntp->t_log_res;
  95         tres.tr_logcount = ntp->t_log_count;
  96         tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
  97         ntp = xfs_trans_dup(*tp);
  98         error = xfs_trans_commit(*tp, 0);
  99         *tp = ntp;
 100         *committed = 1;
 101         /*
 102          * We have a new transaction, so we should return committed=1,
 103          * even though we're returning an error.
 104          */
 105         if (error)
 106                 return error;
 107
 108         /*
 109          * transaction commit worked ok so we can drop the extra ticket
 110          * reference that we gained in xfs_trans_dup()
 111          */
 112         xfs_log_ticket_put(ntp->t_ticket);
 113
 114         error = xfs_trans_reserve(ntp, &tres, 0, 0);
 115         if (error)
 116                 return error;
 117         efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
 118         for (free = flist->xbf_first; free != NULL; free = next) {
 119                 next = free->xbfi_next;
 120                 if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
 121                                 free->xbfi_blockcount))) {
 122                         /*
 123                          * The bmap free list will be cleaned up at a
 124                          * higher level.  The EFI will be canceled when
 125                          * this transaction is aborted.
 126                          * Need to force shutdown here to make sure it
 127                          * happens, since this transaction may not be
 128                          * dirty yet.
 129                          */
 130                         mp = ntp->t_mountp;
 131                         if (!XFS_FORCED_SHUTDOWN(mp))
 132                                 xfs_force_shutdown(mp,
 133                                                    (error == -EFSCORRUPTED) ?
 134                                                    SHUTDOWN_CORRUPT_INCORE :
 135                                                    SHUTDOWN_META_IO_ERROR);
 136                         return error;
 137                 }
 138                 xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
 139                         free->xbfi_blockcount);
 140                 xfs_bmap_del_free(flist, NULL, free);
 141         }
 142         return 0;
 143 }
 144
 145 int
 146 xfs_bmap_rtalloc(
 147         struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
 148 {
 149         xfs_alloctype_t atype = 0;      /* type for allocation routines */
 150         int             error;          /* error return value */
 151         xfs_mount_t     *mp;            /* mount point structure */
 152         xfs_extlen_t    prod = 0;       /* product factor for allocators */
 153         xfs_extlen_t    ralen = 0;      /* realtime allocation length */
 154         xfs_extlen_t    align;          /* minimum allocation alignment */
 155         xfs_rtblock_t   rtb;
 156
 157         mp = ap->ip->i_mount;
 158         align = xfs_get_extsz_hint(ap->ip);
 159         prod = align / mp->m_sb.sb_rextsize;
 160         error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
 161                                         align, 1, ap->eof, 0,
 162                                         ap->conv, &ap->offset, &ap->length);
 163         if (error)
 164                 return error;
 165         ASSERT(ap->length);
 166         ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
 167
 168         /*
 169          * If the offset & length are not perfectly aligned
 170          * then kill prod, it will just get us in trouble.
 171          */
 172         if (do_mod(ap->offset, align) || ap->length % align)
 173                 prod = 1;
 174         /*
 175          * Set ralen to be the actual requested length in rtextents.
 176          */
 177         ralen = ap->length / mp->m_sb.sb_rextsize;
 178         /*
 179          * If the old value was close enough to MAXEXTLEN that
 180          * we rounded up to it, cut it back so it's valid again.
 181          * Note that if it's a really large request (bigger than
 182          * MAXEXTLEN), we don't hear about that number, and can't
 183          * adjust the starting point to match it.
 184          */
 185         if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
 186                 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
 187
 188         /*
 189          * Lock out other modifications to the RT bitmap inode.
 190          */
 191         xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
 192         xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
 193
 194         /*
 195          * If it's an allocation to an empty file at offset 0,
 196          * pick an extent that will space things out in the rt area.
 197          */
 198         if (ap->eof && ap->offset == 0) {
 199                 xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
 200
 201                 error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
 202                 if (error)
 203                         return error;
 204                 ap->blkno = rtx * mp->m_sb.sb_rextsize;
 205         } else {
 206                 ap->blkno = 0;
 207         }
 208
 209         xfs_bmap_adjacent(ap);
 210
 211         /*
 212          * Realtime allocation, done through xfs_rtallocate_extent.
 213          */
 214         atype = ap->blkno == 0 ?  XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
 215         do_div(ap->blkno, mp->m_sb.sb_rextsize);
 216         rtb = ap->blkno;
 217         ap->length = ralen;
 218         if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
 219                                 &ralen, atype, ap->wasdel, prod, &rtb)))
 220                 return error;
 221         if (rtb == NULLFSBLOCK && prod > 1 &&
 222             (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
 223                                            ap->length, &ralen, atype,
 224                                            ap->wasdel, 1, &rtb)))
 225                 return error;
 226         ap->blkno = rtb;
 227         if (ap->blkno != NULLFSBLOCK) {
 228                 ap->blkno *= mp->m_sb.sb_rextsize;
 229                 ralen *= mp->m_sb.sb_rextsize;
 230                 ap->length = ralen;
 231                 ap->ip->i_d.di_nblocks += ralen;
 232                 xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
 233                 if (ap->wasdel)
 234                         ap->ip->i_delayed_blks -= ralen;
 235                 /*
 236                  * Adjust the disk quota also. This was reserved
 237                  * earlier.
 238                  */
 239                 xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
 240                         ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
 241                                         XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
 242         } else {
 243                 ap->length = 0;
 244         }
 245         return 0;
 246 }
 247
 248 /*
 249  * Check if the endoff is outside the last extent. If so the caller will grow
 250  * the allocation to a stripe unit boundary.  All offsets are considered outside
 251  * the end of file for an empty fork, so 1 is returned in *eof in that case.
 252  */
 253 int
 254 xfs_bmap_eof(
 255         struct xfs_inode        *ip,
 256         xfs_fileoff_t           endoff,
 257         int                     whichfork,
 258         int                     *eof)
 259 {
 260         struct xfs_bmbt_irec    rec;
 261         int                     error;
 262
 263         error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
 264         if (error || *eof)
 265                 return error;
 266
 267         *eof = endoff >= rec.br_startoff + rec.br_blockcount;
 268         return 0;
 269 }
 270
 271 /*
 272  * Extent tree block counting routines.
 273  */
 274
 275 /*
 276  * Count leaf blocks given a range of extent records.
 277  */
 278 STATIC void
 279 xfs_bmap_count_leaves(
 280         xfs_ifork_t             *ifp,
 281         xfs_extnum_t            idx,
 282         int                     numrecs,
 283         int                     *count)
 284 {
 285         int             b;
 286
 287         for (b = 0; b < numrecs; b++) {
 288                 xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
 289                 *count += xfs_bmbt_get_blockcount(frp);
 290         }
 291 }
 292
 293 /*
 294  * Count leaf blocks given a range of extent records originally
 295  * in btree format.
 296  */
 297 STATIC void
 298 xfs_bmap_disk_count_leaves(
 299         struct xfs_mount        *mp,
 300         struct xfs_btree_block  *block,
 301         int                     numrecs,
 302         int                     *count)
 303 {
 304         int             b;
 305         xfs_bmbt_rec_t  *frp;
 306
 307         for (b = 1; b <= numrecs; b++) {
 308                 frp = XFS_BMBT_REC_ADDR(mp, block, b);
 309                 *count += xfs_bmbt_disk_get_blockcount(frp);
 310         }
 311 }
 312
 313 /*
 314  * Recursively walks each level of a btree
 315  * to count total fsblocks in use.
 316  */
 317 STATIC int                                     /* error */
 318 xfs_bmap_count_tree(
 319         xfs_mount_t     *mp,            /* file system mount point */
 320         xfs_trans_t     *tp,            /* transaction pointer */
 321         xfs_ifork_t     *ifp,           /* inode fork pointer */
 322         xfs_fsblock_t   blockno,        /* file system block number */
 323         int             levelin,        /* level in btree */
 324         int             *count)         /* Count of blocks */
 325 {
 326         int                     error;
 327         xfs_buf_t               *bp, *nbp;
 328         int                     level = levelin;
 329         __be64                  *pp;
 330         xfs_fsblock_t           bno = blockno;
 331         xfs_fsblock_t           nextbno;
 332         struct xfs_btree_block  *block, *nextblock;
 333         int                     numrecs;
 334
 335         error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
 336                                                 &xfs_bmbt_buf_ops);
 337         if (error)
 338                 return error;
 339         *count += 1;
 340         block = XFS_BUF_TO_BLOCK(bp);
 341
 342         if (--level) {
 343                 /* Not at node above leaves, count this level of nodes */
 344                 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
 345                 while (nextbno != NULLFSBLOCK) {
 346                         error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
 347                                                 XFS_BMAP_BTREE_REF,
 348                                                 &xfs_bmbt_buf_ops);
 349                         if (error)
 350                                 return error;
 351                         *count += 1;
 352                         nextblock = XFS_BUF_TO_BLOCK(nbp);
 353                         nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
 354                         xfs_trans_brelse(tp, nbp);
 355                 }
 356
 357                 /* Dive to the next level */
 358                 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
 359                 bno = be64_to_cpu(*pp);
 360                 if (unlikely((error =
 361                      xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
 362                         xfs_trans_brelse(tp, bp);
 363                         XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
 364                                          XFS_ERRLEVEL_LOW, mp);
 365                         return -EFSCORRUPTED;
 366                 }
 367                 xfs_trans_brelse(tp, bp);
 368         } else {
 369                 /* count all level 1 nodes and their leaves */
 370                 for (;;) {
 371                         nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
 372                         numrecs = be16_to_cpu(block->bb_numrecs);
 373                         xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
 374                         xfs_trans_brelse(tp, bp);
 375                         if (nextbno == NULLFSBLOCK)
 376                                 break;
 377                         bno = nextbno;
 378                         error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
 379                                                 XFS_BMAP_BTREE_REF,
 380                                                 &xfs_bmbt_buf_ops);
 381                         if (error)
 382                                 return error;
 383                         *count += 1;
 384                         block = XFS_BUF_TO_BLOCK(bp);
 385                 }
 386         }
 387         return 0;
 388 }
 389
 390 /*
 391  * Count fsblocks of the given fork.
 392  */
 393 int                                             /* error */
 394 xfs_bmap_count_blocks(
 395         xfs_trans_t             *tp,            /* transaction pointer */
 396         xfs_inode_t             *ip,            /* incore inode */
 397         int                     whichfork,      /* data or attr fork */
 398         int                     *count)         /* out: count of blocks */
 399 {
 400         struct xfs_btree_block  *block; /* current btree block */
 401         xfs_fsblock_t           bno;    /* block # of "block" */
 402         xfs_ifork_t             *ifp;   /* fork structure */
 403         int                     level;  /* btree level, for checking */
 404         xfs_mount_t             *mp;    /* file system mount structure */
 405         __be64                  *pp;    /* pointer to block address */
 406
 407         bno = NULLFSBLOCK;
 408         mp = ip->i_mount;
 409         ifp = XFS_IFORK_PTR(ip, whichfork);
 410         if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
 411                 xfs_bmap_count_leaves(ifp, 0,
 412                         ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
 413                         count);
 414                 return 0;
 415         }
 416
 417         /*
 418          * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
 419          */
 420         block = ifp->if_broot;
 421         level = be16_to_cpu(block->bb_level);
 422         ASSERT(level > 0);
 423         pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
 424         bno = be64_to_cpu(*pp);
 425         ASSERT(bno != NULLFSBLOCK);
 426         ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
 427         ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
 428
 429         if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
 430                 XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
 431                                  mp);
 432                 return -EFSCORRUPTED;
 433         }
 434
 435         return 0;
 436 }
 437
 438 /*
 439  * returns 1 for success, 0 if we failed to map the extent.
 440  */
 441 STATIC int
 442 xfs_getbmapx_fix_eof_hole(
 443         xfs_inode_t             *ip,            /* xfs incore inode pointer */
 444         struct getbmapx         *out,           /* output structure */
 445         int                     prealloced,     /* this is a file with
 446                                                  * preallocated data space */
 447         __int64_t               end,            /* last block requested */
 448         xfs_fsblock_t           startblock)
 449 {
 450         __int64_t               fixlen;
 451         xfs_mount_t             *mp;            /* file system mount point */
 452         xfs_ifork_t             *ifp;           /* inode fork pointer */
 453         xfs_extnum_t            lastx;          /* last extent pointer */
 454         xfs_fileoff_t           fileblock;
 455
 456         if (startblock == HOLESTARTBLOCK) {
 457                 mp = ip->i_mount;
 458                 out->bmv_block = -1;
 459                 fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
 460                 fixlen -= out->bmv_offset;
 461                 if (prealloced && out->bmv_offset + out->bmv_length == end) {
 462                         /* Came to hole at EOF. Trim it. */
 463                         if (fixlen <= 0)
 464                                 return 0;
 465                         out->bmv_length = fixlen;
 466                 }
 467         } else {
 468                 if (startblock == DELAYSTARTBLOCK)
 469                         out->bmv_block = -2;
 470                 else
 471                         out->bmv_block = xfs_fsb_to_db(ip, startblock);
 472                 fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
 473                 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 474                 if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
 475                    (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
 476                         out->bmv_oflags |= BMV_OF_LAST;
 477         }
 478
 479         return 1;
 480 }
 481
 482 /*
 483  * Get inode's extents as described in bmv, and format for output.
 484  * Calls formatter to fill the user's buffer until all extents
 485  * are mapped, until the passed-in bmv->bmv_count slots have
 486  * been filled, or until the formatter short-circuits the loop,
 487  * if it is tracking filled-in extents on its own.
 488  */
 489 int                                             /* error code */
 490 xfs_getbmap(
 491         xfs_inode_t             *ip,
 492         struct getbmapx         *bmv,           /* user bmap structure */
 493         xfs_bmap_format_t       formatter,      /* format to user */
 494         void                    *arg)           /* formatter arg */
 495 {
 496         __int64_t               bmvend;         /* last block requested */
 497         int                     error = 0;      /* return value */
 498         __int64_t               fixlen;         /* length for -1 case */
 499         int                     i;              /* extent number */
 500         int                     lock;           /* lock state */
 501         xfs_bmbt_irec_t         *map;           /* buffer for user's data */
 502         xfs_mount_t             *mp;            /* file system mount point */
 503         int                     nex;            /* # of user extents can do */
 504         int                     nexleft;        /* # of user extents left */
 505         int                     subnex;         /* # of bmapi's can do */
 506         int                     nmap;           /* number of map entries */
 507         struct getbmapx         *out;           /* output structure */
 508         int                     whichfork;      /* data or attr fork */
 509         int                     prealloced;     /* this is a file with
 510                                                  * preallocated data space */
 511         int                     iflags;         /* interface flags */
 512         int                     bmapi_flags;    /* flags for xfs_bmapi */
 513         int                     cur_ext = 0;
 514
 515         mp = ip->i_mount;
 516         iflags = bmv->bmv_iflags;
 517         whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
 518
 519         if (whichfork == XFS_ATTR_FORK) {
 520                 if (XFS_IFORK_Q(ip)) {
 521                         if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
 522                             ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
 523                             ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
 524                                 return -EINVAL;
 525                 } else if (unlikely(
 526                            ip->i_d.di_aformat != 0 &&
 527                            ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
 528                         XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
 529                                          ip->i_mount);
 530                         return -EFSCORRUPTED;
 531                 }
 532
 533                 prealloced = 0;
 534                 fixlen = 1LL << 32;
 535         } else {
 536                 if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
 537                     ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
 538                     ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
 539                         return -EINVAL;
 540
 541                 if (xfs_get_extsz_hint(ip) ||
 542                     ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
 543                         prealloced = 1;
 544                         fixlen = mp->m_super->s_maxbytes;
 545                 } else {
 546                         prealloced = 0;
 547                         fixlen = XFS_ISIZE(ip);
 548                 }
 549         }
 550
 551         if (bmv->bmv_length == -1) {
 552                 fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
 553                 bmv->bmv_length =
 554                         max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
 555         } else if (bmv->bmv_length == 0) {
 556                 bmv->bmv_entries = 0;
 557                 return 0;
 558         } else if (bmv->bmv_length < 0) {
 559                 return -EINVAL;
 560         }
 561
 562         nex = bmv->bmv_count - 1;
 563         if (nex <= 0)
 564                 return -EINVAL;
 565         bmvend = bmv->bmv_offset + bmv->bmv_length;
 566
 567
 568         if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
 569                 return -ENOMEM;
 570         out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
 571         if (!out)
 572                 return -ENOMEM;
 573
 574         xfs_ilock(ip, XFS_IOLOCK_SHARED);
 575         if (whichfork == XFS_DATA_FORK) {
 576                 if (!(iflags & BMV_IF_DELALLOC) &&
 577                     (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
 578                         error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
 579                         if (error)
 580                                 goto out_unlock_iolock;
 581
 582                         /*
 583                          * Even after flushing the inode, there can still be
 584                          * delalloc blocks on the inode beyond EOF due to
 585                          * speculative preallocation.  These are not removed
 586                          * until the release function is called or the inode
 587                          * is inactivated.  Hence we cannot assert here that
 588                          * ip->i_delayed_blks == 0.
 589                          */
 590                 }
 591
 592                 lock = xfs_ilock_data_map_shared(ip);
 593         } else {
 594                 lock = xfs_ilock_attr_map_shared(ip);
 595         }
 596
 597         /*
 598          * Don't let nex be bigger than the number of extents
 599          * we can have assuming alternating holes and real extents.
 600          */
 601         if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
 602                 nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
 603
 604         bmapi_flags = xfs_bmapi_aflag(whichfork);
 605         if (!(iflags & BMV_IF_PREALLOC))
 606                 bmapi_flags |= XFS_BMAPI_IGSTATE;
 607
 608         /*
 609          * Allocate enough space to handle "subnex" maps at a time.
 610          */
 611         error = -ENOMEM;
 612         subnex = 16;
 613         map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
 614         if (!map)
 615                 goto out_unlock_ilock;
 616
 617         bmv->bmv_entries = 0;
 618
 619         if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
 620             (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
 621                 error = 0;
 622                 goto out_free_map;
 623         }
 624
 625         nexleft = nex;
 626
 627         do {
 628                 nmap = (nexleft > subnex) ? subnex : nexleft;
 629                 error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
 630                                        XFS_BB_TO_FSB(mp, bmv->bmv_length),
 631                                        map, &nmap, bmapi_flags);
 632                 if (error)
 633                         goto out_free_map;
 634                 ASSERT(nmap <= subnex);
 635
 636                 for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
 637                         out[cur_ext].bmv_oflags = 0;
 638                         if (map[i].br_state == XFS_EXT_UNWRITTEN)
 639                                 out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
 640                         else if (map[i].br_startblock == DELAYSTARTBLOCK)
 641                                 out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
 642                         out[cur_ext].bmv_offset =
 643                                 XFS_FSB_TO_BB(mp, map[i].br_startoff);
 644                         out[cur_ext].bmv_length =
 645                                 XFS_FSB_TO_BB(mp, map[i].br_blockcount);
 646                         out[cur_ext].bmv_unused1 = 0;
 647                         out[cur_ext].bmv_unused2 = 0;
 648
 649                         /*
 650                          * delayed allocation extents that start beyond EOF can
 651                          * occur due to speculative EOF allocation when the
 652                          * delalloc extent is larger than the largest freespace
 653                          * extent at conversion time. These extents cannot be
 654                          * converted by data writeback, so can exist here even
 655                          * if we are not supposed to be finding delalloc
 656                          * extents.
 657                          */
 658                         if (map[i].br_startblock == DELAYSTARTBLOCK &&
 659                             map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
 660                                 ASSERT((iflags & BMV_IF_DELALLOC) != 0);
 661
 662                         if (map[i].br_startblock == HOLESTARTBLOCK &&
 663                             whichfork == XFS_ATTR_FORK) {
 664                                 /* came to the end of attribute fork */
 665                                 out[cur_ext].bmv_oflags |= BMV_OF_LAST;
 666                                 goto out_free_map;
 667                         }
 668
 669                         if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
 670                                         prealloced, bmvend,
 671                                         map[i].br_startblock))
 672                                 goto out_free_map;
 673
 674                         bmv->bmv_offset =
 675                                 out[cur_ext].bmv_offset +
 676                                 out[cur_ext].bmv_length;
 677                         bmv->bmv_length =
 678                                 max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
 679
 680                         /*
 681                          * In case we don't want to return the hole,
 682                          * don't increase cur_ext so that we can reuse
 683                          * it in the next loop.
 684                          */
 685                         if ((iflags & BMV_IF_NO_HOLES) &&
 686                             map[i].br_startblock == HOLESTARTBLOCK) {
 687                                 memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
 688                                 continue;
 689                         }
 690
 691                         nexleft--;
 692                         bmv->bmv_entries++;
 693                         cur_ext++;
 694                 }
 695         } while (nmap && nexleft && bmv->bmv_length);
 696
 697  out_free_map:
 698         kmem_free(map);
 699  out_unlock_ilock:
 700         xfs_iunlock(ip, lock);
 701  out_unlock_iolock:
 702         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 703
 704         for (i = 0; i < cur_ext; i++) {
 705                 int full = 0;   /* user array is full */
 706
 707                 /* format results & advance arg */
 708                 error = formatter(&arg, &out[i], &full);
 709                 if (error || full)
 710                         break;
 711         }
 712
 713         kmem_free(out);
 714         return error;
 715 }
 716
 717 /*
 718  * dead simple method of punching delalyed allocation blocks from a range in
 719  * the inode. Walks a block at a time so will be slow, but is only executed in
 720  * rare error cases so the overhead is not critical. This will always punch out
 721  * both the start and end blocks, even if the ranges only partially overlap
 722  * them, so it is up to the caller to ensure that partial blocks are not
 723  * passed in.
 724  */
 725 int
 726 xfs_bmap_punch_delalloc_range(
 727         struct xfs_inode        *ip,
 728         xfs_fileoff_t           start_fsb,
 729         xfs_fileoff_t           length)
 730 {
 731         xfs_fileoff_t           remaining = length;
 732         int                     error = 0;
 733
 734         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 735
 736         do {
 737                 int             done;
 738                 xfs_bmbt_irec_t imap;
 739                 int             nimaps = 1;
 740                 xfs_fsblock_t   firstblock;
 741                 xfs_bmap_free_t flist;
 742
 743                 /*
 744                  * Map the range first and check that it is a delalloc extent
 745                  * before trying to unmap the range. Otherwise we will be
 746                  * trying to remove a real extent (which requires a
 747                  * transaction) or a hole, which is probably a bad idea...
 748                  */
 749                 error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
 750                                        XFS_BMAPI_ENTIRE);
 751
 752                 if (error) {
 753                         /* something screwed, just bail */
 754                         if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 755                                 xfs_alert(ip->i_mount,
 756                         "Failed delalloc mapping lookup ino %lld fsb %lld.",
 757                                                 ip->i_ino, start_fsb);
 758                         }
 759                         break;
 760                 }
 761                 if (!nimaps) {
 762                         /* nothing there */
 763                         goto next_block;
 764                 }
 765                 if (imap.br_startblock != DELAYSTARTBLOCK) {
 766                         /* been converted, ignore */
 767                         goto next_block;
 768                 }
 769                 WARN_ON(imap.br_blockcount == 0);
 770
 771                 /*
 772                  * Note: while we initialise the firstblock/flist pair, they
 773                  * should never be used because blocks should never be
 774                  * allocated or freed for a delalloc extent and hence we need
 775                  * don't cancel or finish them after the xfs_bunmapi() call.
 776                  */
 777                 xfs_bmap_init(&flist, &firstblock);
 778                 error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
 779                                         &flist, &done);
 780                 if (error)
 781                         break;
 782
 783                 ASSERT(!flist.xbf_count && !flist.xbf_first);
 784 next_block:
 785                 start_fsb++;
 786                 remaining--;
 787         } while(remaining > 0);
 788
 789         return error;
 790 }
 791
 792 /*
 793  * Test whether it is appropriate to check an inode for and free post EOF
 794  * blocks. The 'force' parameter determines whether we should also consider
 795  * regular files that are marked preallocated or append-only.
 796  */
 797 bool
 798 xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
 799 {
 800         /* prealloc/delalloc exists only on regular files */
 801         if (!S_ISREG(ip->i_d.di_mode))
 802                 return false;
 803
 804         /*
 805          * Zero sized files with no cached pages and delalloc blocks will not
 806          * have speculative prealloc/delalloc blocks to remove.
 807          */
 808         if (VFS_I(ip)->i_size == 0 &&
 809             VFS_I(ip)->i_mapping->nrpages == 0 &&
 810             ip->i_delayed_blks == 0)
 811                 return false;
 812
 813         /* If we haven't read in the extent list, then don't do it now. */
 814         if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
 815                 return false;
 816
 817         /*
 818          * Do not free real preallocated or append-only files unless the file
 819          * has delalloc blocks and we are forced to remove them.
 820          */
 821         if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
 822                 if (!force || ip->i_delayed_blks == 0)
 823                         return false;
 824
 825         return true;
 826 }
 827
 828 /*
 829  * This is called by xfs_inactive to free any blocks beyond eof
 830  * when the link count isn't zero and by xfs_dm_punch_hole() when
 831  * punching a hole to EOF.
 832  */
 833 int
 834 xfs_free_eofblocks(
 835         xfs_mount_t     *mp,
 836         xfs_inode_t     *ip,
 837         bool            need_iolock)
 838 {
 839         xfs_trans_t     *tp;
 840         int             error;
 841         xfs_fileoff_t   end_fsb;
 842         xfs_fileoff_t   last_fsb;
 843         xfs_filblks_t   map_len;
 844         int             nimaps;
 845         xfs_bmbt_irec_t imap;
 846
 847         /*
 848          * Figure out if there are any blocks beyond the end
 849          * of the file.  If not, then there is nothing to do.
 850          */
 851         end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
 852         last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
 853         if (last_fsb <= end_fsb)
 854                 return 0;
 855         map_len = last_fsb - end_fsb;
 856
 857         nimaps = 1;
 858         xfs_ilock(ip, XFS_ILOCK_SHARED);
 859         error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
 860         xfs_iunlock(ip, XFS_ILOCK_SHARED);
 861
 862         if (!error && (nimaps != 0) &&
 863             (imap.br_startblock != HOLESTARTBLOCK ||
 864              ip->i_delayed_blks)) {
 865                 /*
 866                  * Attach the dquots to the inode up front.
 867                  */
 868                 error = xfs_qm_dqattach(ip, 0);
 869                 if (error)
 870                         return error;
 871
 872                 /*
 873                  * There are blocks after the end of file.
 874                  * Free them up now by truncating the file to
 875                  * its current size.
 876                  */
 877                 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
 878
 879                 if (need_iolock) {
 880                         if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
 881                                 xfs_trans_cancel(tp, 0);
 882                                 return -EAGAIN;
 883                         }
 884                 }
 885
 886                 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
 887                 if (error) {
 888                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
 889                         xfs_trans_cancel(tp, 0);
 890                         if (need_iolock)
 891                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 892                         return error;
 893                 }
 894
 895                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 896                 xfs_trans_ijoin(tp, ip, 0);
 897
 898                 /*
 899                  * Do not update the on-disk file size.  If we update the
 900                  * on-disk file size and then the system crashes before the
 901                  * contents of the file are flushed to disk then the files
 902                  * may be full of holes (ie NULL files bug).
 903                  */
 904                 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
 905                                               XFS_ISIZE(ip));
 906                 if (error) {
 907                         /*
 908                          * If we get an error at this point we simply don't
 909                          * bother truncating the file.
 910                          */
 911                         xfs_trans_cancel(tp,
 912                                          (XFS_TRANS_RELEASE_LOG_RES |
 913                                           XFS_TRANS_ABORT));
 914                 } else {
 915                         error = xfs_trans_commit(tp,
 916                                                 XFS_TRANS_RELEASE_LOG_RES);
 917                         if (!error)
 918                                 xfs_inode_clear_eofblocks_tag(ip);
 919                 }
 920
 921                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
 922                 if (need_iolock)
 923                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 924         }
 925         return error;
 926 }
 927
 928 int
 929 xfs_alloc_file_space(
 930         struct xfs_inode        *ip,
 931         xfs_off_t               offset,
 932         xfs_off_t               len,
 933         int                     alloc_type)
 934 {
 935         xfs_mount_t             *mp = ip->i_mount;
 936         xfs_off_t               count;
 937         xfs_filblks_t           allocated_fsb;
 938         xfs_filblks_t           allocatesize_fsb;
 939         xfs_extlen_t            extsz, temp;
 940         xfs_fileoff_t           startoffset_fsb;
 941         xfs_fsblock_t           firstfsb;
 942         int                     nimaps;
 943         int                     quota_flag;
 944         int                     rt;
 945         xfs_trans_t             *tp;
 946         xfs_bmbt_irec_t         imaps[1], *imapp;
 947         xfs_bmap_free_t         free_list;
 948         uint                    qblocks, resblks, resrtextents;
 949         int                     committed;
 950         int                     error;
 951
 952         trace_xfs_alloc_file_space(ip);
 953
 954         if (XFS_FORCED_SHUTDOWN(mp))
 955                 return -EIO;
 956
 957         error = xfs_qm_dqattach(ip, 0);
 958         if (error)
 959                 return error;
 960
 961         if (len <= 0)
 962                 return -EINVAL;
 963
 964         rt = XFS_IS_REALTIME_INODE(ip);
 965         extsz = xfs_get_extsz_hint(ip);
 966
 967         count = len;
 968         imapp = &imaps[0];
 969         nimaps = 1;
 970         startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
 971         allocatesize_fsb = XFS_B_TO_FSB(mp, count);
 972
 973         /*
 974          * Allocate file space until done or until there is an error
 975          */
 976         while (allocatesize_fsb && !error) {
 977                 xfs_fileoff_t   s, e;
 978
 979                 /*
 980                  * Determine space reservations for data/realtime.
 981                  */
 982                 if (unlikely(extsz)) {
 983                         s = startoffset_fsb;
 984                         do_div(s, extsz);
 985                         s *= extsz;
 986                         e = startoffset_fsb + allocatesize_fsb;
 987                         if ((temp = do_mod(startoffset_fsb, extsz)))
 988                                 e += temp;
 989                         if ((temp = do_mod(e, extsz)))
 990                                 e += extsz - temp;
 991                 } else {
 992                         s = 0;
 993                         e = allocatesize_fsb;
 994                 }
 995
 996                 /*
 997                  * The transaction reservation is limited to a 32-bit block
 998                  * count, hence we need to limit the number of blocks we are
 999                  * trying to reserve to avoid an overflow. We can't allocate
1000                  * more than @nimaps extents, and an extent is limited on disk
1001                  * to MAXEXTLEN (21 bits), so use that to enforce the limit.
1002                  */
1003                 resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
1004                 if (unlikely(rt)) {
1005                         resrtextents = qblocks = resblks;
1006                         resrtextents /= mp->m_sb.sb_rextsize;
1007                         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1008                         quota_flag = XFS_QMOPT_RES_RTBLKS;
1009                 } else {
1010                         resrtextents = 0;
1011                         resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
1012                         quota_flag = XFS_QMOPT_RES_REGBLKS;
1013                 }
1014
1015                 /*
1016                  * Allocate and setup the transaction.
1017                  */
1018                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1019                 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
1020                                           resblks, resrtextents);
1021                 /*
1022                  * Check for running out of space
1023                  */
1024                 if (error) {
1025                         /*
1026                          * Free the transaction structure.
1027                          */
1028                         ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1029                         xfs_trans_cancel(tp, 0);
1030                         break;
1031                 }
1032                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1033                 error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
1034                                                       0, quota_flag);
1035                 if (error)
1036                         goto error1;
1037
1038                 xfs_trans_ijoin(tp, ip, 0);
1039
1040                 xfs_bmap_init(&free_list, &firstfsb);
1041                 error = xfs_bmapi_write(tp, ip, startoffset_fsb,
1042                                         allocatesize_fsb, alloc_type, &firstfsb,
1043                                         0, imapp, &nimaps, &free_list);
1044                 if (error) {
1045                         goto error0;
1046                 }
1047
1048                 /*
1049                  * Complete the transaction
1050                  */
1051                 error = xfs_bmap_finish(&tp, &free_list, &committed);
1052                 if (error) {
1053                         goto error0;
1054                 }
1055
1056                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1057                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1058                 if (error) {
1059                         break;
1060                 }
1061
1062                 allocated_fsb = imapp->br_blockcount;
1063
1064                 if (nimaps == 0) {
1065                         error = -ENOSPC;
1066                         break;
1067                 }
1068
1069                 startoffset_fsb += allocated_fsb;
1070                 allocatesize_fsb -= allocated_fsb;
1071         }
1072
1073         return error;
1074
1075 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
1076         xfs_bmap_cancel(&free_list);
1077         xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
1078
1079 error1: /* Just cancel transaction */
1080         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1081         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1082         return error;
1083 }
1084
1085 /*
1086  * Zero file bytes between startoff and endoff inclusive.
1087  * The iolock is held exclusive and no blocks are buffered.
1088  *
1089  * This function is used by xfs_free_file_space() to zero
1090  * partial blocks when the range to free is not block aligned.
1091  * When unreserving space with boundaries that are not block
1092  * aligned we round up the start and round down the end
1093  * boundaries and then use this function to zero the parts of
1094  * the blocks that got dropped during the rounding.
1095  */
1096 STATIC int
1097 xfs_zero_remaining_bytes(
1098         xfs_inode_t             *ip,
1099         xfs_off_t               startoff,
1100         xfs_off_t               endoff)
1101 {
1102         xfs_bmbt_irec_t         imap;
1103         xfs_fileoff_t           offset_fsb;
1104         xfs_off_t               lastoffset;
1105         xfs_off_t               offset;
1106         xfs_buf_t               *bp;
1107         xfs_mount_t             *mp = ip->i_mount;
1108         int                     nimap;
1109         int                     error = 0;
1110
1111         /*
1112          * Avoid doing I/O beyond eof - it's not necessary
1113          * since nothing can read beyond eof.  The space will
1114          * be zeroed when the file is extended anyway.
1115          */
1116         if (startoff >= XFS_ISIZE(ip))
1117                 return 0;
1118
1119         if (endoff > XFS_ISIZE(ip))
1120                 endoff = XFS_ISIZE(ip);
1121
1122         for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
1123                 uint lock_mode;
1124
1125                 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1126                 nimap = 1;
1127
1128                 lock_mode = xfs_ilock_data_map_shared(ip);
1129                 error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
1130                 xfs_iunlock(ip, lock_mode);
1131
1132                 if (error || nimap < 1)
1133                         break;
1134                 ASSERT(imap.br_blockcount >= 1);
1135                 ASSERT(imap.br_startoff == offset_fsb);
1136                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
1137                 if (lastoffset > endoff)
1138                         lastoffset = endoff;
1139                 if (imap.br_startblock == HOLESTARTBLOCK)
1140                         continue;
1141                 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1142                 if (imap.br_state == XFS_EXT_UNWRITTEN)
1143                         continue;
1144
1145                 error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
1146                                 mp->m_rtdev_targp : mp->m_ddev_targp,
1147                                 xfs_fsb_to_db(ip, imap.br_startblock),
1148                                 BTOBB(mp->m_sb.sb_blocksize),
1149                                 0, &bp, NULL);
1150                 if (error)
1151                         return error;
1152
1153                 memset(bp->b_addr +
1154                                 (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
1155                        0, lastoffset - offset + 1);
1156
1157                 error = xfs_bwrite(bp);
1158                 xfs_buf_relse(bp);
1159                 if (error)
1160                         return error;
1161         }
1162         return error;
1163 }
1164
1165 int
1166 xfs_free_file_space(
1167         struct xfs_inode        *ip,
1168         xfs_off_t               offset,
1169         xfs_off_t               len)
1170 {
1171         int                     committed;
1172         int                     done;
1173         xfs_fileoff_t           endoffset_fsb;
1174         int                     error;
1175         xfs_fsblock_t           firstfsb;
1176         xfs_bmap_free_t         free_list;
1177         xfs_bmbt_irec_t         imap;
1178         xfs_off_t               ioffset;
1179         xfs_off_t               iendoffset;
1180         xfs_extlen_t            mod=0;
1181         xfs_mount_t             *mp;
1182         int                     nimap;
1183         uint                    resblks;
1184         xfs_off_t               rounding;
1185         int                     rt;
1186         xfs_fileoff_t           startoffset_fsb;
1187         xfs_trans_t             *tp;
1188
1189         mp = ip->i_mount;
1190
1191         trace_xfs_free_file_space(ip);
1192
1193         error = xfs_qm_dqattach(ip, 0);
1194         if (error)
1195                 return error;
1196
1197         error = 0;
1198         if (len <= 0)   /* if nothing being freed */
1199                 return error;
1200         rt = XFS_IS_REALTIME_INODE(ip);
1201         startoffset_fsb = XFS_B_TO_FSB(mp, offset);
1202         endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
1203
1204         /* wait for the completion of any pending DIOs */
1205         inode_dio_wait(VFS_I(ip));
1206
1207         rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
1208         ioffset = round_down(offset, rounding);
1209         iendoffset = round_up(offset + len, rounding) - 1;
1210         error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset,
1211                                              iendoffset);
1212         if (error)
1213                 goto out;
1214         truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset);
1215
1216         /*
1217          * Need to zero the stuff we're not freeing, on disk.
1218          * If it's a realtime file & can't use unwritten extents then we
1219          * actually need to zero the extent edges.  Otherwise xfs_bunmapi
1220          * will take care of it for us.
1221          */
1222         if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
1223                 nimap = 1;
1224                 error = xfs_bmapi_read(ip, startoffset_fsb, 1,
1225                                         &imap, &nimap, 0);
1226                 if (error)
1227                         goto out;
1228                 ASSERT(nimap == 0 || nimap == 1);
1229                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1230                         xfs_daddr_t     block;
1231
1232                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1233                         block = imap.br_startblock;
1234                         mod = do_div(block, mp->m_sb.sb_rextsize);
1235                         if (mod)
1236                                 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
1237                 }
1238                 nimap = 1;
1239                 error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
1240                                         &imap, &nimap, 0);
1241                 if (error)
1242                         goto out;
1243                 ASSERT(nimap == 0 || nimap == 1);
1244                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1245                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1246                         mod++;
1247                         if (mod && (mod != mp->m_sb.sb_rextsize))
1248                                 endoffset_fsb -= mod;
1249                 }
1250         }
1251         if ((done = (endoffset_fsb <= startoffset_fsb)))
1252                 /*
1253                  * One contiguous piece to clear
1254                  */
1255                 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
1256         else {
1257                 /*
1258                  * Some full blocks, possibly two pieces to clear
1259                  */
1260                 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
1261                         error = xfs_zero_remaining_bytes(ip, offset,
1262                                 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
1263                 if (!error &&
1264                     XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
1265                         error = xfs_zero_remaining_bytes(ip,
1266                                 XFS_FSB_TO_B(mp, endoffset_fsb),
1267                                 offset + len - 1);
1268         }
1269
1270         /*
1271          * free file space until done or until there is an error
1272          */
1273         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1274         while (!error && !done) {
1275
1276                 /*
1277                  * allocate and setup the transaction. Allow this
1278                  * transaction to dip into the reserve blocks to ensure
1279                  * the freeing of the space succeeds at ENOSPC.
1280                  */
1281                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1282                 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
1283
1284                 /*
1285                  * check for running out of space
1286                  */
1287                 if (error) {
1288                         /*
1289                          * Free the transaction structure.
1290                          */
1291                         ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1292                         xfs_trans_cancel(tp, 0);
1293                         break;
1294                 }
1295                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1296                 error = xfs_trans_reserve_quota(tp, mp,
1297                                 ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
1298                                 resblks, 0, XFS_QMOPT_RES_REGBLKS);
1299                 if (error)
1300                         goto error1;
1301
1302                 xfs_trans_ijoin(tp, ip, 0);
1303
1304                 /*
1305                  * issue the bunmapi() call to free the blocks
1306                  */
1307                 xfs_bmap_init(&free_list, &firstfsb);
1308                 error = xfs_bunmapi(tp, ip, startoffset_fsb,
1309                                   endoffset_fsb - startoffset_fsb,
1310                                   0, 2, &firstfsb, &free_list, &done);
1311                 if (error) {
1312                         goto error0;
1313                 }
1314
1315                 /*
1316                  * complete the transaction
1317                  */
1318                 error = xfs_bmap_finish(&tp, &free_list, &committed);
1319                 if (error) {
1320                         goto error0;
1321                 }
1322
1323                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1324                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1325         }
1326
1327  out:
1328         return error;
1329
1330  error0:
1331         xfs_bmap_cancel(&free_list);
1332  error1:
1333         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1334         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1335         goto out;
1336 }
1337
1338 /*
1339  * Preallocate and zero a range of a file. This mechanism has the allocation
1340  * semantics of fallocate and in addition converts data in the range to zeroes.
1341  */
1342 int
1343 xfs_zero_file_space(
1344         struct xfs_inode        *ip,
1345         xfs_off_t               offset,
1346         xfs_off_t               len)
1347 {
1348         struct xfs_mount        *mp = ip->i_mount;
1349         uint                    blksize;
1350         int                     error;
1351
1352         trace_xfs_zero_file_space(ip);
1353
1354         blksize = 1 << mp->m_sb.sb_blocklog;
1355
1356         /*
1357          * Punch a hole and prealloc the range. We use hole punch rather than
1358          * unwritten extent conversion for two reasons:
1359          *
1360          * 1.) Hole punch handles partial block zeroing for us.
1361          *
1362          * 2.) If prealloc returns ENOSPC, the file range is still zero-valued
1363          * by virtue of the hole punch.
1364          */
1365         error = xfs_free_file_space(ip, offset, len);
1366         if (error)
1367                 goto out;
1368
1369         error = xfs_alloc_file_space(ip, round_down(offset, blksize),
1370                                      round_up(offset + len, blksize) -
1371                                      round_down(offset, blksize),
1372                                      XFS_BMAPI_PREALLOC);
1373 out:
1374         return error;
1375
1376 }
1377
1378 /*
1379  * @next_fsb will keep track of the extent currently undergoing shift.
1380  * @stop_fsb will keep track of the extent at which we have to stop.
1381  * If we are shifting left, we will start with block (offset + len) and
1382  * shift each extent till last extent.
1383  * If we are shifting right, we will start with last extent inside file space
1384  * and continue until we reach the block corresponding to offset.
1385  */
1386 static int
1387 xfs_shift_file_space(
1388         struct xfs_inode        *ip,
1389         xfs_off_t               offset,
1390         xfs_off_t               len,
1391         enum shift_direction    direction)
1392 {
1393         int                     done = 0;
1394         struct xfs_mount        *mp = ip->i_mount;
1395         struct xfs_trans        *tp;
1396         int                     error;
1397         struct xfs_bmap_free    free_list;
1398         xfs_fsblock_t           first_block;
1399         int                     committed;
1400         xfs_fileoff_t           stop_fsb;
1401         xfs_fileoff_t           next_fsb;
1402         xfs_fileoff_t           shift_fsb;
1403
1404         ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
1405
1406         if (direction == SHIFT_LEFT) {
1407                 next_fsb = XFS_B_TO_FSB(mp, offset + len);
1408                 stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size);
1409         } else {
1410                 /*
1411                  * If right shift, delegate the work of initialization of
1412                  * next_fsb to xfs_bmap_shift_extent as it has ilock held.
1413                  */
1414                 next_fsb = NULLFSBLOCK;
1415                 stop_fsb = XFS_B_TO_FSB(mp, offset);
1416         }
1417
1418         shift_fsb = XFS_B_TO_FSB(mp, len);
1419
1420         /*
1421          * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
1422          * into the accessible region of the file.
1423          */
1424         if (xfs_can_free_eofblocks(ip, true)) {
1425                 error = xfs_free_eofblocks(mp, ip, false);
1426                 if (error)
1427                         return error;
1428         }
1429
1430         /*
1431          * Writeback and invalidate cache for the remainder of the file as we're
1432          * about to shift down every extent from offset to EOF.
1433          */
1434         error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
1435                                              offset, -1);
1436         if (error)
1437                 return error;
1438         error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
1439                                         offset >> PAGE_CACHE_SHIFT, -1);
1440         if (error)
1441                 return error;
1442
1443         /*
1444          * The extent shiting code works on extent granularity. So, if
1445          * stop_fsb is not the starting block of extent, we need to split
1446          * the extent at stop_fsb.
1447          */
1448         if (direction == SHIFT_RIGHT) {
1449                 error = xfs_bmap_split_extent(ip, stop_fsb);
1450                 if (error)
1451                         return error;
1452         }
1453
1454         while (!error && !done) {
1455                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1456                 /*
1457                  * We would need to reserve permanent block for transaction.
1458                  * This will come into picture when after shifting extent into
1459                  * hole we found that adjacent extents can be merged which
1460                  * may lead to freeing of a block during record update.
1461                  */
1462                 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
1463                                 XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
1464                 if (error) {
1465                         xfs_trans_cancel(tp, 0);
1466                         break;
1467                 }
1468
1469                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1470                 error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
1471                                 ip->i_gdquot, ip->i_pdquot,
1472                                 XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
1473                                 XFS_QMOPT_RES_REGBLKS);
1474                 if (error)
1475                         goto out;
1476
1477                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1478
1479                 xfs_bmap_init(&free_list, &first_block);
1480
1481                 /*
1482                  * We are using the write transaction in which max 2 bmbt
1483                  * updates are allowed
1484                  */
1485                 error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb,
1486                                 &done, stop_fsb, &first_block, &free_list,
1487                                 direction, XFS_BMAP_MAX_SHIFT_EXTENTS);
1488                 if (error)
1489                         goto out;
1490
1491                 error = xfs_bmap_finish(&tp, &free_list, &committed);
1492                 if (error)
1493                         goto out;
1494
1495                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1496         }
1497
1498         return error;
1499
1500 out:
1501         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1502         return error;
1503 }
1504
1505 /*
1506  * xfs_collapse_file_space()
1507  *      Collapse the range described by offset and len out of the given file.
1508  *      The data blocks in the range are freed first by calling
1509  *      xfs_free_file_space(); xfs_shift_file_space() is then called with
1510  *      SHIFT_LEFT so that the extent records are shifted to the left to cover
1511  *      the hole. The caller must hold XFS_IOLOCK_EXCL on the inode.
1512  * RETURNS:
1513  *      0 on success
1514  *      the error from xfs_free_file_space() or xfs_shift_file_space();
1515  *      the shift is not attempted if freeing the range fails
1516  */
1517 int
1518 xfs_collapse_file_space(
1519         struct xfs_inode        *ip,
1520         xfs_off_t               offset,
1521         xfs_off_t               len)
1522 {
1523         int error;
1524
1525         ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1526         trace_xfs_collapse_file_space(ip);
1527         /* Free the blocks first; the left shift then closes the hole. */
1528         error = xfs_free_file_space(ip, offset, len);
1529         if (error)
1530                 return error;
1531
1532         return xfs_shift_file_space(ip, offset, len, SHIFT_LEFT);
1533 }
1534
1535 /*
1536  * xfs_insert_file_space()
1537  *      Create a hole in the given file by shifting extents to the right.
1538  *      All of the work is delegated to xfs_shift_file_space() called with
1539  *      SHIFT_RIGHT: dirty data is synced and the page cache invalidated over
1540  *      the region being moved, the extent is split in two at the given offset
1541  *      by xfs_bmap_split_extent(), and the extent records lying in [offset,
1542  *      last allocated extent] are shifted right to make room for the hole.
1543  *      The caller must hold XFS_IOLOCK_EXCL on the inode.
1544  * RETURNS:
1545  *      0 on success, otherwise the error from xfs_shift_file_space()
1546  */
1547 int
1548 xfs_insert_file_space(
1549         struct xfs_inode        *ip,
1550         loff_t                  offset,
1551         loff_t                  len)
1552 {
1553         ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1554         trace_xfs_insert_file_space(ip);
1555
1556         return xfs_shift_file_space(ip, offset, len, SHIFT_RIGHT);
1557 }
1558
1559 /*
1560  * We need to check that the format of the data fork in the temporary inode is
1561  * valid for the target inode before doing the swap. This is not a problem with
1562  * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
1563  * data fork depending on the space the attribute fork is taking so we can get
1564  * invalid formats on the target inode.
1565  *
1566  * E.g. target has space for 7 extents in extent format, temp inode only has
1567  * space for 6.  If we defragment down to 7 extents, then the tmp format is a
1568  * btree, but when swapped it needs to be in extent format. Hence we can't just
1569  * blindly swap data forks on attr2 filesystems.
1570  *
1571  * Note that we check the swap in both directions so that we don't end up with
1572  * a corrupt temporary inode, either.
1573  *
1574  * Note that fixing the way xfs_fsr sets up the attribute fork in the source
1575  * inode will prevent this situation from occurring, so all we do here is
1576  * reject the attempt; basically we are putting the responsibility on userspace
1577  * to get this right. Returns 0 if the swap may go ahead, -EINVAL if not.
1578  */
1579 static int
1580 xfs_swap_extents_check_format(
1581         xfs_inode_t     *ip,    /* target inode */
1582         xfs_inode_t     *tip)   /* tmp inode */
1583 {
1584
1585         /* Should never get a local format */
1586         if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
1587             tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
1588                 return -EINVAL;
1589
1590         /*
1591          * If the target inode has fewer extents than the temporary inode then
1592          * why did userspace call us?
1593          */
1594         if (ip->i_d.di_nextents < tip->i_d.di_nextents)
1595                 return -EINVAL;
1596
1597         /*
1598          * If the target inode is in extent form and the temp inode is in btree
1599          * form then we will end up with the target inode in the wrong format
1600          * as we already know there are fewer extents in the temp inode.
1601          */
1602         if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1603             tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
1604                 return -EINVAL;
1605
1606         /* Temp in extent form: its extent count must fit the target's max */
1607         if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1608             XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
1609                         XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1610                 return -EINVAL;
1611
1612         /* Target in extent form: its extent count must fit the temp's max */
1613         if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1614             XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
1615                         XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1616                 return -EINVAL;
1617
1618         /*
1619          * If we are in a btree format, check that the temp root block will fit
1620          * in the target and that it has enough extents to be in btree format
1621          * in the target.
1622          *
1623          * Note that we have to be careful to allow btree->extent conversions
1624          * (a common defrag case) which will occur when the temp inode is in
1625          * extent format...
1626          */
1627         if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1628                 if (XFS_IFORK_BOFF(ip) &&
1629                     XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
1630                         return -EINVAL;
1631                 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
1632                     XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1633                         return -EINVAL;
1634         }
1635
1636         /* Reciprocal target->temp btree format checks */
1637         if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1638                 if (XFS_IFORK_BOFF(tip) &&
1639                     XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
1640                         return -EINVAL;
1641                 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
1642                     XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1643                         return -EINVAL;
1644         }
1645
1646         return 0;
1647 }
1648 /* Write back and empty ip's page cache; fails if any page stays cached. */
1649 static int
1650 xfs_swap_extent_flush(
1651         struct xfs_inode        *ip)
1652 {
1653         int     error;
1654         /* Write dirty pages out first, then drop everything cached. */
1655         error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
1656         if (error)
1657                 return error;
1658         truncate_pagecache_range(VFS_I(ip), 0, -1);
1659
1660         /* Verify O_DIRECT for ftmp: nothing may remain in the page cache */
1661         if (VFS_I(ip)->i_mapping->nrpages)
1662                 return -EINVAL;
1663         return 0;
1664 }
1665 /* Swap the data forks of ip and tip as requested by sxp; 0 or error code. */
1666 int
1667 xfs_swap_extents(
1668         xfs_inode_t     *ip,    /* target inode */
1669         xfs_inode_t     *tip,   /* tmp inode */
1670         xfs_swapext_t   *sxp)   /* swap range and expected times */
1671 {
1672         xfs_mount_t     *mp = ip->i_mount;
1673         xfs_trans_t     *tp;
1674         xfs_bstat_t     *sbp = &sxp->sx_stat;
1675         xfs_ifork_t     *tempifp, *ifp, *tifp;
1676         int             src_log_flags, target_log_flags;
1677         int             error = 0;
1678         int             aforkblks = 0;
1679         int             taforkblks = 0;
1680         __uint64_t      tmp;
1681         int             lock_flags;
1682         /* Scratch fork used for the three-way struct copy further down. */
1683         tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
1684         if (!tempifp) {
1685                 error = -ENOMEM;
1686                 goto out;
1687         }
1688
1689         /*
1690          * Lock the inodes against other IO, page faults and truncate to
1691          * begin with.  Then we can ensure the inodes are flushed and have no
1692          * page cache safely. Once we have done this we can take the ilocks and
1693          * do the rest of the checks.
1694          */
1695         lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
1696         xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
1697         xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
1698
1699         /* Verify that both files are of the same type (S_IFMT bits) */
1700         if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
1701                 error = -EINVAL;
1702                 goto out_unlock;
1703         }
1704
1705         /* Verify both files are either real-time or non-realtime */
1706         if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
1707                 error = -EINVAL;
1708                 goto out_unlock;
1709         }
1710
1711         error = xfs_swap_extent_flush(ip);
1712         if (error)
1713                 goto out_unlock;
1714         error = xfs_swap_extent_flush(tip);
1715         if (error)
1716                 goto out_unlock;
1717
1718         tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
1719         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
1720         if (error) {
1721                 xfs_trans_cancel(tp, 0);
1722                 goto out_unlock;
1723         }
1724
1725         /*
1726          * Lock and join the inodes to the transaction so that transaction commit
1727          * or cancel will unlock the inodes from this point onwards.
1728          */
1729         xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
1730         lock_flags |= XFS_ILOCK_EXCL;
1731         xfs_trans_ijoin(tp, ip, lock_flags);
1732         xfs_trans_ijoin(tp, tip, lock_flags);
1733
1734
1735         /* Verify all data are being swapped: range must span both files */
1736         if (sxp->sx_offset != 0 ||
1737             sxp->sx_length != ip->i_d.di_size ||
1738             sxp->sx_length != tip->i_d.di_size) {
1739                 error = -EFAULT;
1740                 goto out_trans_cancel;
1741         }
1742
1743         trace_xfs_swap_extent_before(ip, 0);
1744         trace_xfs_swap_extent_before(tip, 1);
1745
1746         /* check inode formats now that data is flushed */
1747         error = xfs_swap_extents_check_format(ip, tip);
1748         if (error) {
1749                 xfs_notice(mp,
1750                     "%s: inode 0x%llx format is incompatible for exchanging.",
1751                                 __func__, ip->i_ino);
1752                 goto out_trans_cancel;
1753         }
1754
1755         /*
1756          * Compare the current change & modify times of the target
1757          * inode with those passed in.  If they differ, we abort
1758          * this swap with -EBUSY.  This is the mechanism used to
1759          * assure the calling process that the file was not
1760          * changed out from under it.
1761          */
1762         if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
1763             (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
1764             (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
1765             (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
1766                 error = -EBUSY;
1767                 goto out_trans_cancel;
1768         }
1769         /*
1770          * Count the attr fork blocks; they stay put when di_nblocks is swapped
1771          */
1772         if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
1773              (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
1774                 error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
1775                 if (error)
1776                         goto out_trans_cancel;
1777         }
1778         if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
1779              (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
1780                 error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
1781                         &taforkblks);
1782                 if (error)
1783                         goto out_trans_cancel;
1784         }
1785
1786         /*
1787          * Before we've swapped the forks, lets set the owners of the forks
1788          * appropriately. We have to do this as we are demand paging the btree
1789          * buffers, and so the validation done on read will expect the owner
1790          * field to be correctly set. Once we change the owners, we can swap the
1791          * inode forks.
1792          *
1793          * Note the trickiness in setting the log flags - we set the owner log
1794          * flag on the opposite inode (i.e. the inode we are setting the new
1795          * owner to be) because once we swap the forks and log that, log
1796          * recovery is going to see the fork as owned by the swapped inode,
1797          * not the pre-swapped inodes.
1798          */
1799         src_log_flags = XFS_ILOG_CORE;
1800         target_log_flags = XFS_ILOG_CORE;
1801         if (ip->i_d.di_version == 3 &&
1802             ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1803                 target_log_flags |= XFS_ILOG_DOWNER;
1804                 error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
1805                                               tip->i_ino, NULL);
1806                 if (error)
1807                         goto out_trans_cancel;
1808         }
1809
1810         if (tip->i_d.di_version == 3 &&
1811             tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1812                 src_log_flags |= XFS_ILOG_DOWNER;
1813                 error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
1814                                               ip->i_ino, NULL);
1815                 if (error)
1816                         goto out_trans_cancel;
1817         }
1818
1819         /*
1820          * Swap the data forks of the inodes, using tempifp as scratch space
1821          */
1822         ifp = &ip->i_df;
1823         tifp = &tip->i_df;
1824         *tempifp = *ifp;        /* struct copy */
1825         *ifp = *tifp;           /* struct copy */
1826         *tifp = *tempifp;       /* struct copy */
1827
1828         /*
1829          * Fix the on-disk inode values; attr fork blocks stay with their inode
1830          */
1831         tmp = (__uint64_t)ip->i_d.di_nblocks;
1832         ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
1833         tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
1834
1835         tmp = (__uint64_t) ip->i_d.di_nextents;
1836         ip->i_d.di_nextents = tip->i_d.di_nextents;
1837         tip->i_d.di_nextents = tmp;
1838
1839         tmp = (__uint64_t) ip->i_d.di_format;
1840         ip->i_d.di_format = tip->i_d.di_format;
1841         tip->i_d.di_format = tmp;
1842
1843         /*
1844          * The extents in the source inode could still contain speculative
1845          * preallocation beyond EOF (e.g. the file is open but not modified
1846          * while defrag is in progress). In that case, we need to copy over the
1847          * number of delalloc blocks the data fork in the source inode is
1848          * tracking beyond EOF so that when the fork is truncated away when the
1849          * temporary inode is unlinked we don't underrun the i_delayed_blks
1850          * counter on that inode.
1851          */
1852         ASSERT(tip->i_delayed_blks == 0);
1853         tip->i_delayed_blks = ip->i_delayed_blks;
1854         ip->i_delayed_blks = 0;
1855
1856         switch (ip->i_d.di_format) {
1857         case XFS_DINODE_FMT_EXTENTS:
1858                 /* If the extents fit in the inode, fix the
1859                  * pointer.  Otherwise it's already NULL or
1860                  * pointing to the extent.
1861                  */
1862                 if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
1863                         ifp->if_u1.if_extents =
1864                                 ifp->if_u2.if_inline_ext;
1865                 }
1866                 src_log_flags |= XFS_ILOG_DEXT;
1867                 break;
1868         case XFS_DINODE_FMT_BTREE:
1869                 ASSERT(ip->i_d.di_version < 3 ||
1870                        (src_log_flags & XFS_ILOG_DOWNER));
1871                 src_log_flags |= XFS_ILOG_DBROOT;
1872                 break;
1873         }
1874
1875         switch (tip->i_d.di_format) {
1876         case XFS_DINODE_FMT_EXTENTS:
1877                 /* If the extents fit in the inode, fix the
1878                  * pointer.  Otherwise it's already NULL or
1879                  * pointing to the extent.
1880                  */
1881                 if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
1882                         tifp->if_u1.if_extents =
1883                                 tifp->if_u2.if_inline_ext;
1884                 }
1885                 target_log_flags |= XFS_ILOG_DEXT;
1886                 break;
1887         case XFS_DINODE_FMT_BTREE:
1888                 target_log_flags |= XFS_ILOG_DBROOT;
1889                 ASSERT(tip->i_d.di_version < 3 ||
1890                        (target_log_flags & XFS_ILOG_DOWNER));
1891                 break;
1892         }
1893
1894         xfs_trans_log_inode(tp, ip,  src_log_flags);
1895         xfs_trans_log_inode(tp, tip, target_log_flags);
1896
1897         /*
1898          * If this is a synchronous mount, make sure that the
1899          * transaction goes to disk before returning to the user.
1900          */
1901         if (mp->m_flags & XFS_MOUNT_WSYNC)
1902                 xfs_trans_set_sync(tp);
1903
1904         error = xfs_trans_commit(tp, 0);
1905
1906         trace_xfs_swap_extent_after(ip, 0);
1907         trace_xfs_swap_extent_after(tip, 1);
1908 out:
1909         kmem_free(tempifp);
1910         return error;
1911         /* Only used before the ILOCK is taken: drops IOLOCK and MMAPLOCK. */
1912 out_unlock:
1913         xfs_iunlock(ip, lock_flags);
1914         xfs_iunlock(tip, lock_flags);
1915         goto out;
1916         /* Inodes are joined to tp here, so the cancel also unlocks them. */
1917 out_trans_cancel:
1918         xfs_trans_cancel(tp, 0);
1919         goto out;
1920 }