These changes are the raw update to linux-4.4.6-rt14.

diff --git a/kernel/fs/xfs/xfs_file.c b/kernel/fs/xfs/xfs_file.c
index 3b75912..f5392ab 100644
--- a/kernel/fs/xfs/xfs_file.c
+++ b/kernel/fs/xfs/xfs_file.c
@@ -41,6 +41,7 @@
 #include <linux/dcache.h>
 #include <linux/falloc.h>
 #include <linux/pagevec.h>
+#include <linux/backing-dev.h>
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
@@ -79,14 +80,15 @@ xfs_rw_ilock_demote(
 }
 
 /*
- *     xfs_iozero
+ * xfs_iozero clears the specified range supplied via the page cache (except in
+ * the DAX case). Writes through the page cache will allocate blocks over holes,
+ * though the callers usually map the holes first and avoid them. If a block is
+ * not completely zeroed, then it will be read from disk before being partially
+ * zeroed.
  *
- *     xfs_iozero clears the specified range of buffer supplied,
- *     and marks all the affected blocks as valid and modified.  If
- *     an affected block is not allocated, it will be allocated.  If
- *     an affected block is not completely overwritten, and is not
- *     valid before the operation, it will be read from disk before
- *     being partially zeroed.
+ * In the DAX case, we can just directly write to the underlying pages. This
+ * will not allocate blocks, but will avoid holes and unwritten extents and so
+ * not do unnecessary work.
  */
 int
 xfs_iozero(
@@ -96,7 +98,8 @@ xfs_iozero(
 {
        struct page             *page;
        struct address_space    *mapping;
-       int                     status;
+       int                     status = 0;
+
 
        mapping = VFS_I(ip)->i_mapping;
        do {
@@ -108,20 +111,27 @@ xfs_iozero(
                if (bytes > count)
                        bytes = count;
 
-               status = pagecache_write_begin(NULL, mapping, pos, bytes,
-                                       AOP_FLAG_UNINTERRUPTIBLE,
-                                       &page, &fsdata);
-               if (status)
-                       break;
+               if (IS_DAX(VFS_I(ip))) {
+                       status = dax_zero_page_range(VFS_I(ip), pos, bytes,
+                                                    xfs_get_blocks_direct);
+                       if (status)
+                               break;
+               } else {
+                       status = pagecache_write_begin(NULL, mapping, pos, bytes,
+                                               AOP_FLAG_UNINTERRUPTIBLE,
+                                               &page, &fsdata);
+                       if (status)
+                               break;
 
-               zero_user(page, offset, bytes);
+                       zero_user(page, offset, bytes);
 
-               status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
-                                       page, fsdata);
-               WARN_ON(status <= 0); /* can't return less than zero! */
+                       status = pagecache_write_end(NULL, mapping, pos, bytes,
+                                               bytes, page, fsdata);
+                       WARN_ON(status <= 0); /* can't return less than zero! */
+                       status = 0;
+               }
                pos += bytes;
                count -= bytes;
-               status = 0;
        } while (count);
 
        return status;
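
Assembled, the patched xfs_iozero() dispatches on IS_DAX() once per page-sized
chunk. A condensed sketch of the resulting loop (function prologue omitted,
all names as in the patch):

	do {
		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);	/* byte offset in page */
		unsigned bytes = PAGE_CACHE_SIZE - offset;

		if (bytes > count)
			bytes = count;

		if (IS_DAX(VFS_I(ip))) {
			/* DAX: zero the backing store directly, no page cache */
			status = dax_zero_page_range(VFS_I(ip), pos, bytes,
						     xfs_get_blocks_direct);
			if (status)
				break;
		} else {
			/* buffered: prepare the page, zero it, mark it dirty */
			status = pagecache_write_begin(NULL, mapping, pos, bytes,
						AOP_FLAG_UNINTERRUPTIBLE,
						&page, &fsdata);
			if (status)
				break;
			zero_user(page, offset, bytes);
			status = pagecache_write_end(NULL, mapping, pos, bytes,
						bytes, page, fsdata);
			WARN_ON(status <= 0);	/* returns bytes copied */
			status = 0;
		}
		pos += bytes;
		count -= bytes;
	} while (count);

As the rewritten header comment notes, only the buffered path can allocate
blocks over holes; dax_zero_page_range() writes through existing mappings, so
callers that might hit a hole are expected to map it first.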
@@ -138,7 +148,7 @@ xfs_update_prealloc_flags(
        tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
        error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
        if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                return error;
        }
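
This hunk and the next reflect an API simplification that runs through the
whole series: xfs_trans_cancel() and xfs_trans_commit() no longer take a flags
argument. The resulting transaction idiom in xfs_update_prealloc_flags(),
sketched:

	tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
	error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
	if (error) {
		xfs_trans_cancel(tp);		/* flags argument dropped */
		return error;
	}
	/* ... join the inode, adjust prealloc flags, log the inode core ... */
	return xfs_trans_commit(tp);		/* flags argument dropped here too */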
 
@@ -160,7 +170,7 @@ xfs_update_prealloc_flags(
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        if (flags & XFS_PREALLOC_SYNC)
                xfs_trans_set_sync(tp);
-       return xfs_trans_commit(tp, 0);
+       return xfs_trans_commit(tp);
 }
 
 /*
@@ -232,19 +242,30 @@ xfs_file_fsync(
        }
 
        /*
-        * All metadata updates are logged, which means that we just have
-        * to flush the log up to the latest LSN that touched the inode.
+        * All metadata updates are logged, which means that we just have to
+        * flush the log up to the latest LSN that touched the inode. If we have
+        * concurrent fsync/fdatasync() calls, we need them to all block on the
+        * log force before we clear the ili_fsync_fields field. This ensures
+        * that we don't get a racing sync operation that does not wait for the
+        * metadata to hit the journal before returning. If we race with
+        * clearing the ili_fsync_fields, then all that will happen is the log
+        * force will do nothing as the lsn will already be on disk. We can't
+        * race with setting ili_fsync_fields because that is done under
+        * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
+        * until after the ili_fsync_fields is cleared.
         */
        xfs_ilock(ip, XFS_ILOCK_SHARED);
        if (xfs_ipincount(ip)) {
                if (!datasync ||
-                   (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
+                   (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
                        lsn = ip->i_itemp->ili_last_lsn;
        }
-       xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
-       if (lsn)
+       if (lsn) {
                error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+               ip->i_itemp->ili_fsync_fields = 0;
+       }
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
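
The change of substance in xfs_file_fsync() is ordering: the unlock now sits
below the log force, so ili_fsync_fields is sampled and cleared entirely under
XFS_ILOCK_SHARED, and cleared only after the force completes. In outline:

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_ipincount(ip)) {
		if (!datasync ||
		    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
			lsn = ip->i_itemp->ili_last_lsn;
	}
	if (lsn) {
		error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
		ip->i_itemp->ili_fsync_fields = 0;	/* only after the force */
	}
	xfs_iunlock(ip, XFS_ILOCK_SHARED);		/* moved below the force */

A racing fsync that reads ili_fsync_fields just before it is cleared merely
issues a log force for an LSN that is already on disk, a no-op; it can never
return before the metadata is in the journal.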
 
        /*
         * If we only have a single device, and the log force above was
@@ -277,14 +298,14 @@ xfs_file_read_iter(
        xfs_fsize_t             n;
        loff_t                  pos = iocb->ki_pos;
 
-       XFS_STATS_INC(xs_read_calls);
+       XFS_STATS_INC(mp, xs_read_calls);
 
        if (unlikely(iocb->ki_flags & IOCB_DIRECT))
                ioflags |= XFS_IO_ISDIRECT;
        if (file->f_mode & FMODE_NOCMTIME)
                ioflags |= XFS_IO_INVIS;
 
-       if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
+       if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
                xfs_buftarg_t   *target =
                        XFS_IS_REALTIME_INODE(ip) ?
                                mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -307,24 +328,33 @@ xfs_file_read_iter(
                return -EIO;
 
        /*
-        * Locking is a bit tricky here. If we take an exclusive lock
-        * for direct IO, we effectively serialise all new concurrent
-        * read IO to this file and block it behind IO that is currently in
-        * progress because IO in progress holds the IO lock shared. We only
-        * need to hold the lock exclusive to blow away the page cache, so
-        * only take lock exclusively if the page cache needs invalidation.
-        * This allows the normal direct IO case of no page cache pages to
-        * proceeed concurrently without serialisation.
+        * Locking is a bit tricky here. If we take an exclusive lock for direct
+        * IO, we effectively serialise all new concurrent read IO to this file
+        * and block it behind IO that is currently in progress because IO in
+        * progress holds the IO lock shared. We only need to hold the lock
+        * exclusive to blow away the page cache, so only take lock exclusively
+        * if the page cache needs invalidation. This allows the normal direct
+        * IO case of no page cache pages to proceed concurrently without
+        * serialisation.
         */
        xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
        if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
                xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
                xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
+               /*
+                * The generic dio code only flushes the range of the particular
+                * I/O. Because we take an exclusive lock here, this whole
+                * sequence is considerably more expensive for us. This has a
+                * noticeable performance impact for any file with cached pages,
+                * even when outside of the range of the particular I/O.
+                *
+                * Hence, amortize the cost of the lock against a full file
+                * flush and reduce the chances of repeated iolock cycles going
+                * forward.
+                */
                if (inode->i_mapping->nrpages) {
-                       ret = filemap_write_and_wait_range(
-                                                       VFS_I(ip)->i_mapping,
-                                                       pos, pos + size - 1);
+                       ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
                        if (ret) {
                                xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
                                return ret;
@@ -335,9 +365,7 @@ xfs_file_read_iter(
                         * we fail to invalidate a page, but this should never
                         * happen on XFS. Warn if it does fail.
                         */
-                       ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
-                                       pos >> PAGE_CACHE_SHIFT,
-                                       (pos + size - 1) >> PAGE_CACHE_SHIFT);
+                       ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
                        WARN_ON_ONCE(ret);
                        ret = 0;
                }
@@ -348,7 +376,7 @@ xfs_file_read_iter(
 
        ret = generic_file_read_iter(iocb, to);
        if (ret > 0)
-               XFS_STATS_ADD(xs_read_bytes, ret);
+               XFS_STATS_ADD(mp, xs_read_bytes, ret);
 
        xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
        return ret;
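
Assembled, the direct-read path now writes back and invalidates the entire
mapping rather than just the byte range of the I/O, trading one bigger flush
for fewer exclusive-lock cycles. A sketch (the demote back to a shared lock
happens after this block, outside the quoted hunks):

	if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
		/* retake the iolock exclusive so no new pages can appear */
		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);

		if (inode->i_mapping->nrpages) {	/* recheck under excl lock */
			ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
			if (ret) {
				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
				return ret;
			}
			ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
			WARN_ON_ONCE(ret);	/* should never fail on XFS */
			ret = 0;
		}
	}

Note also the mechanical change threaded through this and later hunks: the
XFS_STATS_INC()/XFS_STATS_ADD() macros gain a struct xfs_mount argument
because the statistics are now kept per mount rather than globally.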
@@ -366,7 +394,7 @@ xfs_file_splice_read(
        int                     ioflags = 0;
        ssize_t                 ret;
 
-       XFS_STATS_INC(xs_read_calls);
+       XFS_STATS_INC(ip->i_mount, xs_read_calls);
 
        if (infilp->f_mode & FMODE_NOCMTIME)
                ioflags |= XFS_IO_INVIS;
@@ -378,9 +406,13 @@ xfs_file_splice_read(
 
        trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
 
-       ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+       /* for dax, we need to avoid the page cache */
+       if (IS_DAX(VFS_I(ip)))
+               ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
+       else
+               ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
        if (ret > 0)
-               XFS_STATS_ADD(xs_read_bytes, ret);
+               XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret);
 
        xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
        return ret;
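
The splice dispatch is needed because generic_file_splice_read() splices
page-cache pages straight into the pipe, and a DAX inode has no page-cache
pages to splice; default_file_splice_read() instead performs an ordinary
copying read into pipe buffers, which works regardless of the mapping:

	if (IS_DAX(VFS_I(ip)))
		/* no pagecache pages exist to splice: do a copying read */
		ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
	else
		ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);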
@@ -461,6 +493,8 @@ xfs_zero_eof(
        ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
        ASSERT(offset > isize);
 
+       trace_xfs_zero_eof(ip, isize, offset - isize);
+
        /*
         * First handle zeroing the block on which isize resides.
         *
@@ -553,6 +587,7 @@ xfs_file_aio_write_checks(
        struct xfs_inode        *ip = XFS_I(inode);
        ssize_t                 error = 0;
        size_t                  count = iov_iter_count(from);
+       bool                    drained_dio = false;
 
 restart:
        error = generic_write_checks(iocb, from);
@@ -563,6 +598,13 @@ restart:
        if (error)
                return error;
 
+       /* For changing security info in file_remove_privs() we need i_mutex */
+       if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
+               xfs_rw_iunlock(ip, *iolock);
+               *iolock = XFS_IOLOCK_EXCL;
+               xfs_rw_ilock(ip, *iolock);
+               goto restart;
+       }
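
The upgrade above cannot be done atomically: the shared iolock must be dropped
before the exclusive one can be taken, so every check made so far may be stale
and the code has to goto restart and redo them under the new lock. The
pattern, annotated:

	/* For changing security info in file_remove_privs() we need i_mutex */
	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
		xfs_rw_iunlock(ip, *iolock);	/* no atomic shared->excl upgrade */
		*iolock = XFS_IOLOCK_EXCL;
		xfs_rw_ilock(ip, *iolock);
		goto restart;			/* prior checks are now stale */
	}

IS_NOSEC(inode) is true when there are no setuid/setgid bits or security
attributes to clear, so the common case keeps the cheaper shared lock; this
pairs with the file_remove_privs() change at the end of the function below.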
        /*
         * If the offset is beyond the size of the file, we need to zero any
         * blocks that fall between the existing EOF and the start of this
@@ -583,12 +625,13 @@ restart:
                bool    zero = false;
 
                spin_unlock(&ip->i_flags_lock);
-               if (*iolock == XFS_IOLOCK_SHARED) {
-                       xfs_rw_iunlock(ip, *iolock);
-                       *iolock = XFS_IOLOCK_EXCL;
-                       xfs_rw_ilock(ip, *iolock);
-                       iov_iter_reexpand(from, count);
-
+               if (!drained_dio) {
+                       if (*iolock == XFS_IOLOCK_SHARED) {
+                               xfs_rw_iunlock(ip, *iolock);
+                               *iolock = XFS_IOLOCK_EXCL;
+                               xfs_rw_ilock(ip, *iolock);
+                               iov_iter_reexpand(from, count);
+                       }
                        /*
                         * We now have an IO submission barrier in place, but
                         * AIO can do EOF updates during IO completion and hence
@@ -598,6 +641,7 @@ restart:
                         * no-op.
                         */
                        inode_dio_wait(inode);
+                       drained_dio = true;
                        goto restart;
                }
                error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
@@ -623,7 +667,9 @@ restart:
         * setgid bits if the process is not being run by root.  This keeps
         * people from modifying setuid and setgid binaries.
         */
-       return file_remove_suid(file);
+       if (!IS_NOSEC(inode))
+               return file_remove_privs(file);
+       return 0;
 }
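
Taken together, the write-check hunks introduce a bounded restart loop. The
new drained_dio flag guarantees the EOF-zeroing decision is made at most once
against a stable EOF: the first pass that sees the write extending the file
upgrades the lock if necessary, waits out in-flight direct I/O, and restarts;
the second pass (drained_dio == true) goes straight to zeroing. A skeleton of
the control flow (the i_flags_lock serialisation around the EOF check and the
other checks are elided into comments):

	bool drained_dio = false;
restart:
	/* generic_write_checks(), layout breaks, NOSEC lock upgrade ... */
	if (iocb->ki_pos > i_size_read(inode)) {
		if (!drained_dio) {
			if (*iolock == XFS_IOLOCK_SHARED) {
				xfs_rw_iunlock(ip, *iolock);
				*iolock = XFS_IOLOCK_EXCL;
				xfs_rw_ilock(ip, *iolock);
				iov_iter_reexpand(from, count);
			}
			inode_dio_wait(inode);	/* EOF stable once AIO drains */
			drained_dio = true;
			goto restart;		/* EOF may have moved: recheck */
		}
		error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
	}

Without the flag, an AIO completion extending EOF between the wait and the
recheck could bounce the function around this loop indefinitely.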
 
 /*
@@ -672,7 +718,7 @@ xfs_file_dio_aio_write(
                                        mp->m_rtdev_targp : mp->m_ddev_targp;
 
        /* DIO must be aligned to device logical sector size */
-       if ((pos | count) & target->bt_logical_sectormask)
+       if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))
                return -EINVAL;
 
        /* "unaligned" here means not aligned to a filesystem block */
@@ -710,19 +756,19 @@ xfs_file_dio_aio_write(
        pos = iocb->ki_pos;
        end = pos + count - 1;
 
+       /*
+        * See xfs_file_read_iter() for why we do a full-file flush here.
+        */
        if (mapping->nrpages) {
-               ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-                                                  pos, end);
+               ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
                if (ret)
                        goto out;
                /*
-                * Invalidate whole pages. This can return an error if
-                * we fail to invalidate a page, but this should never
-                * happen on XFS. Warn if it does fail.
+                * Invalidate whole pages. This can return an error if we fail
+                * to invalidate a page, but this should never happen on XFS.
+                * Warn if it does fail.
                 */
-               ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
-                                       pos >> PAGE_CACHE_SHIFT,
-                                       end >> PAGE_CACHE_SHIFT);
+               ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
                WARN_ON_ONCE(ret);
                ret = 0;
        }
@@ -758,8 +804,11 @@ xfs_file_dio_aio_write(
 out:
        xfs_rw_iunlock(ip, iolock);
 
-       /* No fallback to buffered IO on errors for XFS. */
-       ASSERT(ret < 0 || ret == count);
+       /*
+        * No fallback to buffered IO on errors for XFS. DAX can result in
+        * partial writes, but direct IO will either complete fully or fail.
+        */
+       ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
        return ret;
 }
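
Two DAX relaxations meet in xfs_file_dio_aio_write(). First, the
sector-alignment check is skipped for DAX, since DAX I/O is byte-addressable
and never goes through the block device:

	/* DIO must be aligned to device logical sector size (unless DAX) */
	if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))
		return -EINVAL;

Second, the completion assertion is widened: block-based direct IO still
either completes fully or fails, but a DAX write can legitimately return a
short count, hence the IS_DAX(VFS_I(ip)) escape in the ASSERT above.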
 
@@ -834,7 +883,7 @@ xfs_file_write_iter(
        ssize_t                 ret;
        size_t                  ocount = iov_iter_count(from);
 
-       XFS_STATS_INC(xs_write_calls);
+       XFS_STATS_INC(ip->i_mount, xs_write_calls);
 
        if (ocount == 0)
                return 0;
@@ -842,7 +891,7 @@ xfs_file_write_iter(
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;
 
-       if (unlikely(iocb->ki_flags & IOCB_DIRECT))
+       if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
                ret = xfs_file_dio_aio_write(iocb, from);
        else
                ret = xfs_file_buffered_aio_write(iocb, from);
@@ -850,7 +899,7 @@ xfs_file_write_iter(
        if (ret > 0) {
                ssize_t err;
 
-               XFS_STATS_ADD(xs_write_bytes, ret);
+               XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
 
                /* Handle various SYNC-type writes */
                err = generic_write_sync(file, iocb->ki_pos - ret, ret);
@@ -1063,17 +1112,6 @@ xfs_file_readdir(
        return xfs_readdir(ip, ctx, bufsize);
 }
 
-STATIC int
-xfs_file_mmap(
-       struct file     *filp,
-       struct vm_area_struct *vma)
-{
-       vma->vm_ops = &xfs_file_vm_ops;
-
-       file_accessed(filp);
-       return 0;
-}
-
 /*
  * This type is designed to indicate the type of offset we would like
  * to search from page cache for xfs_seek_hole_data().
@@ -1454,48 +1492,166 @@ xfs_file_llseek(
  * ordering of:
  *
  * mmap_sem (MM)
- *   i_mmap_lock (XFS - truncate serialisation)
- *     page_lock (MM)
- *       i_lock (XFS - extent map serialisation)
+ *   sb_start_pagefault(vfs, freeze)
+ *     i_mmaplock (XFS - truncate serialisation)
+ *       page_lock (MM)
+ *         i_lock (XFS - extent map serialisation)
+ */
+
+/*
+ * mmap()d file has taken write protection fault and is being made writable. We
+ * can set the page state up correctly for a writable page, which means we can
+ * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
+ * mapping.
  */
+STATIC int
+xfs_filemap_page_mkwrite(
+       struct vm_area_struct   *vma,
+       struct vm_fault         *vmf)
+{
+       struct inode            *inode = file_inode(vma->vm_file);
+       int                     ret;
+
+       trace_xfs_filemap_page_mkwrite(XFS_I(inode));
+
+       sb_start_pagefault(inode->i_sb);
+       file_update_time(vma->vm_file);
+       xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+
+       if (IS_DAX(inode)) {
+               ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
+       } else {
+               ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+               ret = block_page_mkwrite_return(ret);
+       }
+
+       xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+       sb_end_pagefault(inode->i_sb);
+
+       return ret;
+}
+
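+ *
+ * Note: xfs_filemap_page_mkwrite() sets the pattern every write-fault handler
+ * in this file now follows, and the bracketing order matches the lock
+ * ordering documented above:
+ *
+ *	sb_start_pagefault(inode->i_sb);	/- freeze protection, outermost
+ *	file_update_time(vma->vm_file);		/- timestamps before page is writable
+ *	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);	/- serialise vs truncate
+ *	... make the page (or pfn) writable ...
+ *	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ *	sb_end_pagefault(inode->i_sb);
+ *
+ * The same bracketing reappears in the pmd and pfn variants below, with the
+ * pmd handler applying the freeze/timestamp half only for write faults.
+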
 STATIC int
 xfs_filemap_fault(
        struct vm_area_struct   *vma,
        struct vm_fault         *vmf)
 {
-       struct xfs_inode        *ip = XFS_I(vma->vm_file->f_mapping->host);
-       int                     error;
+       struct inode            *inode = file_inode(vma->vm_file);
+       int                     ret;
 
-       trace_xfs_filemap_fault(ip);
+       trace_xfs_filemap_fault(XFS_I(inode));
 
-       xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-       error = filemap_fault(vma, vmf);
-       xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+       /* DAX can shortcut the normal fault path on write faults! */
+       if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode))
+               return xfs_filemap_page_mkwrite(vma, vmf);
 
-       return error;
+       xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+       if (IS_DAX(inode)) {
+               /*
+                * we do not want to trigger unwritten extent conversion on read
+                * faults - that is unnecessary overhead and would also require
+                * changes to xfs_get_blocks_direct() to map unwritten extent
+                * ioend for conversion on read-only mappings.
+                */
+               ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
+       } else
+               ret = filemap_fault(vma, vmf);
+       xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+
+       return ret;
 }
 
 /*
- * mmap()d file has taken write protection fault and is being made writable. We
- * can set the page state up correctly for a writable page, which means we can
- * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
- * mapping.
+ * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
+ * both read and write faults. There is no ->pmd_mkwrite callout for huge
+ * pages, so we have a single function here to handle both cases. @flags
+ * carries the information on the type of fault occurring.
  */
 STATIC int
-xfs_filemap_page_mkwrite(
+xfs_filemap_pmd_fault(
+       struct vm_area_struct   *vma,
+       unsigned long           addr,
+       pmd_t                   *pmd,
+       unsigned int            flags)
+{
+       struct inode            *inode = file_inode(vma->vm_file);
+       struct xfs_inode        *ip = XFS_I(inode);
+       int                     ret;
+
+       if (!IS_DAX(inode))
+               return VM_FAULT_FALLBACK;
+
+       trace_xfs_filemap_pmd_fault(ip);
+
+       if (flags & FAULT_FLAG_WRITE) {
+               sb_start_pagefault(inode->i_sb);
+               file_update_time(vma->vm_file);
+       }
+
+       xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+       ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
+                             NULL);
+       xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+
+       if (flags & FAULT_FLAG_WRITE)
+               sb_end_pagefault(inode->i_sb);
+
+       return ret;
+}
+
+/*
+ * pfn_mkwrite was originally intended to ensure we capture time stamp
+ * updates on write faults. In reality, it is needed to serialise against
+ * truncate, similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite()
+ * here and cycle XFS_MMAPLOCK_SHARED to ensure the fault serialisation
+ * barrier is in place.
+ */
+static int
+xfs_filemap_pfn_mkwrite(
        struct vm_area_struct   *vma,
        struct vm_fault         *vmf)
 {
-       struct xfs_inode        *ip = XFS_I(vma->vm_file->f_mapping->host);
-       int                     error;
 
-       trace_xfs_filemap_page_mkwrite(ip);
+       struct inode            *inode = file_inode(vma->vm_file);
+       struct xfs_inode        *ip = XFS_I(inode);
+       int                     ret = VM_FAULT_NOPAGE;
+       loff_t                  size;
+
+       trace_xfs_filemap_pfn_mkwrite(ip);
+
+       sb_start_pagefault(inode->i_sb);
+       file_update_time(vma->vm_file);
 
+       /* check if the faulting page hasn't raced with truncate */
        xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-       error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       if (vmf->pgoff >= size)
+               ret = VM_FAULT_SIGBUS;
        xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+       sb_end_pagefault(inode->i_sb);
+       return ret;
 
-       return error;
+}
+
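
In addition to open-coding dax_pfn_mkwrite()'s freeze and timestamp handling,
the handler adds a truncate check under XFS_MMAPLOCK_SHARED, which is what
provides the serialisation. Since truncate takes the mmap lock exclusive,
i_size cannot change while the shared lock is held, so rounding it up to whole
pages and comparing page indices is race-free. For example, a 10000-byte file
with 4096-byte pages rounds up to size == 3, so a fault at vmf->pgoff >= 3
means truncate won the race:

	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;	/* in pages */
	if (vmf->pgoff >= size)
		ret = VM_FAULT_SIGBUS;	/* page now lies beyond EOF */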
+static const struct vm_operations_struct xfs_file_vm_ops = {
+       .fault          = xfs_filemap_fault,
+       .pmd_fault      = xfs_filemap_pmd_fault,
+       .map_pages      = filemap_map_pages,
+       .page_mkwrite   = xfs_filemap_page_mkwrite,
+       .pfn_mkwrite    = xfs_filemap_pfn_mkwrite,
+};
+
+STATIC int
+xfs_file_mmap(
+       struct file     *filp,
+       struct vm_area_struct *vma)
+{
+       file_accessed(filp);
+       vma->vm_ops = &xfs_file_vm_ops;
+       if (IS_DAX(file_inode(filp)))
+               vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+       return 0;
 }
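
Finally, xfs_file_mmap() moves below the vm_ops table it references and learns
about DAX. The two flags matter for the handlers wired up above (an annotated
restatement of the patched lines):

	vma->vm_ops = &xfs_file_vm_ops;
	if (IS_DAX(file_inode(filp)))
		/*
		 * VM_MIXEDMAP: ptes in this VMA may map raw pfns with no
		 * struct page behind them, which is how DAX mappings work.
		 * VM_HUGEPAGE: mark the VMA as a candidate for the huge-page
		 * fault path wired up via .pmd_fault above.
		 */
		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;

Note that file_accessed() is now called before vm_ops is set, a reordering
from the removed version earlier in the file.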
 
 const struct file_operations xfs_file_operations = {
@@ -1526,9 +1682,3 @@ const struct file_operations xfs_dir_file_operations = {
 #endif
        .fsync          = xfs_dir_fsync,
 };
-
-static const struct vm_operations_struct xfs_file_vm_ops = {
-       .fault          = xfs_filemap_fault,
-       .map_pages      = filemap_map_pages,
-       .page_mkwrite   = xfs_filemap_page_mkwrite,
-};