These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
diff --git a/kernel/fs/hugetlbfs/inode.c b/kernel/fs/hugetlbfs/inode.c
index 87724c1..595ebdb 100644
--- a/kernel/fs/hugetlbfs/inode.c
+++ b/kernel/fs/hugetlbfs/inode.c
@@ -12,6 +12,7 @@
 #include <linux/thread_info.h>
 #include <asm/current.h>
 #include <linux/sched.h>               /* remove ASAP */
+#include <linux/falloc.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/file.h>
@@ -84,6 +85,29 @@ static const match_table_t tokens = {
        {Opt_err,       NULL},
 };
 
+#ifdef CONFIG_NUMA
+static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
+                                       struct inode *inode, pgoff_t index)
+{
+       vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
+                                                       index);
+}
+
+static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
+{
+       mpol_cond_put(vma->vm_policy);
+}
+#else
+static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
+                                       struct inode *inode, pgoff_t index)
+{
+}
+
+static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
+{
+}
+#endif
+
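
These helpers exist so that the fallocate path added later in this patch
can reuse the shared-policy NUMA machinery without a real VMA: the pseudo
VMA's vm_policy is looked up per file index, alloc_huge_page() then honors
it, and the policy reference is dropped afterwards. When CONFIG_NUMA is
off they compile away. The usage pattern, as it appears in
hugetlbfs_fallocate() below:

    hugetlb_set_vma_policy(&pseudo_vma, inode, index); /* policy for this index */
    page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
    hugetlb_drop_vma_policy(&pseudo_vma);              /* put the mpol reference */
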
 static void huge_pagevec_release(struct pagevec *pvec)
 {
        int i;
@@ -130,7 +154,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
                goto out;
 
        ret = 0;
-       hugetlb_prefault_arch_hook(vma->vm_mm);
        if (vma->vm_flags & VM_WRITE && inode->i_size < len)
                inode->i_size = len;
 out:
@@ -294,54 +317,134 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
        return -EINVAL;
 }
 
-static void truncate_huge_page(struct page *page)
+static void remove_huge_page(struct page *page)
 {
        ClearPageDirty(page);
        ClearPageUptodate(page);
        delete_from_page_cache(page);
 }
 
-static void truncate_hugepages(struct inode *inode, loff_t lstart)
+
+/*
+ * remove_inode_hugepages handles two distinct cases: truncation and hole
+ * punch.  There are subtle differences in operation for each case.
+ *
+ * truncation is indicated by end of range being LLONG_MAX
+ *     In this case, we first scan the range and release found pages.
+ *     After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
+ *     maps and global counts.  Page faults can not race with truncation
+ *     in this routine.  hugetlb_no_page() prevents page faults in the
+ *     truncated range.  It checks i_size before allocation, and again after
+ *     with the page table lock for the page held.  The same lock must be
+ *     acquired to unmap a page.
+ * hole punch is indicated if end is not LLONG_MAX
+ *     In the hole punch case we scan the range and release found pages.
+ *     Only when releasing a page is the associated region/reserv map
+ *     deleted.  The region/reserv map for ranges without associated
+ *     pages are not modified.  Page faults can race with hole punch.
+ *     This is indicated if we find a mapped page.
+ * Note: If the passed end of range value is beyond the end of file, but
+ * not LLONG_MAX, this routine still performs a hole punch operation.
+ */
+static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
+                                  loff_t lend)
 {
        struct hstate *h = hstate_inode(inode);
        struct address_space *mapping = &inode->i_data;
        const pgoff_t start = lstart >> huge_page_shift(h);
+       const pgoff_t end = lend >> huge_page_shift(h);
+       struct vm_area_struct pseudo_vma;
        struct pagevec pvec;
        pgoff_t next;
        int i, freed = 0;
+       long lookup_nr = PAGEVEC_SIZE;
+       bool truncate_op = (lend == LLONG_MAX);
 
+       memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+       pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
        pagevec_init(&pvec, 0);
        next = start;
-       while (1) {
-               if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
-                       if (next == start)
-                               break;
-                       next = start;
-                       continue;
-               }
+       while (next < end) {
+               /*
+                * Don't grab more pages than the number left in the range.
+                */
+               if (end - next < lookup_nr)
+                       lookup_nr = end - next;
+
+               /*
+                * When no more pages are found, we are done.
+                */
+               if (!pagevec_lookup(&pvec, mapping, next, lookup_nr))
+                       break;
 
                for (i = 0; i < pagevec_count(&pvec); ++i) {
                        struct page *page = pvec.pages[i];
+                       u32 hash;
+
+                       /*
+                        * The page (index) could be beyond end.  This is
+                        * only possible in the punch hole case as end is
+                        * max page offset in the truncate case.
+                        */
+                       next = page->index;
+                       if (next >= end)
+                               break;
+
+                       hash = hugetlb_fault_mutex_hash(h, current->mm,
+                                                       &pseudo_vma,
+                                                       mapping, next, 0);
+                       mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
                        lock_page(page);
-                       if (page->index > next)
-                               next = page->index;
-                       ++next;
-                       truncate_huge_page(page);
+                       if (likely(!page_mapped(page))) {
+                               bool rsv_on_error = !PagePrivate(page);
+                               /*
+                                * We must free the huge page and remove
+                                * from page cache (remove_huge_page) BEFORE
+                                * removing the region/reserve map
+                                * (hugetlb_unreserve_pages).  In rare out
+                                * of memory conditions, removal of the
+                                * region/reserve map could fail.  Before
+                                * free'ing the page, note PagePrivate which
+                                * is used in case of error.
+                                */
+                               remove_huge_page(page);
+                               freed++;
+                               if (!truncate_op) {
+                                       if (unlikely(hugetlb_unreserve_pages(
+                                                       inode, next,
+                                                       next + 1, 1)))
+                                               hugetlb_fix_reserve_counts(
+                                                       inode, rsv_on_error);
+                               }
+                       } else {
+                               /*
+                                * If page is mapped, it was faulted in after
+                                * being unmapped.  It indicates a race between
+                                * hole punch and page fault.  Do nothing in
+                                * this case.  Getting here in a truncate
+                                * operation is a bug.
+                                */
+                               BUG_ON(truncate_op);
+                       }
+
                        unlock_page(page);
-                       freed++;
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                }
+               ++next;
                huge_pagevec_release(&pvec);
+               cond_resched();
        }
-       BUG_ON(!lstart && mapping->nrpages);
-       hugetlb_unreserve_pages(inode, start, freed);
+
+       if (truncate_op)
+               (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
 }
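
The comment above describes the calling convention: truncation passes
LLONG_MAX as the end of the range, so the reserve map is cleaned up in a
single hugetlb_unreserve_pages() call at the end, while hole punch passes
the rounded end of the hole and drops reserve map entries page by page as
each page is freed. Both call sites appear in this patch; a minimal
sketch of the two cases (variable names echo the real callers):

    /* truncate: remove every page from offset through end of file */
    remove_inode_hugepages(inode, offset, LLONG_MAX);

    /* hole punch: remove only pages backing [hole_start, hole_end) */
    remove_inode_hugepages(inode, hole_start, hole_end);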
 
 static void hugetlbfs_evict_inode(struct inode *inode)
 {
        struct resv_map *resv_map;
 
-       truncate_hugepages(inode, 0);
+       remove_inode_hugepages(inode, 0, LLONG_MAX);
        resv_map = (struct resv_map *)inode->i_mapping->private_data;
        /* root inode doesn't have the resv_map, so we should check it */
        if (resv_map)
@@ -350,12 +453,17 @@ static void hugetlbfs_evict_inode(struct inode *inode)
 }
 
 static inline void
-hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
+hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
 {
        struct vm_area_struct *vma;
 
-       vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
+       /*
+        * end == 0 indicates that the entire range after
+        * start should be unmapped.
+        */
+       vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
                unsigned long v_offset;
+               unsigned long v_end;
 
                /*
                 * Can the expression below overflow on 32-bit arches?
@@ -363,13 +471,22 @@ hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
                 * which overlap the truncated area starting at pgoff,
                 * and no vma on a 32-bit arch can span beyond the 4GB.
                 */
-               if (vma->vm_pgoff < pgoff)
-                       v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
+               if (vma->vm_pgoff < start)
+                       v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
                else
                        v_offset = 0;
 
-               unmap_hugepage_range(vma, vma->vm_start + v_offset,
-                                    vma->vm_end, NULL);
+               if (!end)
+                       v_end = vma->vm_end;
+               else {
+                       v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
+                                                       + vma->vm_start;
+                       if (v_end > vma->vm_end)
+                               v_end = vma->vm_end;
+               }
+
+               unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
+                                                                       NULL);
        }
 }
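
A worked example of the arithmetic above, with made-up values: suppose a
VMA maps file pages [10, 20) (vm_pgoff == 10) and the caller passes
start == 12, end == 15. Then:

    v_offset = (12 - 10) << PAGE_SHIFT;                 /* 2 pages into the VMA */
    v_end = ((15 - 10) << PAGE_SHIFT) + vma->vm_start;  /* 5 pages in */
    /* v_end is clamped to vma->vm_end, then the range
     * [vm_start + v_offset, v_end) is unmapped */

With end == 0, the whole tail of the VMA from v_offset onward is unmapped
instead.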
 
@@ -385,12 +502,161 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
        i_size_write(inode, offset);
        i_mmap_lock_write(mapping);
        if (!RB_EMPTY_ROOT(&mapping->i_mmap))
-               hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
+               hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
        i_mmap_unlock_write(mapping);
-       truncate_hugepages(inode, offset);
+       remove_inode_hugepages(inode, offset, LLONG_MAX);
        return 0;
 }
 
+static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+       struct hstate *h = hstate_inode(inode);
+       loff_t hpage_size = huge_page_size(h);
+       loff_t hole_start, hole_end;
+
+       /*
+        * For hole punch round up the beginning offset of the hole and
+        * round down the end.
+        */
+       hole_start = round_up(offset, hpage_size);
+       hole_end = round_down(offset + len, hpage_size);
+
+       if (hole_end > hole_start) {
+               struct address_space *mapping = inode->i_mapping;
+
+               mutex_lock(&inode->i_mutex);
+               i_mmap_lock_write(mapping);
+               if (!RB_EMPTY_ROOT(&mapping->i_mmap))
+                       hugetlb_vmdelete_list(&mapping->i_mmap,
+                                               hole_start >> PAGE_SHIFT,
+                                               hole_end  >> PAGE_SHIFT);
+               i_mmap_unlock_write(mapping);
+               remove_inode_hugepages(inode, hole_start, hole_end);
+               mutex_unlock(&inode->i_mutex);
+       }
+
+       return 0;
+}
+
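The rounding means a hole punch only affects huge pages lying entirely
inside the requested range. For example, with 2MB huge pages and a
request of offset = 3MB, len = 6MB (made-up values):

    hole_start = round_up(0x300000, 0x200000);             /* 4MB */
    hole_end = round_down(0x300000 + 0x600000, 0x200000);  /* 8MB */
    /* only the two full huge pages in [4MB, 8MB) are removed */

Partial huge pages at either edge of the range are left intact, which is
why the function can simply return 0 when hole_end <= hole_start.
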
+static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
+                               loff_t len)
+{
+       struct inode *inode = file_inode(file);
+       struct address_space *mapping = inode->i_mapping;
+       struct hstate *h = hstate_inode(inode);
+       struct vm_area_struct pseudo_vma;
+       struct mm_struct *mm = current->mm;
+       loff_t hpage_size = huge_page_size(h);
+       unsigned long hpage_shift = huge_page_shift(h);
+       pgoff_t start, index, end;
+       int error;
+       u32 hash;
+
+       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+               return -EOPNOTSUPP;
+
+       if (mode & FALLOC_FL_PUNCH_HOLE)
+               return hugetlbfs_punch_hole(inode, offset, len);
+
+       /*
+        * Default preallocate case.
+        * For this range, start is rounded down and end is rounded up
+        * as well as being converted to page offsets.
+        */
+       start = offset >> hpage_shift;
+       end = (offset + len + hpage_size - 1) >> hpage_shift;
+
+       mutex_lock(&inode->i_mutex);
+
+       /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
+       error = inode_newsize_ok(inode, offset + len);
+       if (error)
+               goto out;
+
+       /*
+        * Initialize a pseudo vma as this is required by the huge page
+        * allocation routines.  If NUMA is configured, use page index
+        * as input to create an allocation policy.
+        */
+       memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+       pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
+       pseudo_vma.vm_file = file;
+
+       for (index = start; index < end; index++) {
+               /*
+                * This is supposed to be the vaddr where the page is being
+                * faulted in, but we have no vaddr here.
+                */
+               struct page *page;
+               unsigned long addr;
+               int avoid_reserve = 0;
+
+               cond_resched();
+
+               /*
+                * fallocate(2) manpage permits EINTR; we may have been
+                * interrupted because we are using up too much memory.
+                */
+               if (signal_pending(current)) {
+                       error = -EINTR;
+                       break;
+               }
+
+               /* Set numa allocation policy based on index */
+               hugetlb_set_vma_policy(&pseudo_vma, inode, index);
+
+               /* addr is the offset within the file (zero based) */
+               addr = index * hpage_size;
+
+               /* mutex taken here serializes against fault path and hole punch */
+               hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
+                                               index, addr);
+               mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+               /* See if already present in mapping to avoid alloc/free */
+               page = find_get_page(mapping, index);
+               if (page) {
+                       put_page(page);
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                       hugetlb_drop_vma_policy(&pseudo_vma);
+                       continue;
+               }
+
+               /* Allocate page and add to page cache */
+               page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
+               hugetlb_drop_vma_policy(&pseudo_vma);
+               if (IS_ERR(page)) {
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                       error = PTR_ERR(page);
+                       goto out;
+               }
+               clear_huge_page(page, addr, pages_per_huge_page(h));
+               __SetPageUptodate(page);
+               error = huge_add_to_page_cache(page, mapping, index);
+               if (unlikely(error)) {
+                       put_page(page);
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                       goto out;
+               }
+
+               mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+               /*
+                * put_page due to reference from alloc_huge_page()
+                * unlock_page because locked by add_to_page_cache()
+                */
+               put_page(page);
+               unlock_page(page);
+       }
+
+       if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+               i_size_write(inode, offset + len);
+       inode->i_ctime = CURRENT_TIME;
+out:
+       mutex_unlock(&inode->i_mutex);
+       return error;
+}
+
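
Taken together, this gives hugetlbfs files both preallocation and hole
punch through the standard fallocate(2) interface. Below is a minimal
userspace sketch of what the new file operation enables; the mount point,
file name, and 2MB huge page size are assumptions for the sake of the
example:

    #define _GNU_SOURCE             /* for the glibc fallocate() wrapper */
    #include <fcntl.h>
    #include <linux/falloc.h>       /* FALLOC_FL_PUNCH_HOLE, FALLOC_FL_KEEP_SIZE */
    #include <stdio.h>
    #include <unistd.h>

    #define HPAGE_SIZE (2UL * 1024 * 1024)  /* assumed huge page size */

    int main(void)
    {
            /* hypothetical path; any file on a hugetlbfs mount will do */
            int fd = open("/dev/hugepages/demo", O_CREAT | O_RDWR, 0600);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }

            /* preallocate four huge pages; default mode extends i_size */
            if (fallocate(fd, 0, 0, 4 * HPAGE_SIZE) < 0)
                    perror("fallocate: preallocate");

            /* punch out the second page; PUNCH_HOLE requires KEEP_SIZE */
            if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                          HPAGE_SIZE, HPAGE_SIZE) < 0)
                    perror("fallocate: punch hole");

            close(fd);
            return 0;
    }
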
 static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
        struct inode *inode = d_inode(dentry);
@@ -702,7 +968,8 @@ const struct file_operations hugetlbfs_file_operations = {
        .mmap                   = hugetlbfs_file_mmap,
        .fsync                  = noop_fsync,
        .get_unmapped_area      = hugetlb_get_unmapped_area,
-       .llseek         = default_llseek,
+       .llseek                 = default_llseek,
+       .fallocate              = hugetlbfs_fallocate,
 };
 
 static const struct inode_operations hugetlbfs_dir_inode_operations = {
@@ -1011,6 +1278,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
        inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0);
        if (!inode)
                goto out_dentry;
+       if (creat_flags == HUGETLB_SHMFS_INODE)
+               inode->i_flags |= S_PRIVATE;
 
        file = ERR_PTR(-ENOMEM);
        if (hugetlb_reserve_pages(inode, 0,