These changes are the raw update to linux-4.4.6-rt14. Kernel sources
[kvmfornfv.git] / kernel / fs / btrfs / file.c
index b072e17..0f09526 100644 (file)
@@ -756,8 +756,16 @@ next_slot:
                }
 
                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-               if (key.objectid > ino ||
-                   key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
+
+               if (key.objectid > ino)
+                       break;
+               if (WARN_ON_ONCE(key.objectid < ino) ||
+                   key.type < BTRFS_EXTENT_DATA_KEY) {
+                       ASSERT(del_nr == 0);
+                       path->slots[0]++;
+                       goto next_slot;
+               }
+               if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
                        break;
 
                fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -776,8 +784,8 @@ next_slot:
                                btrfs_file_extent_inline_len(leaf,
                                                     path->slots[0], fi);
                } else {
-                       WARN_ON(1);
-                       extent_end = search_start;
+                       /* can't happen */
+                       BUG();
                }
 
                /*
@@ -847,7 +855,7 @@ next_slot:
                                                disk_bytenr, num_bytes, 0,
                                                root->root_key.objectid,
                                                new_key.objectid,
-                                               start - extent_offset, 1);
+                                               start - extent_offset);
                                BUG_ON(ret); /* -ENOMEM */
                        }
                        key.offset = start;
@@ -925,7 +933,7 @@ delete_extent_item:
                                                disk_bytenr, num_bytes, 0,
                                                root->root_key.objectid,
                                                key.objectid, key.offset -
-                                               extent_offset, 0);
+                                               extent_offset);
                                BUG_ON(ret); /* -ENOMEM */
                                inode_sub_bytes(inode,
                                                extent_end - key.offset);
@@ -1204,7 +1212,7 @@ again:
 
                ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
                                           root->root_key.objectid,
-                                          ino, orig_offset, 1);
+                                          ino, orig_offset);
                BUG_ON(ret); /* -ENOMEM */
 
                if (split == start) {
@@ -1231,7 +1239,7 @@ again:
                del_nr++;
                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                                        0, root->root_key.objectid,
-                                       ino, orig_offset, 0);
+                                       ino, orig_offset);
                BUG_ON(ret); /* -ENOMEM */
        }
        other_start = 0;
@@ -1248,7 +1256,7 @@ again:
                del_nr++;
                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                                        0, root->root_key.objectid,
-                                       ino, orig_offset, 0);
+                                       ino, orig_offset);
                BUG_ON(ret); /* -ENOMEM */
        }
        if (del_nr == 0) {
@@ -1283,7 +1291,8 @@ out:
  * on error we return an unlocked page and the error value
  * on success we return a locked page and 0
  */
-static int prepare_uptodate_page(struct page *page, u64 pos,
+static int prepare_uptodate_page(struct inode *inode,
+                                struct page *page, u64 pos,
                                 bool force_uptodate)
 {
        int ret = 0;
@@ -1298,6 +1307,10 @@ static int prepare_uptodate_page(struct page *page, u64 pos,
                        unlock_page(page);
                        return -EIO;
                }
+               if (page->mapping != inode->i_mapping) {
+                       unlock_page(page);
+                       return -EAGAIN;
+               }
        }
        return 0;
 }
@@ -1316,6 +1329,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
        int faili;
 
        for (i = 0; i < num_pages; i++) {
+again:
                pages[i] = find_or_create_page(inode->i_mapping, index + i,
                                               mask | __GFP_WRITE);
                if (!pages[i]) {
@@ -1325,13 +1339,17 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
                }
 
                if (i == 0)
-                       err = prepare_uptodate_page(pages[i], pos,
+                       err = prepare_uptodate_page(inode, pages[i], pos,
                                                    force_uptodate);
-               if (i == num_pages - 1)
-                       err = prepare_uptodate_page(pages[i],
+               if (!err && i == num_pages - 1)
+                       err = prepare_uptodate_page(inode, pages[i],
                                                    pos + write_bytes, false);
                if (err) {
                        page_cache_release(pages[i]);
+                       if (err == -EAGAIN) {
+                               err = 0;
+                               goto again;
+                       }
                        faili = i - 1;
                        goto fail;
                }
@@ -1469,7 +1487,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
        u64 release_bytes = 0;
        u64 lockstart;
        u64 lockend;
-       unsigned long first_index;
        size_t num_written = 0;
        int nrptrs;
        int ret = 0;
@@ -1485,8 +1502,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
        if (!pages)
                return -ENOMEM;
 
-       first_index = pos >> PAGE_CACHE_SHIFT;
-
        while (iov_iter_count(i) > 0) {
                size_t offset = pos & (PAGE_CACHE_SIZE - 1);
                size_t write_bytes = min(iov_iter_count(i),
@@ -1510,12 +1525,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                }
 
                reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
-               ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
-               if (ret == -ENOSPC &&
-                   (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
-                                             BTRFS_INODE_PREALLOC))) {
+
+               if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+                                            BTRFS_INODE_PREALLOC)) {
                        ret = check_can_nocow(inode, pos, &write_bytes);
+                       if (ret < 0)
+                               break;
                        if (ret > 0) {
+                               /*
+                                * For the nodatacow case, there is no need to
+                                * reserve data space.
+                                */
                                only_release_metadata = true;
                                /*
                                 * our prealloc extent may be smaller than
@@ -1524,20 +1544,19 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                                num_pages = DIV_ROUND_UP(write_bytes + offset,
                                                         PAGE_CACHE_SIZE);
                                reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
-                               ret = 0;
-                       } else {
-                               ret = -ENOSPC;
+                               goto reserve_metadata;
                        }
                }
-
-               if (ret)
+               ret = btrfs_check_data_free_space(inode, pos, write_bytes);
+               if (ret < 0)
                        break;
 
+reserve_metadata:
                ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
                if (ret) {
                        if (!only_release_metadata)
-                               btrfs_free_reserved_data_space(inode,
-                                                              reserve_bytes);
+                               btrfs_free_reserved_data_space(inode, pos,
+                                                              write_bytes);
                        else
                                btrfs_end_write_no_snapshoting(root);
                        break;
@@ -1603,12 +1622,17 @@ again:
                                BTRFS_I(inode)->outstanding_extents++;
                                spin_unlock(&BTRFS_I(inode)->lock);
                        }
-                       if (only_release_metadata)
+                       if (only_release_metadata) {
                                btrfs_delalloc_release_metadata(inode,
                                                                release_bytes);
-                       else
-                               btrfs_delalloc_release_space(inode,
+                       } else {
+                               u64 __pos;
+
+                               __pos = round_down(pos, root->sectorsize) +
+                                       (dirty_pages << PAGE_CACHE_SHIFT);
+                               btrfs_delalloc_release_space(inode, __pos,
                                                             release_bytes);
+                       }
                }
 
                release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
@@ -1660,7 +1684,7 @@ again:
                        btrfs_end_write_no_snapshoting(root);
                        btrfs_delalloc_release_metadata(inode, release_bytes);
                } else {
-                       btrfs_delalloc_release_space(inode, release_bytes);
+                       btrfs_delalloc_release_space(inode, pos, release_bytes);
                }
        }
 
@@ -1748,7 +1772,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
        }
 
        current->backing_dev_info = inode_to_bdi(inode);
-       err = file_remove_suid(file);
+       err = file_remove_privs(file);
        if (err) {
                mutex_unlock(&inode->i_mutex);
                goto out;
@@ -1868,7 +1892,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        struct btrfs_log_ctx ctx;
        int ret = 0;
        bool full_sync = 0;
+       u64 len;
 
+       /*
+        * The range length can be represented by u64; the typecasts are needed
+        * to avoid signed overflow if it is [0, LLONG_MAX], e.g. from fsync().
+        */
+       len = (u64)end - (u64)start + 1;
        trace_btrfs_sync_file(file, datasync);
 
        /*
@@ -1896,7 +1926,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
                 * all extents are persisted and the respective file extent
                 * items are in the fs/subvol btree.
                 */
-               ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
+               ret = btrfs_wait_ordered_range(inode, start, len);
        } else {
                /*
                 * Start any new ordered operations before starting to log the
@@ -1968,8 +1998,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
         */
        smp_mb();
        if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
-           (full_sync && BTRFS_I(inode)->last_trans <=
-            root->fs_info->last_trans_committed)) {
+           (BTRFS_I(inode)->last_trans <=
+            root->fs_info->last_trans_committed &&
+            (full_sync ||
+             !btrfs_have_ordered_extents_in_range(inode, start, len)))) {
                /*
                 * We'v had everything committed since the last time we were
                 * modified so clear this flag in case it was set for whatever
@@ -2054,8 +2086,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
                        }
                }
                if (!full_sync) {
-                       ret = btrfs_wait_ordered_range(inode, start,
-                                                      end - start + 1);
+                       ret = btrfs_wait_ordered_range(inode, start, len);
                        if (ret) {
                                btrfs_end_transaction(trans, root);
                                goto out;
@@ -2263,7 +2294,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
        u64 drop_end;
        int ret = 0;
        int err = 0;
-       int rsv_count;
+       unsigned int rsv_count;
        bool same_page;
        bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
        u64 ino_size;
@@ -2484,6 +2515,19 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
        }
 
        trans->block_rsv = &root->fs_info->trans_block_rsv;
+       /*
+        * If we are using the NO_HOLES feature we might already have had a
+        * hole that overlaps a part of the region [lockstart, lockend] and
+        * ends at (or beyond) lockend. Since we have no file extent items to
+        * represent holes, drop_end can be less than lockend and so we must
+        * make sure we have an extent map representing the existing hole (the
+        * call to __btrfs_drop_extents() might have dropped the existing extent
+        * map representing the existing hole), otherwise the fast fsync path
+        * will not record the existence of the hole region
+        * [existing_hole_start, lockend].
+        */
+       if (drop_end <= lockend)
+               drop_end = lockend + 1;
        /*
         * Don't insert file hole extent item if it's for a range beyond eof
         * (because it's useless) or if it represents a 0 bytes range (when
@@ -2538,17 +2582,61 @@ out_only_mutex:
        return err;
 }
 
+/* Helper structure to record which range is already reserved */
+struct falloc_range {
+       struct list_head list;
+       u64 start;
+       u64 len;
+};
+
+/*
+ * Helper function to add falloc range
+ *
+ * Caller should have locked the larger extent range containing
+ * [start, start + len)
+ */
+static int add_falloc_range(struct list_head *head, u64 start, u64 len)
+{
+       struct falloc_range *prev = NULL;
+       struct falloc_range *range = NULL;
+
+       if (list_empty(head))
+               goto insert;
+
+       /*
+        * As fallocate iterates in bytenr order, we only need to check
+        * the last range.
+        */
+       prev = list_entry(head->prev, struct falloc_range, list);
+       if (prev->start + prev->len == start) {
+               prev->len += len;
+               return 0;
+       }
+insert:
+       range = kmalloc(sizeof(*range), GFP_NOFS);
+       if (!range)
+               return -ENOMEM;
+       range->start = start;
+       range->len = len;
+       list_add_tail(&range->list, head);
+       return 0;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
                            loff_t offset, loff_t len)
 {
        struct inode *inode = file_inode(file);
        struct extent_state *cached_state = NULL;
+       struct falloc_range *range;
+       struct falloc_range *tmp;
+       struct list_head reserve_list;
        u64 cur_offset;
        u64 last_byte;
        u64 alloc_start;
        u64 alloc_end;
        u64 alloc_hint = 0;
        u64 locked_end;
+       u64 actual_end = 0;
        struct extent_map *em;
        int blocksize = BTRFS_I(inode)->root->sectorsize;
        int ret;
@@ -2564,11 +2652,12 @@ static long btrfs_fallocate(struct file *file, int mode,
                return btrfs_punch_hole(inode, offset, len);
 
        /*
-        * Make sure we have enough space before we do the
-        * allocation.
+        * Only trigger disk allocation, don't trigger qgroup reserve
+        *
+        * For qgroup space, it will be checked later.
         */
-       ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start);
-       if (ret)
+       ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start);
+       if (ret < 0)
                return ret;
 
        mutex_lock(&inode->i_mutex);
@@ -2576,12 +2665,19 @@ static long btrfs_fallocate(struct file *file, int mode,
        if (ret)
                goto out;
 
+       /*
+        * TODO: Move these two operations after we have checked
+        * accurate reserved space, or fallocate can still fail but
+        * with page truncated or size expanded.
+        *
+        * But that's a minor problem and won't do much harm BTW.
+        */
        if (alloc_start > inode->i_size) {
                ret = btrfs_cont_expand(inode, i_size_read(inode),
                                        alloc_start);
                if (ret)
                        goto out;
-       } else {
+       } else if (offset + len > inode->i_size) {
                /*
                 * If we are fallocating from the end of the file onward we
                 * need to zero out the end of the page if i_size lands in the
@@ -2634,10 +2730,10 @@ static long btrfs_fallocate(struct file *file, int mode,
                }
        }
 
+       /* First, check if we exceed the qgroup limit */
+       INIT_LIST_HEAD(&reserve_list);
        cur_offset = alloc_start;
        while (1) {
-               u64 actual_end;
-
                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
                                      alloc_end - cur_offset, 0);
                if (IS_ERR_OR_NULL(em)) {
@@ -2650,57 +2746,82 @@ static long btrfs_fallocate(struct file *file, int mode,
                last_byte = min(extent_map_end(em), alloc_end);
                actual_end = min_t(u64, extent_map_end(em), offset + len);
                last_byte = ALIGN(last_byte, blocksize);
-
                if (em->block_start == EXTENT_MAP_HOLE ||
                    (cur_offset >= inode->i_size &&
                     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-                       ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
-                                                       last_byte - cur_offset,
-                                                       1 << inode->i_blkbits,
-                                                       offset + len,
-                                                       &alloc_hint);
-               } else if (actual_end > inode->i_size &&
-                          !(mode & FALLOC_FL_KEEP_SIZE)) {
-                       struct btrfs_trans_handle *trans;
-                       struct btrfs_root *root = BTRFS_I(inode)->root;
-
-                       /*
-                        * We didn't need to allocate any more space, but we
-                        * still extended the size of the file so we need to
-                        * update i_size and the inode item.
-                        */
-                       trans = btrfs_start_transaction(root, 1);
-                       if (IS_ERR(trans)) {
-                               ret = PTR_ERR(trans);
-                       } else {
-                               inode->i_ctime = CURRENT_TIME;
-                               i_size_write(inode, actual_end);
-                               btrfs_ordered_update_i_size(inode, actual_end,
-                                                           NULL);
-                               ret = btrfs_update_inode(trans, root, inode);
-                               if (ret)
-                                       btrfs_end_transaction(trans, root);
-                               else
-                                       ret = btrfs_end_transaction(trans,
-                                                                   root);
+                       ret = add_falloc_range(&reserve_list, cur_offset,
+                                              last_byte - cur_offset);
+                       if (ret < 0) {
+                               free_extent_map(em);
+                               break;
                        }
+                       ret = btrfs_qgroup_reserve_data(inode, cur_offset,
+                                       last_byte - cur_offset);
+                       if (ret < 0)
+                               break;
                }
                free_extent_map(em);
-               if (ret < 0)
-                       break;
-
                cur_offset = last_byte;
-               if (cur_offset >= alloc_end) {
-                       ret = 0;
+               if (cur_offset >= alloc_end)
                        break;
+       }
+
+       /*
+        * If ret is still 0, it means we're OK to fallocate.
+        * Otherwise, just clean up the list and exit.
+        */
+       list_for_each_entry_safe(range, tmp, &reserve_list, list) {
+               if (!ret)
+                       ret = btrfs_prealloc_file_range(inode, mode,
+                                       range->start,
+                                       range->len, 1 << inode->i_blkbits,
+                                       offset + len, &alloc_hint);
+               list_del(&range->list);
+               kfree(range);
+       }
+       if (ret < 0)
+               goto out_unlock;
+
+       if (actual_end > inode->i_size &&
+           !(mode & FALLOC_FL_KEEP_SIZE)) {
+               struct btrfs_trans_handle *trans;
+               struct btrfs_root *root = BTRFS_I(inode)->root;
+
+               /*
+                * We didn't need to allocate any more space, but we
+                * still extended the size of the file so we need to
+                * update i_size and the inode item.
+                */
+               trans = btrfs_start_transaction(root, 1);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+               } else {
+                       inode->i_ctime = CURRENT_TIME;
+                       i_size_write(inode, actual_end);
+                       btrfs_ordered_update_i_size(inode, actual_end, NULL);
+                       ret = btrfs_update_inode(trans, root, inode);
+                       if (ret)
+                               btrfs_end_transaction(trans, root);
+                       else
+                               ret = btrfs_end_transaction(trans, root);
                }
        }
+out_unlock:
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
                             &cached_state, GFP_NOFS);
 out:
+       /*
+        * As we waited on the extent range, the data_rsv_map must be empty
+        * in the range, as any written data range is released from it.
+        * And for a preallocated extent, it will also be released when
+        * its metadata is written.
+        * So this call is used purely as cleanup.
+        */
+       btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
        mutex_unlock(&inode->i_mutex);
        /* Let go of our reservation. */
-       btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+       btrfs_free_reserved_data_space(inode, alloc_start,
+                                      alloc_end - alloc_start);
        return ret;
 }