These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
diff --git a/kernel/fs/btrfs/ioctl.c b/kernel/fs/btrfs/ioctl.c
index 37d456a..f07d01b 100644
--- a/kernel/fs/btrfs/ioctl.c
+++ b/kernel/fs/btrfs/ioctl.c
@@ -87,7 +87,8 @@ struct btrfs_ioctl_received_subvol_args_32 {
 
 
 static int btrfs_clone(struct inode *src, struct inode *inode,
-                      u64 off, u64 olen, u64 olen_aligned, u64 destoff);
+                      u64 off, u64 olen, u64 olen_aligned, u64 destoff,
+                      int no_time_update);
 
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -553,8 +554,8 @@ static noinline int create_subvol(struct inode *dir,
        key.offset = (u64)-1;
        new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
        if (IS_ERR(new_root)) {
-               btrfs_abort_transaction(trans, root, PTR_ERR(new_root));
                ret = PTR_ERR(new_root);
+               btrfs_abort_transaction(trans, root, ret);
                goto fail;
        }
 
@@ -567,6 +568,10 @@ static noinline int create_subvol(struct inode *dir,
                goto fail;
        }
 
+       mutex_lock(&new_root->objectid_mutex);
+       new_root->highest_objectid = new_dirid;
+       mutex_unlock(&new_root->objectid_mutex);
+
        /*
         * insert the directory item
         */
@@ -1029,6 +1034,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
        struct extent_map *em;
        int ret = 1;
        bool next_mergeable = true;
+       bool prev_mergeable = true;
 
        /*
         * make sure that once we start defragging an extent, we keep on
@@ -1049,13 +1055,16 @@ static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
                goto out;
        }
 
+       if (!*defrag_end)
+               prev_mergeable = false;
+
        next_mergeable = defrag_check_next_extent(inode, em);
        /*
         * we hit a real extent, if it is big or the next extent is not a
         * real extent, don't bother defragging it
         */
        if (!compress && (*last_len == 0 || *last_len >= thresh) &&
-           (em->len >= thresh || !next_mergeable))
+           (em->len >= thresh || (!next_mergeable && !prev_mergeable)))
                ret = 0;
 out:
        /*
@@ -1115,7 +1124,8 @@ static int cluster_pages_for_defrag(struct inode *inode,
        page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
 
        ret = btrfs_delalloc_reserve_space(inode,
-                                          page_cnt << PAGE_CACHE_SHIFT);
+                       start_index << PAGE_CACHE_SHIFT,
+                       page_cnt << PAGE_CACHE_SHIFT);
        if (ret)
                return ret;
        i_done = 0;
@@ -1205,7 +1215,8 @@ again:
                BTRFS_I(inode)->outstanding_extents++;
                spin_unlock(&BTRFS_I(inode)->lock);
                btrfs_delalloc_release_space(inode,
-                                    (page_cnt - i_done) << PAGE_CACHE_SHIFT);
+                               start_index << PAGE_CACHE_SHIFT,
+                               (page_cnt - i_done) << PAGE_CACHE_SHIFT);
        }
 
 
@@ -1230,7 +1241,9 @@ out:
                unlock_page(pages[i]);
                page_cache_release(pages[i]);
        }
-       btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT);
+       btrfs_delalloc_release_space(inode,
+                       start_index << PAGE_CACHE_SHIFT,
+                       page_cnt << PAGE_CACHE_SHIFT);
        return ret;
 
 }
@@ -1318,7 +1331,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
                i = range->start >> PAGE_CACHE_SHIFT;
        }
        if (!max_to_defrag)
-               max_to_defrag = last_index + 1;
+               max_to_defrag = last_index - i + 1;
 
        /*
         * make writeback starts from i, so the defrag range can be
@@ -1337,7 +1350,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
                        break;
 
                if (btrfs_defrag_cancelled(root->fs_info)) {
-                       printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n");
+                       btrfs_debug(root->fs_info, "defrag_file cancelled");
                        ret = -EAGAIN;
                        break;
                }
@@ -1368,7 +1381,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
                        ra_index = max(i, ra_index);
                        btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
                                       cluster);
-                       ra_index += max_cluster;
+                       ra_index += cluster;
                }
 
                mutex_lock(&inode->i_mutex);
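
The defrag hunks above fix the page budget (max_to_defrag is now counted from the start page i rather than from zero) and advance the readahead index by the actual cluster size for this pass instead of max_cluster. For context, a minimal userspace sketch that drives this code path through BTRFS_IOC_DEFRAG_RANGE; the file path is a placeholder and error handling is abbreviated.

/* Minimal sketch: exercise the defrag path patched above via
 * BTRFS_IOC_DEFRAG_RANGE. Assumes <linux/btrfs.h> provides the ioctl
 * and struct definitions; "/mnt/btrfs/file" is a placeholder path. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(void)
{
        struct btrfs_ioctl_defrag_range_args args;
        int fd = open("/mnt/btrfs/file", O_RDWR);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        memset(&args, 0, sizeof(args));
        args.start = 0;                  /* defrag from the beginning... */
        args.len = (__u64)-1;            /* ...through EOF */
        args.extent_thresh = 256 * 1024; /* leave extents >= 256 KiB alone */
        if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &args) < 0)
                perror("BTRFS_IOC_DEFRAG_RANGE");
        close(fd);
        return 0;
}
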
@@ -1574,7 +1587,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
        new_size = div_u64(new_size, root->sectorsize);
        new_size *= root->sectorsize;
 
-       printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n",
+       btrfs_info_in_rcu(root->fs_info, "new size for %s is %llu",
                      rcu_str_deref(device->name), new_size);
 
        if (new_size > old_size) {
@@ -1932,6 +1945,7 @@ static noinline int copy_to_sk(struct btrfs_root *root,
        u64 found_transid;
        struct extent_buffer *leaf;
        struct btrfs_ioctl_search_header sh;
+       struct btrfs_key test;
        unsigned long item_off;
        unsigned long item_len;
        int nritems;
@@ -2015,12 +2029,17 @@ static noinline int copy_to_sk(struct btrfs_root *root,
        }
 advance_key:
        ret = 0;
-       if (key->offset < (u64)-1 && key->offset < sk->max_offset)
+       test.objectid = sk->max_objectid;
+       test.type = sk->max_type;
+       test.offset = sk->max_offset;
+       if (btrfs_comp_cpu_keys(key, &test) >= 0)
+               ret = 1;
+       else if (key->offset < (u64)-1)
                key->offset++;
-       else if (key->type < (u8)-1 && key->type < sk->max_type) {
+       else if (key->type < (u8)-1) {
                key->offset = 0;
                key->type++;
-       } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
+       } else if (key->objectid < (u64)-1) {
                key->offset = 0;
                key->type = 0;
                key->objectid++;
@@ -2070,7 +2089,7 @@ static noinline int search_ioctl(struct inode *inode,
                key.offset = (u64)-1;
                root = btrfs_read_fs_root_no_name(info, &key);
                if (IS_ERR(root)) {
-                       printk(KERN_ERR "BTRFS: could not find root %llu\n",
+                       btrfs_err(info, "could not find root %llu",
                               sk->tree_id);
                        btrfs_free_path(path);
                        return -ENOENT;
@@ -2210,7 +2229,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
        key.offset = (u64)-1;
        root = btrfs_read_fs_root_no_name(info, &key);
        if (IS_ERR(root)) {
-               printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id);
+               btrfs_err(info, "could not find root %llu", tree_id);
                ret = -ENOENT;
                goto out;
        }
@@ -2271,10 +2290,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
 {
         struct btrfs_ioctl_ino_lookup_args *args;
         struct inode *inode;
-        int ret;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
+       int ret = 0;
 
        args = memdup_user(argp, sizeof(*args));
        if (IS_ERR(args))
@@ -2282,13 +2298,28 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
 
        inode = file_inode(file);
 
+       /*
+        * Unprivileged query to obtain the containing subvolume root id. The
+        * path is reset so it's consistent with btrfs_search_path_in_tree.
+        */
        if (args->treeid == 0)
                args->treeid = BTRFS_I(inode)->root->root_key.objectid;
 
+       if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
+               args->name[0] = 0;
+               goto out;
+       }
+
+       if (!capable(CAP_SYS_ADMIN)) {
+               ret = -EPERM;
+               goto out;
+       }
+
        ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
                                        args->treeid, args->objectid,
                                        args->name);
 
+out:
        if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
                ret = -EFAULT;
 
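
The hunks above relax BTRFS_IOC_INO_LOOKUP: when objectid is BTRFS_FIRST_FREE_OBJECTID, the ioctl only reports the containing subvolume's root id with an empty name and no longer demands CAP_SYS_ADMIN; the full path lookup remains privileged. A sketch of the new unprivileged query, with a placeholder path; BTRFS_FIRST_FREE_OBJECTID is defined locally in case the uapi header does not export it.

/* Sketch: ask "which subvolume contains this file?" without
 * CAP_SYS_ADMIN, using the unprivileged case added above. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

#ifndef BTRFS_FIRST_FREE_OBJECTID
#define BTRFS_FIRST_FREE_OBJECTID 256ULL /* first regular inode number */
#endif

int main(void)
{
        struct btrfs_ioctl_ino_lookup_args args;
        int fd = open("/mnt/btrfs/some/file", O_RDONLY); /* placeholder */

        if (fd < 0) {
                perror("open");
                return 1;
        }
        memset(&args, 0, sizeof(args));
        args.treeid = 0;        /* 0 = subvolume containing this fd */
        args.objectid = BTRFS_FIRST_FREE_OBJECTID;
        if (ioctl(fd, BTRFS_IOC_INO_LOOKUP, &args) < 0)
                perror("BTRFS_IOC_INO_LOOKUP");
        else
                printf("subvolume id: %llu\n",
                       (unsigned long long)args.treeid);
        close(fd);
        return 0;
}
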
@@ -2676,7 +2707,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
 {
        struct btrfs_ioctl_fs_info_args *fi_args;
        struct btrfs_device *device;
-       struct btrfs_device *next;
        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
        int ret = 0;
 
@@ -2688,7 +2718,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
        fi_args->num_devices = fs_devices->num_devices;
        memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid));
 
-       list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
+       list_for_each_entry(device, &fs_devices->devices, dev_list) {
                if (device->devid > fi_args->max_id)
                        fi_args->max_id = device->devid;
        }
@@ -2753,38 +2783,68 @@ out:
        return ret;
 }
 
-static struct page *extent_same_get_page(struct inode *inode, u64 off)
+static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
 {
        struct page *page;
-       pgoff_t index;
-       struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
-
-       index = off >> PAGE_CACHE_SHIFT;
 
        page = grab_cache_page(inode->i_mapping, index);
        if (!page)
-               return NULL;
+               return ERR_PTR(-ENOMEM);
 
        if (!PageUptodate(page)) {
-               if (extent_read_full_page_nolock(tree, page, btrfs_get_extent,
-                                                0))
-                       return NULL;
+               int ret;
+
+               ret = btrfs_readpage(NULL, page);
+               if (ret)
+                       return ERR_PTR(ret);
                lock_page(page);
                if (!PageUptodate(page)) {
                        unlock_page(page);
                        page_cache_release(page);
-                       return NULL;
+                       return ERR_PTR(-EIO);
+               }
+               if (page->mapping != inode->i_mapping) {
+                       unlock_page(page);
+                       page_cache_release(page);
+                       return ERR_PTR(-EAGAIN);
                }
        }
-       unlock_page(page);
 
        return page;
 }
 
-static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
+static int gather_extent_pages(struct inode *inode, struct page **pages,
+                              int num_pages, u64 off)
+{
+       int i;
+       pgoff_t index = off >> PAGE_CACHE_SHIFT;
+
+       for (i = 0; i < num_pages; i++) {
+again:
+               pages[i] = extent_same_get_page(inode, index + i);
+               if (IS_ERR(pages[i])) {
+                       int err = PTR_ERR(pages[i]);
+
+                       if (err == -EAGAIN)
+                               goto again;
+                       pages[i] = NULL;
+                       return err;
+               }
+       }
+       return 0;
+}
+
+static int lock_extent_range(struct inode *inode, u64 off, u64 len,
+                            bool retry_range_locking)
 {
-       /* do any pending delalloc/csum calc on src, one way or
-          another, and lock file content */
+       /*
+        * Do any pending delalloc/csum calculations on inode, one way or
+        * another, and lock file content.
+        * The locking order is:
+        *
+        *   1) pages
+        *   2) range in the inode's io tree
+        */
        while (1) {
                struct btrfs_ordered_extent *ordered;
                lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
@@ -2802,56 +2862,140 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
                unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
                if (ordered)
                        btrfs_put_ordered_extent(ordered);
+               if (!retry_range_locking)
+                       return -EAGAIN;
                btrfs_wait_ordered_range(inode, off, len);
        }
+       return 0;
 }
 
-static void btrfs_double_unlock(struct inode *inode1, u64 loff1,
-                               struct inode *inode2, u64 loff2, u64 len)
+static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
 {
-       unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
-       unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
-
        mutex_unlock(&inode1->i_mutex);
        mutex_unlock(&inode2->i_mutex);
 }
 
-static void btrfs_double_lock(struct inode *inode1, u64 loff1,
-                             struct inode *inode2, u64 loff2, u64 len)
+static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
 {
+       if (inode1 < inode2)
+               swap(inode1, inode2);
+
+       mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
+       mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+}
+
+static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
+                                     struct inode *inode2, u64 loff2, u64 len)
+{
+       unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+       unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+}
+
+static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+                                   struct inode *inode2, u64 loff2, u64 len,
+                                   bool retry_range_locking)
+{
+       int ret;
+
        if (inode1 < inode2) {
                swap(inode1, inode2);
                swap(loff1, loff2);
        }
+       ret = lock_extent_range(inode1, loff1, len, retry_range_locking);
+       if (ret)
+               return ret;
+       ret = lock_extent_range(inode2, loff2, len, retry_range_locking);
+       if (ret)
+               unlock_extent(&BTRFS_I(inode1)->io_tree, loff1,
+                             loff1 + len - 1);
+       return ret;
+}
 
-       mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
-       lock_extent_range(inode1, loff1, len);
-       if (inode1 != inode2) {
-               mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
-               lock_extent_range(inode2, loff2, len);
+struct cmp_pages {
+       int             num_pages;
+       struct page     **src_pages;
+       struct page     **dst_pages;
+};
+
+static void btrfs_cmp_data_free(struct cmp_pages *cmp)
+{
+       int i;
+       struct page *pg;
+
+       for (i = 0; i < cmp->num_pages; i++) {
+               pg = cmp->src_pages[i];
+               if (pg) {
+                       unlock_page(pg);
+                       page_cache_release(pg);
+               }
+               pg = cmp->dst_pages[i];
+               if (pg) {
+                       unlock_page(pg);
+                       page_cache_release(pg);
+               }
        }
+       kfree(cmp->src_pages);
+       kfree(cmp->dst_pages);
+}
+
+static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
+                                 struct inode *dst, u64 dst_loff,
+                                 u64 len, struct cmp_pages *cmp)
+{
+       int ret;
+       int num_pages = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
+       struct page **src_pgarr, **dst_pgarr;
+
+       /*
+        * We must gather up all the pages before we initiate our
+        * extent locking. We use an array for the page pointers. Size
+        * of the array is bounded by len, which is in turn bounded by
+        * BTRFS_MAX_DEDUPE_LEN.
+        */
+       src_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS);
+       dst_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS);
+       if (!src_pgarr || !dst_pgarr) {
+               kfree(src_pgarr);
+               kfree(dst_pgarr);
+               return -ENOMEM;
+       }
+       cmp->num_pages = num_pages;
+       cmp->src_pages = src_pgarr;
+       cmp->dst_pages = dst_pgarr;
+
+       ret = gather_extent_pages(src, cmp->src_pages, cmp->num_pages, loff);
+       if (ret)
+               goto out;
+
+       ret = gather_extent_pages(dst, cmp->dst_pages, cmp->num_pages, dst_loff);
+
+out:
+       if (ret)
+               btrfs_cmp_data_free(cmp);
+       return ret;
 }
 
 static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
-                         u64 dst_loff, u64 len)
+                         u64 dst_loff, u64 len, struct cmp_pages *cmp)
 {
        int ret = 0;
+       int i;
        struct page *src_page, *dst_page;
        unsigned int cmp_len = PAGE_CACHE_SIZE;
        void *addr, *dst_addr;
 
+       i = 0;
        while (len) {
                if (len < PAGE_CACHE_SIZE)
                        cmp_len = len;
 
-               src_page = extent_same_get_page(src, loff);
-               if (!src_page)
-                       return -EINVAL;
-               dst_page = extent_same_get_page(dst, dst_loff);
-               if (!dst_page) {
-                       page_cache_release(src_page);
-                       return -EINVAL;
-               }
+               BUG_ON(i >= cmp->num_pages);
+
+               src_page = cmp->src_pages[i];
+               dst_page = cmp->dst_pages[i];
+               ASSERT(PageLocked(src_page));
+               ASSERT(PageLocked(dst_page));
+
                addr = kmap_atomic(src_page);
                dst_addr = kmap_atomic(dst_page);
 
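
btrfs_double_inode_lock() and btrfs_double_extent_lock() above avoid ABBA deadlocks by always taking the two locks in one global order, chosen by comparing the inode pointers. A minimal userspace sketch of that idiom, using pthread mutexes under the assumption that object addresses give a stable total order:

/* Minimal sketch of the address-ordered double-lock pattern used by
 * btrfs_double_inode_lock() above: two threads locking the same pair
 * of mutexes always acquire them in the same order, so neither can
 * hold one while waiting for the other in reverse. Illustrative only. */
#include <pthread.h>

static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a > b) {    /* sort by address to fix one global order */
                pthread_mutex_t *tmp = a;

                a = b;
                b = tmp;
        }
        pthread_mutex_lock(a);
        if (a != b)     /* tolerate being handed the same lock twice */
                pthread_mutex_lock(b);
}

static void double_unlock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);        /* unlock order is irrelevant */
        if (a != b)
                pthread_mutex_unlock(b);
}

int main(void)
{
        pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

        double_lock(&m1, &m2);
        double_unlock(&m1, &m2);
        return 0;
}
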
@@ -2863,26 +3007,30 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
 
                kunmap_atomic(addr);
                kunmap_atomic(dst_addr);
-               page_cache_release(src_page);
-               page_cache_release(dst_page);
 
                if (ret)
                        break;
 
-               loff += cmp_len;
-               dst_loff += cmp_len;
                len -= cmp_len;
+               i++;
        }
 
        return ret;
 }
 
-static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
+static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen,
+                                    u64 olen)
 {
+       u64 len = *plen;
        u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
 
-       if (off + len > inode->i_size || off + len < off)
+       if (off + olen > inode->i_size || off + olen < off)
                return -EINVAL;
+
+       /* if we extend to eof, continue to block boundary */
+       if (off + len == inode->i_size)
+               *plen = len = ALIGN(inode->i_size, bs) - off;
+
        /* Check that we are block aligned - btrfs_clone() requires this */
        if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
                return -EINVAL;
@@ -2890,31 +3038,67 @@ static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
        return 0;
 }
 
-static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
                             struct inode *dst, u64 dst_loff)
 {
        int ret;
+       u64 len = olen;
+       struct cmp_pages cmp;
+       int same_inode = 0;
+       u64 same_lock_start = 0;
+       u64 same_lock_len = 0;
 
-       /*
-        * btrfs_clone() can't handle extents in the same file
-        * yet. Once that works, we can drop this check and replace it
-        * with a check for the same inode, but overlapping extents.
-        */
        if (src == dst)
-               return -EINVAL;
+               same_inode = 1;
 
        if (len == 0)
                return 0;
 
-       btrfs_double_lock(src, loff, dst, dst_loff, len);
+       if (same_inode) {
+               mutex_lock(&src->i_mutex);
 
-       ret = extent_same_check_offsets(src, loff, len);
-       if (ret)
-               goto out_unlock;
+               ret = extent_same_check_offsets(src, loff, &len, olen);
+               if (ret)
+                       goto out_unlock;
 
-       ret = extent_same_check_offsets(dst, dst_loff, len);
-       if (ret)
-               goto out_unlock;
+               /*
+                * Single inode case wants the same checks, except we
+                * don't want our length pushed out past i_size as
+                * comparing that data range makes no sense.
+                *
+                * extent_same_check_offsets() will do this for an
+                * unaligned length at i_size, so catch it here and
+                * reject the request.
+                *
+                * This effectively means we require aligned extents
+                * for the single-inode case, whereas the other cases
+                * allow an unaligned length so long as it ends at
+                * i_size.
+                */
+               if (len != olen) {
+                       ret = -EINVAL;
+                       goto out_unlock;
+               }
+
+               /* Check for overlapping ranges */
+               if (dst_loff + len > loff && dst_loff < loff + len) {
+                       ret = -EINVAL;
+                       goto out_unlock;
+               }
+
+               same_lock_start = min_t(u64, loff, dst_loff);
+               same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start;
+       } else {
+               btrfs_double_inode_lock(src, dst);
+
+               ret = extent_same_check_offsets(src, loff, &len, olen);
+               if (ret)
+                       goto out_unlock;
+
+               ret = extent_same_check_offsets(dst, dst_loff, &len, olen);
+               if (ret)
+                       goto out_unlock;
+       }
 
        /* don't make the dst file partly checksummed */
        if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
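
The same-inode branch above rejects overlapping source and destination ranges with the classic half-open interval test. Restated as a standalone, runnable predicate:

/* Standalone restatement of the overlap check introduced above: two
 * half-open ranges [a, a + len) and [b, b + len) intersect iff each
 * starts before the other ends. */
#include <assert.h>

static int ranges_overlap(unsigned long long a, unsigned long long b,
                          unsigned long long len)
{
        return b + len > a && b < a + len;
}

int main(void)
{
        assert(ranges_overlap(0, 4096, 8192));  /* partial overlap */
        assert(!ranges_overlap(0, 4096, 4096)); /* adjacent, disjoint */
        return 0;
}
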
@@ -2923,12 +3107,64 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
                goto out_unlock;
        }
 
-       ret = btrfs_cmp_data(src, loff, dst, dst_loff, len);
+again:
+       ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp);
+       if (ret)
+               goto out_unlock;
+
+       if (same_inode)
+               ret = lock_extent_range(src, same_lock_start, same_lock_len,
+                                       false);
+       else
+               ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len,
+                                              false);
+       /*
+        * If one of the inodes has dirty pages in the respective range or
+        * ordered extents, we need to flush delalloc and wait for all ordered
+        * extents in the range. We must unlock the pages and the ranges in the
+        * io trees to avoid deadlocks when flushing delalloc (requires locking
+        * pages) and when waiting for ordered extents to complete (they require
+        * range locking).
+        */
+       if (ret == -EAGAIN) {
+               /*
+                * Ranges in the io trees already unlocked. Now unlock all
+                * pages before waiting for all IO to complete.
+                */
+               btrfs_cmp_data_free(&cmp);
+               if (same_inode) {
+                       btrfs_wait_ordered_range(src, same_lock_start,
+                                                same_lock_len);
+               } else {
+                       btrfs_wait_ordered_range(src, loff, len);
+                       btrfs_wait_ordered_range(dst, dst_loff, len);
+               }
+               goto again;
+       }
+       ASSERT(ret == 0);
+       if (WARN_ON(ret)) {
+               /* ranges in the io trees already unlocked */
+               btrfs_cmp_data_free(&cmp);
+               return ret;
+       }
+
+       /* pass original length for comparison so we stay within i_size */
+       ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
        if (ret == 0)
-               ret = btrfs_clone(src, dst, loff, len, len, dst_loff);
+               ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
 
+       if (same_inode)
+               unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
+                             same_lock_start + same_lock_len - 1);
+       else
+               btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
+
+       btrfs_cmp_data_free(&cmp);
 out_unlock:
-       btrfs_double_unlock(src, loff, dst, dst_loff, len);
+       if (same_inode)
+               mutex_unlock(&src->i_mutex);
+       else
+               btrfs_double_inode_unlock(src, dst);
 
        return ret;
 }
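
btrfs_extent_same() is the backend of the BTRFS_IOC_FILE_EXTENT_SAME (dedupe) ioctl; after the rework above it byte-compares the ranges under both page and extent locks and, only if they match, clones without touching mtime/ctime. A hedged userspace sketch (file paths are placeholders, error handling is abbreviated):

/* Sketch: dedupe the first 128 KiB of two files. The kernel itself
 * re-verifies the ranges are identical (btrfs_cmp_data() above) before
 * cloning, so a stale userspace comparison cannot corrupt data. */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(void)
{
        struct btrfs_ioctl_same_args *args;
        struct btrfs_ioctl_same_extent_info *info;
        int src = open("/mnt/btrfs/a", O_RDONLY);       /* placeholder */
        int dst = open("/mnt/btrfs/b", O_RDWR);         /* placeholder */

        if (src < 0 || dst < 0) {
                perror("open");
                return 1;
        }
        args = calloc(1, sizeof(*args) + sizeof(*info));
        if (!args)
                return 1;
        args->logical_offset = 0;
        args->length = 128 * 1024;
        args->dest_count = 1;
        info = &args->info[0];
        info->fd = dst;
        info->logical_offset = 0;
        if (ioctl(src, BTRFS_IOC_FILE_EXTENT_SAME, args) < 0)
                perror("BTRFS_IOC_FILE_EXTENT_SAME");
        else if (info->status == BTRFS_SAME_DATA_DIFFERS)
                fprintf(stderr, "ranges differ, nothing deduped\n");
        else if (info->status < 0)
                fprintf(stderr, "dedupe failed: %d\n", info->status);
        else
                printf("deduped %llu bytes\n",
                       (unsigned long long)info->bytes_deduped);
        free(args);
        close(src);
        close(dst);
        return 0;
}
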
@@ -3043,52 +3279,19 @@ out:
        return ret;
 }
 
-/* Helper to check and see if this root currently has a ref on the given disk
- * bytenr.  If it does then we need to update the quota for this root.  This
- * doesn't do anything if quotas aren't enabled.
- */
-static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                    u64 disko)
-{
-       struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
-       struct ulist *roots;
-       struct ulist_iterator uiter;
-       struct ulist_node *root_node = NULL;
-       int ret;
-
-       if (!root->fs_info->quota_enabled)
-               return 1;
-
-       btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
-       ret = btrfs_find_all_roots(trans, root->fs_info, disko,
-                                  tree_mod_seq_elem.seq, &roots);
-       if (ret < 0)
-               goto out;
-       ret = 0;
-       ULIST_ITER_INIT(&uiter);
-       while ((root_node = ulist_next(roots, &uiter))) {
-               if (root_node->val == root->objectid) {
-                       ret = 1;
-                       break;
-               }
-       }
-       ulist_free(roots);
-out:
-       btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
-       return ret;
-}
-
 static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
                                     struct inode *inode,
                                     u64 endoff,
                                     const u64 destoff,
-                                    const u64 olen)
+                                    const u64 olen,
+                                    int no_time_update)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret;
 
        inode_inc_iversion(inode);
-       inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+       if (!no_time_update)
+               inode->i_mtime = inode->i_ctime = CURRENT_TIME;
        /*
         * We round up to the block size at eof when determining which
         * extents to clone above, but shouldn't round up the file size.
@@ -3166,6 +3369,150 @@ static void clone_update_extent_map(struct inode *inode,
                        &BTRFS_I(inode)->runtime_flags);
 }
 
+/*
+ * Make sure we do not end up inserting an inline extent into a file that has
+ * already other (non-inline) extents. If a file has an inline extent it can
+ * not have any other extents and the (single) inline extent must start at the
+ * file offset 0. Failing to respect these rules will lead to file corruption,
+ * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc.
+ *
+ * We can have extents that have been already written to disk or we can have
+ * dirty ranges still in delalloc, in which case the extent maps and items are
+ * created only when we run delalloc, and the delalloc ranges might fall outside
+ * the range we are currently locking in the inode's io tree. So we check the
+ * inode's i_size because of that (i_size updates are done while holding the
+ * i_mutex, which we are holding here).
+ * We also check to see if the inode has a size not greater than "datal" but has
+ * extents beyond it, due to a fallocate with FALLOC_FL_KEEP_SIZE (and we are
+ * protected against such concurrent fallocate calls by the i_mutex).
+ *
+ * If the file has no extents but a size greater than datal, do not allow the
+ * copy because we would need to turn the inline extent into a non-inline one
+ * (even with NO_HOLES enabled). If we find our destination inode only has one
+ * inline extent, just overwrite it with the source inline extent if its size
+ * is less than the source extent's size, or we could copy the source inline
+ * extent's data into the destination inode's inline extent if the latter is
+ * greater than the former.
+ */
+static int clone_copy_inline_extent(struct inode *src,
+                                   struct inode *dst,
+                                   struct btrfs_trans_handle *trans,
+                                   struct btrfs_path *path,
+                                   struct btrfs_key *new_key,
+                                   const u64 drop_start,
+                                   const u64 datal,
+                                   const u64 skip,
+                                   const u64 size,
+                                   char *inline_data)
+{
+       struct btrfs_root *root = BTRFS_I(dst)->root;
+       const u64 aligned_end = ALIGN(new_key->offset + datal,
+                                     root->sectorsize);
+       int ret;
+       struct btrfs_key key;
+
+       if (new_key->offset > 0)
+               return -EOPNOTSUPP;
+
+       key.objectid = btrfs_ino(dst);
+       key.type = BTRFS_EXTENT_DATA_KEY;
+       key.offset = 0;
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0) {
+               return ret;
+       } else if (ret > 0) {
+               if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0)
+                               return ret;
+                       else if (ret > 0)
+                               goto copy_inline_extent;
+               }
+               btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+               if (key.objectid == btrfs_ino(dst) &&
+                   key.type == BTRFS_EXTENT_DATA_KEY) {
+                       ASSERT(key.offset > 0);
+                       return -EOPNOTSUPP;
+               }
+       } else if (i_size_read(dst) <= datal) {
+               struct btrfs_file_extent_item *ei;
+               u64 ext_len;
+
+               /*
+                * If the file size is <= datal, make sure there are no other
+                * extents following (can happen due to a fallocate call with
+                * the flag FALLOC_FL_KEEP_SIZE).
+                */
+               ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                                   struct btrfs_file_extent_item);
+               /*
+                * If it's an inline extent, it can not have other extents
+                * following it.
+                */
+               if (btrfs_file_extent_type(path->nodes[0], ei) ==
+                   BTRFS_FILE_EXTENT_INLINE)
+                       goto copy_inline_extent;
+
+               ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
+               if (ext_len > aligned_end)
+                       return -EOPNOTSUPP;
+
+               ret = btrfs_next_item(root, path);
+               if (ret < 0) {
+                       return ret;
+               } else if (ret == 0) {
+                       btrfs_item_key_to_cpu(path->nodes[0], &key,
+                                             path->slots[0]);
+                       if (key.objectid == btrfs_ino(dst) &&
+                           key.type == BTRFS_EXTENT_DATA_KEY)
+                               return -EOPNOTSUPP;
+               }
+       }
+
+copy_inline_extent:
+       /*
+        * We have no extent items, or we have an extent at offset 0 which may
+        * or may not be inlined. All these cases are dealt with the same way.
+        */
+       if (i_size_read(dst) > datal) {
+               /*
+                * If the destination inode has an inline extent...
+                * This would require copying the data from the source inline
+                * extent into the beginning of the destination's inline extent.
+                * But this is really complex, both extents can be compressed
+                * or just one of them, which would require decompressing and
+                * re-compressing data (which could increase the new compressed
+                * size, not allowing the compressed data to fit anymore in an
+                * inline extent).
+                * So just don't support this case for now (it should be rare,
+                * we are not really saving space when cloning inline extents).
+                */
+               return -EOPNOTSUPP;
+       }
+
+       btrfs_release_path(path);
+       ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
+       if (ret)
+               return ret;
+       ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
+       if (ret)
+               return ret;
+
+       if (skip) {
+               const u32 start = btrfs_file_extent_calc_inline_size(0);
+
+               memmove(inline_data + start, inline_data + start + skip, datal);
+       }
+
+       write_extent_buffer(path->nodes[0], inline_data,
+                           btrfs_item_ptr_offset(path->nodes[0],
+                                                 path->slots[0]),
+                           size);
+       inode_add_bytes(dst, datal);
+
+       return 0;
+}
+
 /**
  * btrfs_clone() - clone a range from inode file to another
  *
@@ -3173,13 +3520,13 @@ static void clone_update_extent_map(struct inode *inode,
  * @inode: Inode to clone to
  * @off: Offset within source to start clone from
  * @olen: Original length, passed by user, of range to clone
- * @olen_aligned: Block-aligned value of olen, extent_same uses
- *               identical values here
+ * @olen_aligned: Block-aligned value of olen
  * @destoff: Offset within @inode to start clone
+ * @no_time_update: If non-zero, do not update mtime/ctime on the target inode
  */
 static int btrfs_clone(struct inode *src, struct inode *inode,
                       const u64 off, const u64 olen, const u64 olen_aligned,
-                      const u64 destoff)
+                      const u64 destoff, int no_time_update)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_path *path = NULL;
@@ -3190,9 +3537,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
        u32 nritems;
        int slot;
        int ret;
-       int no_quota;
        const u64 len = olen_aligned;
-       u64 last_disko = 0;
        u64 last_dest_end = destoff;
 
        ret = -ENOMEM;
@@ -3238,7 +3583,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 
                nritems = btrfs_header_nritems(path->nodes[0]);
 process_slot:
-               no_quota = 1;
                if (path->slots[0] >= nritems) {
                        ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
                        if (ret < 0)
@@ -3390,35 +3734,13 @@ process_slot:
                                btrfs_set_file_extent_num_bytes(leaf, extent,
                                                                datal);
 
-                               /*
-                                * We need to look up the roots that point at
-                                * this bytenr and see if the new root does.  If
-                                * it does not we need to make sure we update
-                                * quotas appropriately.
-                                */
-                               if (disko && root != BTRFS_I(src)->root &&
-                                   disko != last_disko) {
-                                       no_quota = check_ref(trans, root,
-                                                            disko);
-                                       if (no_quota < 0) {
-                                               btrfs_abort_transaction(trans,
-                                                                       root,
-                                                                       ret);
-                                               btrfs_end_transaction(trans,
-                                                                     root);
-                                               ret = no_quota;
-                                               goto out;
-                                       }
-                               }
-
                                if (disko) {
                                        inode_add_bytes(inode, datal);
                                        ret = btrfs_inc_extent_ref(trans, root,
                                                        disko, diskl, 0,
                                                        root->root_key.objectid,
                                                        btrfs_ino(inode),
-                                                       new_key.offset - datao,
-                                                       no_quota);
+                                                       new_key.offset - datao);
                                        if (ret) {
                                                btrfs_abort_transaction(trans,
                                                                        root,
@@ -3432,21 +3754,6 @@ process_slot:
                        } else if (type == BTRFS_FILE_EXTENT_INLINE) {
                                u64 skip = 0;
                                u64 trim = 0;
-                               u64 aligned_end = 0;
-
-                               /*
-                                * Don't copy an inline extent into an offset
-                                * greater than zero. Having an inline extent
-                                * at such an offset results in chaos as btrfs
-                                * isn't prepared for such cases. Just skip
-                                * this case for the same reasons as commented
-                                * at btrfs_ioctl_clone().
-                                */
-                               if (last_dest_end > 0) {
-                                       ret = -EOPNOTSUPP;
-                                       btrfs_end_transaction(trans, root);
-                                       goto out;
-                               }
 
                                if (off > key.offset) {
                                        skip = off - key.offset;
@@ -3464,42 +3771,22 @@ process_slot:
                                size -= skip + trim;
                                datal -= skip + trim;
 
-                               aligned_end = ALIGN(new_key.offset + datal,
-                                                   root->sectorsize);
-                               ret = btrfs_drop_extents(trans, root, inode,
-                                                        drop_start,
-                                                        aligned_end,
-                                                        1);
+                               ret = clone_copy_inline_extent(src, inode,
+                                                              trans, path,
+                                                              &new_key,
+                                                              drop_start,
+                                                              datal,
+                                                              skip, size, buf);
                                if (ret) {
                                        if (ret != -EOPNOTSUPP)
                                                btrfs_abort_transaction(trans,
-                                                       root, ret);
-                                       btrfs_end_transaction(trans, root);
-                                       goto out;
-                               }
-
-                               ret = btrfs_insert_empty_item(trans, root, path,
-                                                             &new_key, size);
-                               if (ret) {
-                                       btrfs_abort_transaction(trans, root,
-                                                               ret);
+                                                                       root,
+                                                                       ret);
                                        btrfs_end_transaction(trans, root);
                                        goto out;
                                }
-
-                               if (skip) {
-                                       u32 start =
-                                         btrfs_file_extent_calc_inline_size(0);
-                                       memmove(buf+start, buf+start+skip,
-                                               datal);
-                               }
-
                                leaf = path->nodes[0];
                                slot = path->slots[0];
-                               write_extent_buffer(leaf, buf,
-                                           btrfs_item_ptr_offset(leaf, slot),
-                                           size);
-                               inode_add_bytes(inode, datal);
                        }
 
                        /* If we have an implicit hole (NO_HOLES feature). */
@@ -3517,7 +3804,8 @@ process_slot:
                                              root->sectorsize);
                        ret = clone_finish_inode_update(trans, inode,
                                                        last_dest_end,
-                                                       destoff, olen);
+                                                       destoff, olen,
+                                                       no_time_update);
                        if (ret)
                                goto out;
                        if (new_key.offset + datal >= destoff + len)
@@ -3555,7 +3843,7 @@ process_slot:
                clone_update_extent_map(inode, trans, NULL, last_dest_end,
                                        destoff + len - last_dest_end);
                ret = clone_finish_inode_update(trans, inode, destoff + len,
-                                               destoff, olen);
+                                               destoff, olen, no_time_update);
        }
 
 out:
@@ -3632,13 +3920,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                goto out_fput;
 
        if (!same_inode) {
-               if (inode < src) {
-                       mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
-                       mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
-               } else {
-                       mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
-                       mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
-               }
+               btrfs_double_inode_lock(src, inode);
        } else {
                mutex_lock(&src->i_mutex);
        }
@@ -3686,13 +3968,18 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                u64 lock_start = min_t(u64, off, destoff);
                u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
 
-               lock_extent_range(src, lock_start, lock_len);
+               ret = lock_extent_range(src, lock_start, lock_len, true);
        } else {
-               lock_extent_range(src, off, len);
-               lock_extent_range(inode, destoff, len);
+               ret = btrfs_double_extent_lock(src, off, inode, destoff, len,
+                                              true);
+       }
+       ASSERT(ret == 0);
+       if (WARN_ON(ret)) {
+               /* ranges in the io trees already unlocked */
+               goto out_unlock;
        }
 
-       ret = btrfs_clone(src, inode, off, olen, len, destoff);
+       ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
 
        if (same_inode) {
                u64 lock_start = min_t(u64, off, destoff);
@@ -3700,9 +3987,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 
                unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
        } else {
-               unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
-               unlock_extent(&BTRFS_I(inode)->io_tree, destoff,
-                             destoff + len - 1);
+               btrfs_double_extent_unlock(src, off, inode, destoff, len);
        }
        /*
         * Truncate page cache pages so that future reads will see the cloned
@@ -3711,17 +3996,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
        truncate_inode_pages_range(&inode->i_data, destoff,
                                   PAGE_CACHE_ALIGN(destoff + len) - 1);
 out_unlock:
-       if (!same_inode) {
-               if (inode < src) {
-                       mutex_unlock(&src->i_mutex);
-                       mutex_unlock(&inode->i_mutex);
-               } else {
-                       mutex_unlock(&inode->i_mutex);
-                       mutex_unlock(&src->i_mutex);
-               }
-       } else {
+       if (!same_inode)
+               btrfs_double_inode_unlock(src, inode);
+       else
                mutex_unlock(&src->i_mutex);
-       }
 out_fput:
        fdput(src_file);
 out_drop_write:
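
btrfs_ioctl_clone() above now funnels its inode and extent-range locking through the shared helpers introduced earlier, retrying range locks (retry_range_locking == true) so the ASSERT on the return value should never fire. From userspace this is the familiar BTRFS_IOC_CLONE_RANGE reflink; a sketch with placeholder paths follows. Offsets and length must be block aligned unless the source range ends at EOF.

/* Sketch: reflink the first 1 MiB of one file into another at offset 0
 * via BTRFS_IOC_CLONE_RANGE (the ioctl is issued on the destination). */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(void)
{
        struct btrfs_ioctl_clone_range_args args;
        int src = open("/mnt/btrfs/src", O_RDONLY);     /* placeholder */
        int dst = open("/mnt/btrfs/dst", O_WRONLY | O_CREAT, 0644);

        if (src < 0 || dst < 0) {
                perror("open");
                return 1;
        }
        memset(&args, 0, sizeof(args));
        args.src_fd = src;
        args.src_offset = 0;
        args.src_length = 1024 * 1024;
        args.dest_offset = 0;
        if (ioctl(dst, BTRFS_IOC_CLONE_RANGE, &args) < 0)
                perror("BTRFS_IOC_CLONE_RANGE");
        close(src);
        close(dst);
        return 0;
}
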
@@ -4492,6 +4770,11 @@ locked:
                bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
        }
 
+       if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
+               ret = -EINVAL;
+               goto out_bctl;
+       }
+
 do_balance:
        /*
         * Ownership of bctl and mutually_exclusive_operation_running
@@ -4503,12 +4786,15 @@ do_balance:
        need_unlock = false;
 
        ret = btrfs_balance(bctl, bargs);
+       bctl = NULL;
 
        if (arg) {
                if (copy_to_user(arg, bargs, sizeof(*bargs)))
                        ret = -EFAULT;
        }
 
+out_bctl:
+       kfree(bctl);
 out_bargs:
        kfree(bargs);
 out_unlock:
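
With the validation added above, balance args carrying flag bits outside BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK now fail with EINVAL rather than being silently ignored. A sketch of a call the check accepts; BTRFS_BALANCE_DATA is defined locally because older uapi headers may not export it, "/mnt/btrfs" is a placeholder, and the ioctl requires CAP_SYS_ADMIN.

/* Sketch: start a data-only balance via BTRFS_IOC_BALANCE_V2. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

#ifndef BTRFS_BALANCE_DATA
#define BTRFS_BALANCE_DATA (1ULL << 0)
#endif

int main(void)
{
        struct btrfs_ioctl_balance_args args;
        int fd = open("/mnt/btrfs", O_RDONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        memset(&args, 0, sizeof(args));
        args.flags = BTRFS_BALANCE_DATA;        /* balance data chunks only */
        if (ioctl(fd, BTRFS_IOC_BALANCE_V2, &args) < 0)
                perror("BTRFS_IOC_BALANCE_V2");
        close(fd);
        return 0;
}
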
@@ -4659,7 +4945,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
        /* update qgroup status and info */
        err = btrfs_run_qgroups(trans, root->fs_info);
        if (err < 0)
-               btrfs_error(root->fs_info, ret,
+               btrfs_std_error(root->fs_info, ret,
                            "failed to update qgroup status and info\n");
        err = btrfs_end_transaction(trans, root);
        if (err && !ret)