Upgrade to 4.4.50-rt62
kvmfornfv.git: kernel/fs/btrfs/tree-log.c
index d049683..ee7832e 100644
--- a/kernel/fs/btrfs/tree-log.c
+++ b/kernel/fs/btrfs/tree-log.c
@@ -140,55 +140,46 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct btrfs_log_ctx *ctx)
 {
-       int index;
-       int ret;
+       int ret = 0;
 
        mutex_lock(&root->log_mutex);
+
        if (root->log_root) {
                if (btrfs_need_log_full_commit(root->fs_info, trans)) {
                        ret = -EAGAIN;
                        goto out;
                }
+
                if (!root->log_start_pid) {
-                       root->log_start_pid = current->pid;
                        clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+                       root->log_start_pid = current->pid;
                } else if (root->log_start_pid != current->pid) {
                        set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
                }
+       } else {
+               mutex_lock(&root->fs_info->tree_log_mutex);
+               if (!root->fs_info->log_root_tree)
+                       ret = btrfs_init_log_root_tree(trans, root->fs_info);
+               mutex_unlock(&root->fs_info->tree_log_mutex);
+               if (ret)
+                       goto out;
 
-               atomic_inc(&root->log_batch);
-               atomic_inc(&root->log_writers);
-               if (ctx) {
-                       index = root->log_transid % 2;
-                       list_add_tail(&ctx->list, &root->log_ctxs[index]);
-                       ctx->log_transid = root->log_transid;
-               }
-               mutex_unlock(&root->log_mutex);
-               return 0;
-       }
-
-       ret = 0;
-       mutex_lock(&root->fs_info->tree_log_mutex);
-       if (!root->fs_info->log_root_tree)
-               ret = btrfs_init_log_root_tree(trans, root->fs_info);
-       mutex_unlock(&root->fs_info->tree_log_mutex);
-       if (ret)
-               goto out;
-
-       if (!root->log_root) {
                ret = btrfs_add_log_tree(trans, root);
                if (ret)
                        goto out;
+
+               clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+               root->log_start_pid = current->pid;
        }
-       clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
-       root->log_start_pid = current->pid;
+
        atomic_inc(&root->log_batch);
        atomic_inc(&root->log_writers);
        if (ctx) {
-               index = root->log_transid % 2;
+               int index = root->log_transid % 2;
                list_add_tail(&ctx->list, &root->log_ctxs[index]);
                ctx->log_transid = root->log_transid;
        }
+
 out:
        mutex_unlock(&root->log_mutex);
        return ret;
@@ -238,7 +229,9 @@ int btrfs_pin_log_trans(struct btrfs_root *root)
 void btrfs_end_log_trans(struct btrfs_root *root)
 {
        if (atomic_dec_and_test(&root->log_writers)) {
-               smp_mb();
+               /*
+                * Implicit memory barrier after atomic_dec_and_test
+                */
                if (waitqueue_active(&root->log_writer_wait))
                        wake_up(&root->log_writer_wait);
        }
@@ -700,7 +693,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                                ret = btrfs_inc_extent_ref(trans, root,
                                                ins.objectid, ins.offset,
                                                0, root->root_key.objectid,
-                                               key->objectid, offset, 0);
+                                               key->objectid, offset);
                                if (ret)
                                        goto out;
                        } else {
@@ -731,11 +724,65 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                                                &ordered_sums, 0);
                        if (ret)
                                goto out;
+                       /*
+                        * Now delete all existing csums in the csum root that
+                        * cover our range. We do this because we can have an
+                        * extent that is completely referenced by one file
+                        * extent item and partially referenced by another
+                        * file extent item (like after using the clone or
+                        * extent_same ioctls). In this case if we end up doing
+                        * the replay of the one that partially references the
+                        * extent first, and we do not do the csum deletion
+                        * below, we can get 2 csum items in the csum tree that
+                        * overlap each other. For example, imagine our log has
+                        * the two following file extent items:
+                        *
+                        * key (257 EXTENT_DATA 409600)
+                        *     extent data disk byte 12845056 nr 102400
+                        *     extent data offset 20480 nr 20480 ram 102400
+                        *
+                        * key (257 EXTENT_DATA 819200)
+                        *     extent data disk byte 12845056 nr 102400
+                        *     extent data offset 0 nr 102400 ram 102400
+                        *
+                        * Where the second one fully references the 100K extent
+                        * that starts at disk byte 12845056, and the log tree
+                        * has a single csum item that covers the entire range
+                        * of the extent:
+                        *
+                        * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+                        *
+                        * After the first file extent item is replayed, the
+                        * csum tree gets the following csum item:
+                        *
+                        * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+                        *
+                        * Which covers the 20K sub-range starting at offset 20K
+                        * of our extent. Now when we replay the second file
+                        * extent item, if we do not delete existing csum items
+                        * that cover any of its blocks, we end up getting two
+                        * csum items in our csum tree that overlap each other:
+                        *
+                        * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+                        * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+                        *
+                        * Which is a problem, because after this anyone trying
+                        * to look up the checksum of any block of our extent
+                        * starting at an offset of 40K or higher will end up
+                        * looking at the second csum item only, which does not
+                        * contain the checksum for any block starting at
+                        * offset 40K or higher of our extent.
+                        */
                        while (!list_empty(&ordered_sums)) {
                                struct btrfs_ordered_sum *sums;
                                sums = list_entry(ordered_sums.next,
                                                struct btrfs_ordered_sum,
                                                list);
+                               if (!ret)
+                                       ret = btrfs_del_csums(trans,
+                                                     root->fs_info->csum_root,
+                                                     sums->bytenr,
+                                                     sums->len);
                                if (!ret)
                                        ret = btrfs_csum_file_blocks(trans,
                                                root->fs_info->csum_root,
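
A standalone sketch (plain userspace C, not btrfs code; error handling omitted) that checks the numbers
used in the csum comment above: once both file extent items have been replayed without first deleting the
old csum item, a "largest key <= bytenr" lookup for any block at extent offset >= 40K resolves to the
short 20K item and therefore finds no checksum for that block.

#include <stdio.h>

struct csum_item {
        unsigned long long bytenr;      /* key offset (disk byte) */
        unsigned long long len;         /* bytes of data covered  */
};

/* resolve the item a btree search would land on: largest key <= bytenr */
static const struct csum_item *lookup(const struct csum_item *items, int nr,
                                      unsigned long long bytenr)
{
        const struct csum_item *best = NULL;
        int i;

        for (i = 0; i < nr; i++)
                if (items[i].bytenr <= bytenr &&
                    (!best || items[i].bytenr > best->bytenr))
                        best = &items[i];
        return best;
}

int main(void)
{
        /* csum tree state from the comment: two overlapping items */
        const struct csum_item items[] = {
                { 12845056, 102400 },   /* itemsize 100, full 100K extent    */
                { 12865536,  20480 },   /* itemsize 20, 20K sub-range at 20K */
        };
        unsigned long long block = 12845056 + 40960;    /* extent offset 40K */
        const struct csum_item *it = lookup(items, 2, block);

        if (it)
                printf("block %llu -> item [%llu, %llu)\n",
                       block, it->bytenr, it->bytenr + it->len);
        /* prints: block 12886016 -> item [12865536, 12886016), i.e. the
         * block falls just past the end of the item the lookup selects */
        return 0;
}
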
@@ -1549,9 +1596,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
  */
 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root,
-                                   struct btrfs_path *path,
                                    u64 dirid, u64 index,
-                                   char *name, int name_len, u8 type,
+                                   char *name, int name_len,
                                    struct btrfs_key *location)
 {
        struct inode *inode;
@@ -1613,6 +1659,9 @@ static bool name_in_log_ref(struct btrfs_root *log_root,
  * not exist in the FS, it is skipped.  fsyncs on directories
  * do not force down inodes inside that directory, just changes to the
  * names or unlinks in a directory.
+ *
+ * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
+ * non-existing inode) and 1 if the name was replayed.
  */
 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root,
@@ -1631,6 +1680,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
        int exists;
        int ret = 0;
        bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
+       bool name_added = false;
 
        dir = read_one_inode(root, key->objectid);
        if (!dir)
@@ -1708,6 +1758,8 @@ out:
        }
        kfree(name);
        iput(dir);
+       if (!ret && name_added)
+               ret = 1;
        return ret;
 
 insert:
@@ -1719,10 +1771,12 @@ insert:
                goto out;
        }
        btrfs_release_path(path);
-       ret = insert_one_name(trans, root, path, key->objectid, key->offset,
-                             name, name_len, log_type, &log_key);
+       ret = insert_one_name(trans, root, key->objectid, key->offset,
+                             name, name_len, &log_key);
        if (ret && ret != -ENOENT && ret != -EEXIST)
                goto out;
+       if (!ret)
+               name_added = true;
        update_size = false;
        ret = 0;
        goto out;
@@ -1740,12 +1794,13 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
                                        struct extent_buffer *eb, int slot,
                                        struct btrfs_key *key)
 {
-       int ret;
+       int ret = 0;
        u32 item_size = btrfs_item_size_nr(eb, slot);
        struct btrfs_dir_item *di;
        int name_len;
        unsigned long ptr;
        unsigned long ptr_end;
+       struct btrfs_path *fixup_path = NULL;
 
        ptr = btrfs_item_ptr_offset(eb, slot);
        ptr_end = ptr + item_size;
@@ -1755,12 +1810,59 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
                        return -EIO;
                name_len = btrfs_dir_name_len(eb, di);
                ret = replay_one_name(trans, root, path, eb, di, key);
-               if (ret)
-                       return ret;
+               if (ret < 0)
+                       break;
                ptr = (unsigned long)(di + 1);
                ptr += name_len;
+
+               /*
+                * If this entry refers to a non-directory (directories can not
+                * have a link count > 1) and it was added in the transaction
+                * that was not committed, make sure we fix up the link count
+                * of the inode the entry points to. Otherwise something like
+                * the following would result in a directory pointing to an
+                * inode with a wrong link count that does not account for
+                * this dir entry:
+                *
+                * mkdir testdir
+                * touch testdir/foo
+                * touch testdir/bar
+                * sync
+                *
+                * ln testdir/bar testdir/bar_link
+                * ln testdir/foo testdir/foo_link
+                * xfs_io -c "fsync" testdir/bar
+                *
+                * <power failure>
+                *
+                * mount fs, log replay happens
+                *
+                * File foo would remain with a link count of 1 when it has two
+                * entries pointing to it in the directory testdir. This would
+                * make it impossible to ever delete the parent directory, as
+                * it would result in stale dentries that can never be deleted.
+                */
+               if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
+                       struct btrfs_key di_key;
+
+                       if (!fixup_path) {
+                               fixup_path = btrfs_alloc_path();
+                               if (!fixup_path) {
+                                       ret = -ENOMEM;
+                                       break;
+                               }
+                       }
+
+                       btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+                       ret = link_to_fixup_dir(trans, root, fixup_path,
+                                               di_key.objectid);
+                       if (ret)
+                               break;
+               }
+               ret = 0;
        }
-       return 0;
+       btrfs_free_path(fixup_path);
+       return ret;
 }
 
 /*
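
The reproducer in the comment above, rendered as a userspace sketch (paths are the ones from the comment
and are assumed to live on the btrfs filesystem under test; error handling omitted). If power is lost
right after the fsync(), log replay without the fixup would leave foo with a link count of 1 despite the
two directory entries pointing to it.

#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        int fd;

        mkdir("testdir", 0755);
        close(open("testdir/foo", O_CREAT | O_WRONLY, 0644));   /* touch testdir/foo */
        close(open("testdir/bar", O_CREAT | O_WRONLY, 0644));   /* touch testdir/bar */
        sync();

        link("testdir/bar", "testdir/bar_link");        /* ln testdir/bar testdir/bar_link */
        link("testdir/foo", "testdir/foo_link");        /* ln testdir/foo testdir/foo_link */

        fd = open("testdir/bar", O_RDONLY);             /* xfs_io -c "fsync" testdir/bar   */
        fsync(fd);
        close(fd);
        /* <power failure here>, then mount and let log replay run */
        return 0;
}
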
@@ -1821,12 +1923,11 @@ static noinline int find_dir_range(struct btrfs_root *root,
 next:
        /* check the next slot in the tree to see if it is a valid item */
        nritems = btrfs_header_nritems(path->nodes[0]);
+       path->slots[0]++;
        if (path->slots[0] >= nritems) {
                ret = btrfs_next_leaf(root, path);
                if (ret)
                        goto out;
-       } else {
-               path->slots[0]++;
        }
 
        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -2535,8 +2636,7 @@ static int update_log_root(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-static void wait_log_commit(struct btrfs_trans_handle *trans,
-                           struct btrfs_root *root, int transid)
+static void wait_log_commit(struct btrfs_root *root, int transid)
 {
        DEFINE_WAIT(wait);
        int index = transid % 2;
@@ -2561,8 +2661,7 @@ static void wait_log_commit(struct btrfs_trans_handle *trans,
                 atomic_read(&root->log_commit[index]));
 }
 
-static void wait_for_writer(struct btrfs_trans_handle *trans,
-                           struct btrfs_root *root)
+static void wait_for_writer(struct btrfs_root *root)
 {
        DEFINE_WAIT(wait);
 
@@ -2596,14 +2695,12 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
                                             int index, int error)
 {
        struct btrfs_log_ctx *ctx;
+       struct btrfs_log_ctx *safe;
 
-       if (!error) {
-               INIT_LIST_HEAD(&root->log_ctxs[index]);
-               return;
-       }
-
-       list_for_each_entry(ctx, &root->log_ctxs[index], list)
+       list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
+               list_del_init(&ctx->list);
                ctx->log_ret = error;
+       }
 
        INIT_LIST_HEAD(&root->log_ctxs[index]);
 }
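
The switch to list_for_each_entry_safe() above is the usual idiom for removing entries while walking a
list; a minimal userspace analogue (not the kernel list API, allocation errors ignored) of why the next
element must be cached before the current one is unlinked:

#include <stdio.h>
#include <stdlib.h>

struct node {
        int val;
        struct node *next;
};

int main(void)
{
        struct node *head = NULL, *n, *next;
        int i;

        for (i = 0; i < 3; i++) {               /* build a small list: 2 -> 1 -> 0 */
                n = malloc(sizeof(*n));
                n->val = i;
                n->next = head;
                head = n;
        }

        /* "safe" walk: remember ->next before detaching and freeing the
         * current entry, since afterwards it may no longer be dereferenced */
        for (n = head; n; n = next) {
                next = n->next;
                printf("removing %d\n", n->val);
                n->next = NULL;                 /* analogous to list_del_init() */
                free(n);
        }
        return 0;
}
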
@@ -2642,7 +2739,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
        index1 = log_transid % 2;
        if (atomic_read(&root->log_commit[index1])) {
-               wait_log_commit(trans, root, log_transid);
+               wait_log_commit(root, log_transid);
                mutex_unlock(&root->log_mutex);
                return ctx->log_ret;
        }
@@ -2651,7 +2748,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
        /* wait for previous tree log sync to complete */
        if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
-               wait_log_commit(trans, root, log_transid - 1);
+               wait_log_commit(root, log_transid - 1);
 
        while (1) {
                int batch = atomic_read(&root->log_batch);
@@ -2662,7 +2759,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                        schedule_timeout_uninterruptible(1);
                        mutex_lock(&root->log_mutex);
                }
-               wait_for_writer(trans, root);
+               wait_for_writer(root);
                if (batch == atomic_read(&root->log_batch))
                        break;
        }
@@ -2722,7 +2819,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
        mutex_lock(&log_root_tree->log_mutex);
        if (atomic_dec_and_test(&log_root_tree->log_writers)) {
-               smp_mb();
+               /*
+                * Implicit memory barrier after atomic_dec_and_test
+                */
                if (waitqueue_active(&log_root_tree->log_writer_wait))
                        wake_up(&log_root_tree->log_writer_wait);
        }
@@ -2748,6 +2847,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
        if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
                blk_finish_plug(&plug);
+               list_del_init(&root_log_ctx.list);
                mutex_unlock(&log_root_tree->log_mutex);
                ret = root_log_ctx.log_ret;
                goto out;
@@ -2759,7 +2859,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
                                                mark);
                btrfs_wait_logged_extents(trans, log, log_transid);
-               wait_log_commit(trans, log_root_tree,
+               wait_log_commit(log_root_tree,
                                root_log_ctx.log_transid);
                mutex_unlock(&log_root_tree->log_mutex);
                if (!ret)
@@ -2770,11 +2870,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        atomic_set(&log_root_tree->log_commit[index2], 1);
 
        if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
-               wait_log_commit(trans, log_root_tree,
+               wait_log_commit(log_root_tree,
                                root_log_ctx.log_transid - 1);
        }
 
-       wait_for_writer(trans, log_root_tree);
+       wait_for_writer(log_root_tree);
 
        /*
         * now that we've moved on to the tree of log tree roots,
@@ -2841,28 +2941,28 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        mutex_unlock(&root->log_mutex);
 
 out_wake_log_root:
-       /*
-        * We needn't get log_mutex here because we are sure all
-        * the other tasks are blocked.
-        */
+       mutex_lock(&log_root_tree->log_mutex);
        btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
 
-       mutex_lock(&log_root_tree->log_mutex);
        log_root_tree->log_transid_committed++;
        atomic_set(&log_root_tree->log_commit[index2], 0);
        mutex_unlock(&log_root_tree->log_mutex);
 
+       /*
+        * The barrier before waitqueue_active is implied by mutex_unlock
+        */
        if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
                wake_up(&log_root_tree->log_commit_wait[index2]);
 out:
-       /* See above. */
-       btrfs_remove_all_log_ctxs(root, index1, ret);
-
        mutex_lock(&root->log_mutex);
+       btrfs_remove_all_log_ctxs(root, index1, ret);
        root->log_transid_committed++;
        atomic_set(&root->log_commit[index1], 0);
        mutex_unlock(&root->log_mutex);
 
+       /*
+        * The barrier before waitqueue_active is implied by mutex_unlock
+        */
        if (waitqueue_active(&root->log_commit_wait[index1]))
                wake_up(&root->log_commit_wait[index1]);
        return ret;
@@ -3881,12 +3981,6 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
                                     &ordered->flags))
                        continue;
 
-               if (ordered->csum_bytes_left) {
-                       btrfs_start_ordered_extent(inode, ordered, 0);
-                       wait_event(ordered->wait,
-                                  ordered->csum_bytes_left == 0);
-               }
-
                list_for_each_entry(sum, &ordered->list, list) {
                        ret = btrfs_csum_file_blocks(trans, log, sum);
                        if (ret)
@@ -4123,6 +4217,308 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
        return 0;
 }
 
+/*
+ * At the moment we always log all xattrs. This is to figure out at log replay
+ * time which xattrs must have their deletion replayed. If an xattr is missing
+ * in the log tree and exists in the fs/subvol tree, we delete it. This is
+ * because if an xattr is deleted, the inode is fsynced and then a power
+ * failure happens, the log gets replayed the next time the fs is mounted and
+ * we want the xattr to no longer exist after that replay (the same behaviour
+ * as other filesystems with a journal: ext3/4, xfs, f2fs, etc).
+ */
+static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct inode *inode,
+                               struct btrfs_path *path,
+                               struct btrfs_path *dst_path)
+{
+       int ret;
+       struct btrfs_key key;
+       const u64 ino = btrfs_ino(inode);
+       int ins_nr = 0;
+       int start_slot = 0;
+
+       key.objectid = ino;
+       key.type = BTRFS_XATTR_ITEM_KEY;
+       key.offset = 0;
+
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0)
+               return ret;
+
+       while (true) {
+               int slot = path->slots[0];
+               struct extent_buffer *leaf = path->nodes[0];
+               int nritems = btrfs_header_nritems(leaf);
+
+               if (slot >= nritems) {
+                       if (ins_nr > 0) {
+                               u64 last_extent = 0;
+
+                               ret = copy_items(trans, inode, dst_path, path,
+                                                &last_extent, start_slot,
+                                                ins_nr, 1, 0);
+                               /* can't be 1, extent items aren't processed */
+                               ASSERT(ret <= 0);
+                               if (ret < 0)
+                                       return ret;
+                               ins_nr = 0;
+                       }
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0)
+                               return ret;
+                       else if (ret > 0)
+                               break;
+                       continue;
+               }
+
+               btrfs_item_key_to_cpu(leaf, &key, slot);
+               if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
+                       break;
+
+               if (ins_nr == 0)
+                       start_slot = slot;
+               ins_nr++;
+               path->slots[0]++;
+               cond_resched();
+       }
+       if (ins_nr > 0) {
+               u64 last_extent = 0;
+
+               ret = copy_items(trans, inode, dst_path, path,
+                                &last_extent, start_slot,
+                                ins_nr, 1, 0);
+               /* can't be 1, extent items aren't processed */
+               ASSERT(ret <= 0);
+               if (ret < 0)
+                       return ret;
+       }
+
+       return 0;
+}
+
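
The guarantee described in the comment above is what a user observes with a sequence like this sketch
(hypothetical file name, assumed to be on the btrfs filesystem; error handling omitted): if power is lost
after the second fsync() returns, log replay must leave the inode without the xattr.

#include <fcntl.h>
#include <sys/xattr.h>
#include <unistd.h>

int main(void)
{
        int fd = open("file", O_CREAT | O_RDWR, 0644);  /* hypothetical path */

        /* set an xattr and persist it through the log */
        fsetxattr(fd, "user.demo", "value", 5, 0);
        fsync(fd);

        /* delete the xattr and fsync again: after a crash plus log replay
         * the xattr must not reappear, which is why all xattrs are logged */
        fremovexattr(fd, "user.demo");
        fsync(fd);

        close(fd);
        return 0;
}
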
+/*
+ * If the no holes feature is enabled we need to make sure any hole between the
+ * last extent and the i_size of our inode is explicitly marked in the log. This
+ * is to make sure that doing something like:
+ *
+ *      1) create file with 128Kb of data
+ *      2) truncate file to 64Kb
+ *      3) truncate file to 256Kb
+ *      4) fsync file
+ *      5) <crash/power failure>
+ *      6) mount fs and trigger log replay
+ *
+ * Will give us a file with a size of 256Kb, whose first 64Kb of data match
+ * what the file had at step 1 and whose last 192Kb correspond to a hole. The
+ * presence of explicit holes in a log tree is what guarantees that log replay
+ * will remove/adjust file extent items in the fs/subvol tree.
+ *
+ * Here we do not need to care about holes between extents, as that is already
+ * done by copy_items(). We also only need to do this in the full sync path,
+ * where we look up extents in the fs/subvol tree only. In the fast path case,
+ * we walk the list of modified extent maps and if any represents a hole, we
+ * insert a corresponding extent representing a hole in the log tree.
+ */
+static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
+                                  struct btrfs_root *root,
+                                  struct inode *inode,
+                                  struct btrfs_path *path)
+{
+       int ret;
+       struct btrfs_key key;
+       u64 hole_start;
+       u64 hole_size;
+       struct extent_buffer *leaf;
+       struct btrfs_root *log = root->log_root;
+       const u64 ino = btrfs_ino(inode);
+       const u64 i_size = i_size_read(inode);
+
+       if (!btrfs_fs_incompat(root->fs_info, NO_HOLES))
+               return 0;
+
+       key.objectid = ino;
+       key.type = BTRFS_EXTENT_DATA_KEY;
+       key.offset = (u64)-1;
+
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       ASSERT(ret != 0);
+       if (ret < 0)
+               return ret;
+
+       ASSERT(path->slots[0] > 0);
+       path->slots[0]--;
+       leaf = path->nodes[0];
+       btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+       if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+               /* inode does not have any extents */
+               hole_start = 0;
+               hole_size = i_size;
+       } else {
+               struct btrfs_file_extent_item *extent;
+               u64 len;
+
+               /*
+                * If there's an extent beyond i_size, an explicit hole was
+                * already inserted by copy_items().
+                */
+               if (key.offset >= i_size)
+                       return 0;
+
+               extent = btrfs_item_ptr(leaf, path->slots[0],
+                                       struct btrfs_file_extent_item);
+
+               if (btrfs_file_extent_type(leaf, extent) ==
+                   BTRFS_FILE_EXTENT_INLINE) {
+                       len = btrfs_file_extent_inline_len(leaf,
+                                                          path->slots[0],
+                                                          extent);
+                       ASSERT(len == i_size);
+                       return 0;
+               }
+
+               len = btrfs_file_extent_num_bytes(leaf, extent);
+               /* Last extent goes beyond i_size, no need to log a hole. */
+               if (key.offset + len > i_size)
+                       return 0;
+               hole_start = key.offset + len;
+               hole_size = i_size - hole_start;
+       }
+       btrfs_release_path(path);
+
+       /* Last extent ends at i_size. */
+       if (hole_size == 0)
+               return 0;
+
+       hole_size = ALIGN(hole_size, root->sectorsize);
+       ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
+                                      hole_size, 0, hole_size, 0, 0, 0);
+       return ret;
+}
+
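
Steps 1) through 4) of the comment above as a userspace sketch (hypothetical path, sizes taken from the
comment; error handling omitted):

#include <fcntl.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        int fd = open("file", O_CREAT | O_RDWR, 0644);  /* hypothetical path */
        off_t off;

        memset(buf, 0xaa, sizeof(buf));
        for (off = 0; off < 128 * 1024; off += sizeof(buf))
                pwrite(fd, buf, sizeof(buf), off);      /* 1) 128Kb of data     */

        ftruncate(fd, 64 * 1024);                       /* 2) truncate to 64Kb  */
        ftruncate(fd, 256 * 1024);                      /* 3) truncate to 256Kb */
        fsync(fd);                                      /* 4) fsync             */
        /* 5) <crash/power failure>, 6) mount and replay the log: the file
         * must end up 256Kb in size with a hole covering the last 192Kb */
        close(fd);
        return 0;
}
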
+/*
+ * When we are logging a new inode X, check if it doesn't have a reference that
+ * matches the reference from some other inode Y created in a past transaction
+ * and that was renamed in the current transaction. If we don't do this, then at
+ * log replay time we can lose inode Y (and all its files if it's a directory):
+ *
+ * mkdir /mnt/x
+ * echo "hello world" > /mnt/x/foobar
+ * sync
+ * mv /mnt/x /mnt/y
+ * mkdir /mnt/x                 # or touch /mnt/x
+ * xfs_io -c fsync /mnt/x
+ * <power fail>
+ * mount fs, trigger log replay
+ *
+ * After the log replay procedure, we would lose the first directory and all its
+ * files (file foobar).
+ * For the case where inode Y is not a directory we simply end up losing it:
+ *
+ * echo "123" > /mnt/foo
+ * sync
+ * mv /mnt/foo /mnt/bar
+ * echo "abc" > /mnt/foo
+ * xfs_io -c fsync /mnt/foo
+ * <power fail>
+ *
+ * We also need this for cases where a snapshot entry is replaced by some other
+ * entry (file or directory) otherwise we end up with an unreplayable log due to
+ * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
+ * if it were a regular entry:
+ *
+ * mkdir /mnt/x
+ * btrfs subvolume snapshot /mnt /mnt/x/snap
+ * btrfs subvolume delete /mnt/x/snap
+ * rmdir /mnt/x
+ * mkdir /mnt/x
+ * fsync /mnt/x or fsync some new file inside it
+ * <power fail>
+ *
+ * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
+ * the same transaction.
+ */
+static int btrfs_check_ref_name_override(struct extent_buffer *eb,
+                                        const int slot,
+                                        const struct btrfs_key *key,
+                                        struct inode *inode)
+{
+       int ret;
+       struct btrfs_path *search_path;
+       char *name = NULL;
+       u32 name_len = 0;
+       u32 item_size = btrfs_item_size_nr(eb, slot);
+       u32 cur_offset = 0;
+       unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
+
+       search_path = btrfs_alloc_path();
+       if (!search_path)
+               return -ENOMEM;
+       search_path->search_commit_root = 1;
+       search_path->skip_locking = 1;
+
+       while (cur_offset < item_size) {
+               u64 parent;
+               u32 this_name_len;
+               u32 this_len;
+               unsigned long name_ptr;
+               struct btrfs_dir_item *di;
+
+               if (key->type == BTRFS_INODE_REF_KEY) {
+                       struct btrfs_inode_ref *iref;
+
+                       iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
+                       parent = key->offset;
+                       this_name_len = btrfs_inode_ref_name_len(eb, iref);
+                       name_ptr = (unsigned long)(iref + 1);
+                       this_len = sizeof(*iref) + this_name_len;
+               } else {
+                       struct btrfs_inode_extref *extref;
+
+                       extref = (struct btrfs_inode_extref *)(ptr +
+                                                              cur_offset);
+                       parent = btrfs_inode_extref_parent(eb, extref);
+                       this_name_len = btrfs_inode_extref_name_len(eb, extref);
+                       name_ptr = (unsigned long)&extref->name;
+                       this_len = sizeof(*extref) + this_name_len;
+               }
+
+               if (this_name_len > name_len) {
+                       char *new_name;
+
+                       new_name = krealloc(name, this_name_len, GFP_NOFS);
+                       if (!new_name) {
+                               ret = -ENOMEM;
+                               goto out;
+                       }
+                       name_len = this_name_len;
+                       name = new_name;
+               }
+
+               read_extent_buffer(eb, name, name_ptr, this_name_len);
+               di = btrfs_lookup_dir_item(NULL, BTRFS_I(inode)->root,
+                                          search_path, parent,
+                                          name, this_name_len, 0);
+               if (di && !IS_ERR(di)) {
+                       ret = 1;
+                       goto out;
+               } else if (IS_ERR(di)) {
+                       ret = PTR_ERR(di);
+                       goto out;
+               }
+               btrfs_release_path(search_path);
+
+               cur_offset += this_len;
+       }
+       ret = 0;
+out:
+       btrfs_free_path(search_path);
+       kfree(name);
+       return ret;
+}
+
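
The first scenario from the comment above as a userspace sketch (paths /mnt/x and /mnt/y are the ones from
the comment and assume /mnt is the btrfs mount under test; error handling omitted):

#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        FILE *f;
        int fd;

        mkdir("/mnt/x", 0755);                          /* mkdir /mnt/x                */
        f = fopen("/mnt/x/foobar", "w");                /* echo "hello world" > foobar */
        fprintf(f, "hello world\n");
        fclose(f);
        sync();                                         /* sync                        */

        rename("/mnt/x", "/mnt/y");                     /* mv /mnt/x /mnt/y            */
        mkdir("/mnt/x", 0755);                          /* mkdir /mnt/x                */

        fd = open("/mnt/x", O_RDONLY | O_DIRECTORY);    /* xfs_io -c fsync /mnt/x      */
        fsync(fd);
        close(fd);
        /* <power fail>, mount, log replay: without the reference check, the
         * renamed directory (now /mnt/y) and its file foobar would be lost */
        return 0;
}
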
 /* log a single inode in the tree log.
  * At least one parent directory for this inode must exist in the tree
  * or be logged already.
@@ -4161,6 +4557,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
        u64 ino = btrfs_ino(inode);
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        u64 logged_isize = 0;
+       bool need_log_inode_item = true;
 
        path = btrfs_alloc_path();
        if (!path)
@@ -4269,11 +4666,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                } else {
                        if (inode_only == LOG_INODE_ALL)
                                fast_search = true;
-                       ret = log_inode_item(trans, log, dst_path, inode);
-                       if (ret) {
-                               err = ret;
-                               goto out_unlock;
-                       }
                        goto log_extents;
                }
 
@@ -4296,6 +4688,44 @@ again:
                if (min_key.type > max_key.type)
                        break;
 
+               if (min_key.type == BTRFS_INODE_ITEM_KEY)
+                       need_log_inode_item = false;
+
+               if ((min_key.type == BTRFS_INODE_REF_KEY ||
+                    min_key.type == BTRFS_INODE_EXTREF_KEY) &&
+                   BTRFS_I(inode)->generation == trans->transid) {
+                       ret = btrfs_check_ref_name_override(path->nodes[0],
+                                                           path->slots[0],
+                                                           &min_key, inode);
+                       if (ret < 0) {
+                               err = ret;
+                               goto out_unlock;
+                       } else if (ret > 0) {
+                               err = 1;
+                               btrfs_set_log_full_commit(root->fs_info, trans);
+                               goto out_unlock;
+                       }
+               }
+
+               /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
+               if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
+                       if (ins_nr == 0)
+                               goto next_slot;
+                       ret = copy_items(trans, inode, dst_path, path,
+                                        &last_extent, ins_start_slot,
+                                        ins_nr, inode_only, logged_isize);
+                       if (ret < 0) {
+                               err = ret;
+                               goto out_unlock;
+                       }
+                       ins_nr = 0;
+                       if (ret) {
+                               btrfs_release_path(path);
+                               continue;
+                       }
+                       goto next_slot;
+               }
+
                src = path->nodes[0];
                if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
                        ins_nr++;
@@ -4363,9 +4793,26 @@ next_slot:
                ins_nr = 0;
        }
 
+       btrfs_release_path(path);
+       btrfs_release_path(dst_path);
+       err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
+       if (err)
+               goto out_unlock;
+       if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
+               btrfs_release_path(path);
+               btrfs_release_path(dst_path);
+               err = btrfs_log_trailing_hole(trans, root, inode, path);
+               if (err)
+                       goto out_unlock;
+       }
 log_extents:
        btrfs_release_path(path);
        btrfs_release_path(dst_path);
+       if (need_log_inode_item) {
+               err = log_inode_item(trans, log, dst_path, inode);
+               if (err)
+                       goto out_unlock;
+       }
        if (fast_search) {
                /*
                 * Some ordered extents started by fsync might have completed
@@ -4694,6 +5141,94 @@ next_dir_inode:
        return ret;
 }
 
+static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
+                                struct inode *inode,
+                                struct btrfs_log_ctx *ctx)
+{
+       int ret;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       const u64 ino = btrfs_ino(inode);
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+       path->skip_locking = 1;
+       path->search_commit_root = 1;
+
+       key.objectid = ino;
+       key.type = BTRFS_INODE_REF_KEY;
+       key.offset = 0;
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
+
+       while (true) {
+               struct extent_buffer *leaf = path->nodes[0];
+               int slot = path->slots[0];
+               u32 cur_offset = 0;
+               u32 item_size;
+               unsigned long ptr;
+
+               if (slot >= btrfs_header_nritems(leaf)) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0)
+                               goto out;
+                       else if (ret > 0)
+                               break;
+                       continue;
+               }
+
+               btrfs_item_key_to_cpu(leaf, &key, slot);
+               /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
+               if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
+                       break;
+
+               item_size = btrfs_item_size_nr(leaf, slot);
+               ptr = btrfs_item_ptr_offset(leaf, slot);
+               while (cur_offset < item_size) {
+                       struct btrfs_key inode_key;
+                       struct inode *dir_inode;
+
+                       inode_key.type = BTRFS_INODE_ITEM_KEY;
+                       inode_key.offset = 0;
+
+                       if (key.type == BTRFS_INODE_EXTREF_KEY) {
+                               struct btrfs_inode_extref *extref;
+
+                               extref = (struct btrfs_inode_extref *)
+                                       (ptr + cur_offset);
+                               inode_key.objectid = btrfs_inode_extref_parent(
+                                       leaf, extref);
+                               cur_offset += sizeof(*extref);
+                               cur_offset += btrfs_inode_extref_name_len(leaf,
+                                       extref);
+                       } else {
+                               inode_key.objectid = key.offset;
+                               cur_offset = item_size;
+                       }
+
+                       dir_inode = btrfs_iget(root->fs_info->sb, &inode_key,
+                                              root, NULL);
+                       /* If parent inode was deleted, skip it. */
+                       if (IS_ERR(dir_inode))
+                               continue;
+
+                       ret = btrfs_log_inode(trans, root, dir_inode,
+                                             LOG_INODE_ALL, 0, LLONG_MAX, ctx);
+                       iput(dir_inode);
+                       if (ret)
+                               goto out;
+               }
+               path->slots[0]++;
+       }
+       ret = 0;
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
 /*
  * helper function around btrfs_log_inode to make sure newly created
  * parent directories also end up in the log.  A minimal inode and backref
@@ -4713,9 +5248,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
        struct dentry *old_parent = NULL;
        int ret = 0;
        u64 last_committed = root->fs_info->last_trans_committed;
-       const struct dentry * const first_parent = parent;
-       const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
-                                last_committed);
        bool log_dentries = false;
        struct inode *orig_inode = inode;
 
@@ -4776,6 +5308,53 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
        if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
                log_dentries = true;
 
+       /*
+        * On unlink we must make sure the inodes of all our current and old
+        * parent directories are fully logged. This is to prevent leaving
+        * dangling directory index entries in directories that were our
+        * parents but are not anymore. Not doing this results in the old
+        * parent directory being impossible to delete after log replay
+        * (rmdir will always fail with error -ENOTEMPTY).
+        *
+        * Example 1:
+        *
+        * mkdir testdir
+        * touch testdir/foo
+        * ln testdir/foo testdir/bar
+        * sync
+        * unlink testdir/bar
+        * xfs_io -c fsync testdir/foo
+        * <power failure>
+        * mount fs, triggers log replay
+        *
+        * If we don't log the parent directory (testdir), after log replay the
+        * directory still has an entry pointing to the file inode using the bar
+        * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
+        * the file inode has a link count of 1.
+        *
+        * Example 2:
+        *
+        * mkdir testdir
+        * touch foo
+        * ln foo testdir/foo2
+        * ln foo testdir/foo3
+        * sync
+        * unlink testdir/foo3
+        * xfs_io -c fsync foo
+        * <power failure>
+        * mount fs, triggers log replay
+        *
+        * Similar to the first example, after log replay the parent directory
+        * testdir still has an entry pointing to the file inode with the name
+        * foo3, but the file inode does not have a matching BTRFS_INODE_REF_KEY
+        * item and has a link count of 2.
+        */
+       if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
+               ret = btrfs_log_all_parents(trans, orig_inode, ctx);
+               if (ret)
+                       goto end_trans;
+       }
+
        while (1) {
                if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
                        break;
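
Example 1 from the comment above as a userspace sketch (paths as in the comment, assumed to be on the
btrfs filesystem under test; error handling omitted):

#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        int fd;

        mkdir("testdir", 0755);
        close(open("testdir/foo", O_CREAT | O_WRONLY, 0644));   /* touch testdir/foo           */
        link("testdir/foo", "testdir/bar");                     /* ln testdir/foo testdir/bar  */
        sync();

        unlink("testdir/bar");                                  /* unlink testdir/bar          */

        fd = open("testdir/foo", O_RDONLY);                     /* xfs_io -c fsync testdir/foo */
        fsync(fd);
        close(fd);
        /* <power failure>, mount, log replay: unless the parent directory is
         * fully logged, testdir keeps a dangling "bar" entry and rmdir on it
         * will always fail with -ENOTEMPTY */
        return 0;
}
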
@@ -4784,23 +5363,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                if (root != BTRFS_I(inode)->root)
                        break;
 
-               /*
-                * On unlink we must make sure our immediate parent directory
-                * inode is fully logged. This is to prevent leaving dangling
-                * directory index entries and a wrong directory inode's i_size.
-                * Not doing so can result in a directory being impossible to
-                * delete after log replay (rmdir will always fail with error
-                * -ENOTEMPTY).
-                */
-               if (did_unlink && parent == first_parent)
-                       inode_only = LOG_INODE_ALL;
-               else
-                       inode_only = LOG_INODE_EXISTS;
-
-               if (BTRFS_I(inode)->generation >
-                   root->fs_info->last_trans_committed ||
-                   inode_only == LOG_INODE_ALL) {
-                       ret = btrfs_log_inode(trans, root, inode, inode_only,
+               if (BTRFS_I(inode)->generation > last_committed) {
+                       ret = btrfs_log_inode(trans, root, inode,
+                                             LOG_INODE_EXISTS,
                                              0, LLONG_MAX, ctx);
                        if (ret)
                                goto end_trans;
@@ -4888,7 +5453,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
 
        ret = walk_log_tree(trans, log_root_tree, &wc);
        if (ret) {
-               btrfs_error(fs_info, ret, "Failed to pin buffers while "
+               btrfs_std_error(fs_info, ret, "Failed to pin buffers while "
                            "recovering log root tree.");
                goto error;
        }
@@ -4902,7 +5467,7 @@ again:
                ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
 
                if (ret < 0) {
-                       btrfs_error(fs_info, ret,
+                       btrfs_std_error(fs_info, ret,
                                    "Couldn't find tree log root.");
                        goto error;
                }
@@ -4920,7 +5485,7 @@ again:
                log = btrfs_read_fs_root(log_root_tree, &found_key);
                if (IS_ERR(log)) {
                        ret = PTR_ERR(log);
-                       btrfs_error(fs_info, ret,
+                       btrfs_std_error(fs_info, ret,
                                    "Couldn't read tree log root.");
                        goto error;
                }
@@ -4935,7 +5500,7 @@ again:
                        free_extent_buffer(log->node);
                        free_extent_buffer(log->commit_root);
                        kfree(log);
-                       btrfs_error(fs_info, ret, "Couldn't read target root "
+                       btrfs_std_error(fs_info, ret, "Couldn't read target root "
                                    "for tree log recovery.");
                        goto error;
                }