These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
diff --git a/kernel/fs/btrfs/tree-log.c b/kernel/fs/btrfs/tree-log.c
index 4920fce..323e12c 100644
@@ -140,55 +140,46 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct btrfs_log_ctx *ctx)
 {
-       int index;
-       int ret;
+       int ret = 0;
 
        mutex_lock(&root->log_mutex);
+
        if (root->log_root) {
                if (btrfs_need_log_full_commit(root->fs_info, trans)) {
                        ret = -EAGAIN;
                        goto out;
                }
+
                if (!root->log_start_pid) {
-                       root->log_start_pid = current->pid;
                        clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+                       root->log_start_pid = current->pid;
                } else if (root->log_start_pid != current->pid) {
                        set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
                }
+       } else {
+               mutex_lock(&root->fs_info->tree_log_mutex);
+               if (!root->fs_info->log_root_tree)
+                       ret = btrfs_init_log_root_tree(trans, root->fs_info);
+               mutex_unlock(&root->fs_info->tree_log_mutex);
+               if (ret)
+                       goto out;
 
-               atomic_inc(&root->log_batch);
-               atomic_inc(&root->log_writers);
-               if (ctx) {
-                       index = root->log_transid % 2;
-                       list_add_tail(&ctx->list, &root->log_ctxs[index]);
-                       ctx->log_transid = root->log_transid;
-               }
-               mutex_unlock(&root->log_mutex);
-               return 0;
-       }
-
-       ret = 0;
-       mutex_lock(&root->fs_info->tree_log_mutex);
-       if (!root->fs_info->log_root_tree)
-               ret = btrfs_init_log_root_tree(trans, root->fs_info);
-       mutex_unlock(&root->fs_info->tree_log_mutex);
-       if (ret)
-               goto out;
-
-       if (!root->log_root) {
                ret = btrfs_add_log_tree(trans, root);
                if (ret)
                        goto out;
+
+               clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+               root->log_start_pid = current->pid;
        }
-       clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
-       root->log_start_pid = current->pid;
+
        atomic_inc(&root->log_batch);
        atomic_inc(&root->log_writers);
        if (ctx) {
-               index = root->log_transid % 2;
+               int index = root->log_transid % 2;
                list_add_tail(&ctx->list, &root->log_ctxs[index]);
                ctx->log_transid = root->log_transid;
        }
+
 out:
        mutex_unlock(&root->log_mutex);
        return ret;
@@ -238,7 +229,9 @@ int btrfs_pin_log_trans(struct btrfs_root *root)
 void btrfs_end_log_trans(struct btrfs_root *root)
 {
        if (atomic_dec_and_test(&root->log_writers)) {
-               smp_mb();
+               /*
+                * Implicit memory barrier after atomic_dec_and_test
+                */
                if (waitqueue_active(&root->log_writer_wait))
                        wake_up(&root->log_writer_wait);
        }
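
The smp_mb() removed in this hunk is not lost: atomic_dec_and_test() already
implies a full memory barrier, and that ordering is what keeps the unlocked
waitqueue_active() check safe against lost wakeups. The later hunks that add
the same comment for log_root_tree->log_writers, and the mutex_unlock()
comments near the end of btrfs_sync_log(), lean on the same reasoning. Below is
a rough userspace sketch of the waker/waiter ordering, using C11 atomics in
place of the kernel primitives; every name in it is made up for illustration
and none of it is btrfs code.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int  log_writers    = 2;     /* stands in for root->log_writers */
static atomic_bool waiter_present = false; /* stands in for a non-empty waitqueue */

/* Waker: the seq_cst read-modify-write is a full barrier, like
 * atomic_dec_and_test(), so the load of waiter_present below cannot be
 * reordered before the decrement. */
static void end_log_trans(void)
{
        if (atomic_fetch_sub(&log_writers, 1) == 1) {
                if (atomic_load(&waiter_present)) {
                        /* wake_up(&root->log_writer_wait) would happen here */
                }
        }
}

/* Waiter: announce the wait before re-checking the condition, mirroring
 * prepare_to_wait() followed by the atomic_read() of log_writers. */
static void wait_for_writer_once(void)
{
        atomic_store(&waiter_present, true);
        if (atomic_load(&log_writers) != 0) {
                /* schedule() until the waker runs */
        }
}

int main(void)
{
        wait_for_writer_once();
        end_log_trans();
        end_log_trans();
        return 0;
}
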
@@ -700,7 +693,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                                ret = btrfs_inc_extent_ref(trans, root,
                                                ins.objectid, ins.offset,
                                                0, root->root_key.objectid,
-                                               key->objectid, offset, 0);
+                                               key->objectid, offset);
                                if (ret)
                                        goto out;
                        } else {
@@ -731,11 +724,65 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                                                &ordered_sums, 0);
                        if (ret)
                                goto out;
+                       /*
+                        * Now delete all existing csums in the csum root that
+                        * cover our range. We do this because we can have an
+                        * extent that is completely referenced by one file
+                        * extent item and partially referenced by another
+                        * file extent item (like after using the clone or
+                        * extent_same ioctls). In this case if we end up doing
+                        * the replay of the one that partially references the
+                        * extent first, and we do not do the csum deletion
+                        * below, we can get 2 csum items in the csum tree that
+                        * overlap each other. For example, imagine our log has
+                        * the two following file extent items:
+                        *
+                        * key (257 EXTENT_DATA 409600)
+                        *     extent data disk byte 12845056 nr 102400
+                        *     extent data offset 20480 nr 20480 ram 102400
+                        *
+                        * key (257 EXTENT_DATA 819200)
+                        *     extent data disk byte 12845056 nr 102400
+                        *     extent data offset 0 nr 102400 ram 102400
+                        *
+                        * Where the second one fully references the 100K extent
+                        * that starts at disk byte 12845056, and the log tree
+                        * has a single csum item that covers the entire range
+                        * of the extent:
+                        *
+                        * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+                        *
+                        * After the first file extent item is replayed, the
+                        * csum tree gets the following csum item:
+                        *
+                        * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+                        *
+                        * Which covers the 20K sub-range starting at offset 20K
+                        * of our extent. Now when we replay the second file
+                        * extent item, if we do not delete existing csum items
+                        * that cover any of its blocks, we end up getting two
+                        * csum items in our csum tree that overlap each other:
+                        *
+                        * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+                        * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+                        *
+                        * Which is a problem, because after this anyone trying
+                        * to look up the checksum of any block of our
+                        * extent starting at an offset of 40K or higher will
+                        * end up looking at the second csum item only, which
+                        * does not contain the checksum for any block starting
+                        * at offset 40K or higher of our extent.
+                        */
                        while (!list_empty(&ordered_sums)) {
                                struct btrfs_ordered_sum *sums;
                                sums = list_entry(ordered_sums.next,
                                                struct btrfs_ordered_sum,
                                                list);
+                               if (!ret)
+                                       ret = btrfs_del_csums(trans,
+                                                     root->fs_info->csum_root,
+                                                     sums->bytenr,
+                                                     sums->len);
                                if (!ret)
                                        ret = btrfs_csum_file_blocks(trans,
                                                root->fs_info->csum_root,
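
To make the overlap described in the comment above concrete: the log tree
carries one csum item for the whole 100K extent at disk byte 12845056, while
replaying only the partially-referencing file extent item first would leave a
20K csum item at byte 12865536 (12845056 + 20480). Those two on-disk ranges
intersect, which is exactly the situation the new btrfs_del_csums() call
avoids by removing existing csums for the range before inserting the logged
ones. A small stand-alone check of that arithmetic (illustrative only, not
btrfs code):

#include <assert.h>

struct byte_range { unsigned long long start, len; };

static int overlaps(struct byte_range a, struct byte_range b)
{
        return a.start < b.start + b.len && b.start < a.start + a.len;
}

int main(void)
{
        /* csum item in the log covering the full extent */
        struct byte_range full    = { 12845056ULL, 102400ULL };
        /* csum item left behind by replaying the partial reference first */
        struct byte_range partial = { 12865536ULL, 20480ULL };

        assert(partial.start == full.start + 20480ULL);
        /* overlapping items are why the old csums must be deleted first */
        assert(overlaps(full, partial));
        return 0;
}
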
@@ -1549,9 +1596,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
  */
 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root,
-                                   struct btrfs_path *path,
                                    u64 dirid, u64 index,
-                                   char *name, int name_len, u8 type,
+                                   char *name, int name_len,
                                    struct btrfs_key *location)
 {
        struct inode *inode;
@@ -1613,6 +1659,9 @@ static bool name_in_log_ref(struct btrfs_root *log_root,
  * not exist in the FS, it is skipped.  fsyncs on directories
  * do not force down inodes inside that directory, just changes to the
  * names or unlinks in a directory.
+ *
+ * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
+ * non-existing inode) and 1 if the name was replayed.
  */
 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root,
@@ -1631,6 +1680,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
        int exists;
        int ret = 0;
        bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
+       bool name_added = false;
 
        dir = read_one_inode(root, key->objectid);
        if (!dir)
@@ -1708,6 +1758,8 @@ out:
        }
        kfree(name);
        iput(dir);
+       if (!ret && name_added)
+               ret = 1;
        return ret;
 
 insert:
@@ -1719,10 +1771,12 @@ insert:
                goto out;
        }
        btrfs_release_path(path);
-       ret = insert_one_name(trans, root, path, key->objectid, key->offset,
-                             name, name_len, log_type, &log_key);
+       ret = insert_one_name(trans, root, key->objectid, key->offset,
+                             name, name_len, &log_key);
        if (ret && ret != -ENOENT && ret != -EEXIST)
                goto out;
+       if (!ret)
+               name_added = true;
        update_size = false;
        ret = 0;
        goto out;
@@ -1740,12 +1794,13 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
                                        struct extent_buffer *eb, int slot,
                                        struct btrfs_key *key)
 {
-       int ret;
+       int ret = 0;
        u32 item_size = btrfs_item_size_nr(eb, slot);
        struct btrfs_dir_item *di;
        int name_len;
        unsigned long ptr;
        unsigned long ptr_end;
+       struct btrfs_path *fixup_path = NULL;
 
        ptr = btrfs_item_ptr_offset(eb, slot);
        ptr_end = ptr + item_size;
@@ -1755,12 +1810,59 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
                        return -EIO;
                name_len = btrfs_dir_name_len(eb, di);
                ret = replay_one_name(trans, root, path, eb, di, key);
-               if (ret)
-                       return ret;
+               if (ret < 0)
+                       break;
                ptr = (unsigned long)(di + 1);
                ptr += name_len;
+
+               /*
+                * If this entry refers to a non-directory (directories can not
+                * have a link count > 1) and it was added in the transaction
+                * that was not committed, make sure we fixup the link count of
+                * the inode the entry points to. Otherwise something like
+                * the following would result in a directory pointing to an
+                * inode with a wrong link count that does not account for this dir
+                * entry:
+                *
+                * mkdir testdir
+                * touch testdir/foo
+                * touch testdir/bar
+                * sync
+                *
+                * ln testdir/bar testdir/bar_link
+                * ln testdir/foo testdir/foo_link
+                * xfs_io -c "fsync" testdir/bar
+                *
+                * <power failure>
+                *
+                * mount fs, log replay happens
+                *
+                * File foo would remain with a link count of 1 when it has two
+                * entries pointing to it in the directory testdir. This would
+                * make it impossible to ever delete the parent directory has
+                * make it impossible to ever delete the parent directory, as
+                */
+               if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
+                       struct btrfs_key di_key;
+
+                       if (!fixup_path) {
+                               fixup_path = btrfs_alloc_path();
+                               if (!fixup_path) {
+                                       ret = -ENOMEM;
+                                       break;
+                               }
+                       }
+
+                       btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+                       ret = link_to_fixup_dir(trans, root, fixup_path,
+                                               di_key.objectid);
+                       if (ret)
+                               break;
+               }
+               ret = 0;
        }
-       return 0;
+       btrfs_free_path(fixup_path);
+       return ret;
 }
 
 /*
@@ -2535,8 +2637,7 @@ static int update_log_root(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-static void wait_log_commit(struct btrfs_trans_handle *trans,
-                           struct btrfs_root *root, int transid)
+static void wait_log_commit(struct btrfs_root *root, int transid)
 {
        DEFINE_WAIT(wait);
        int index = transid % 2;
@@ -2561,8 +2662,7 @@ static void wait_log_commit(struct btrfs_trans_handle *trans,
                 atomic_read(&root->log_commit[index]));
 }
 
-static void wait_for_writer(struct btrfs_trans_handle *trans,
-                           struct btrfs_root *root)
+static void wait_for_writer(struct btrfs_root *root)
 {
        DEFINE_WAIT(wait);
 
@@ -2642,7 +2742,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
        index1 = log_transid % 2;
        if (atomic_read(&root->log_commit[index1])) {
-               wait_log_commit(trans, root, log_transid);
+               wait_log_commit(root, log_transid);
                mutex_unlock(&root->log_mutex);
                return ctx->log_ret;
        }
@@ -2651,7 +2751,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
        /* wait for previous tree log sync to complete */
        if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
-               wait_log_commit(trans, root, log_transid - 1);
+               wait_log_commit(root, log_transid - 1);
 
        while (1) {
                int batch = atomic_read(&root->log_batch);
@@ -2662,7 +2762,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                        schedule_timeout_uninterruptible(1);
                        mutex_lock(&root->log_mutex);
                }
-               wait_for_writer(trans, root);
+               wait_for_writer(root);
                if (batch == atomic_read(&root->log_batch))
                        break;
        }
@@ -2722,7 +2822,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
        mutex_lock(&log_root_tree->log_mutex);
        if (atomic_dec_and_test(&log_root_tree->log_writers)) {
-               smp_mb();
+               /*
+                * Implicit memory barrier after atomic_dec_and_test
+                */
                if (waitqueue_active(&log_root_tree->log_writer_wait))
                        wake_up(&log_root_tree->log_writer_wait);
        }
@@ -2759,7 +2861,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
                                                mark);
                btrfs_wait_logged_extents(trans, log, log_transid);
-               wait_log_commit(trans, log_root_tree,
+               wait_log_commit(log_root_tree,
                                root_log_ctx.log_transid);
                mutex_unlock(&log_root_tree->log_mutex);
                if (!ret)
@@ -2770,11 +2872,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        atomic_set(&log_root_tree->log_commit[index2], 1);
 
        if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
-               wait_log_commit(trans, log_root_tree,
+               wait_log_commit(log_root_tree,
                                root_log_ctx.log_transid - 1);
        }
 
-       wait_for_writer(trans, log_root_tree);
+       wait_for_writer(log_root_tree);
 
        /*
         * now that we've moved on to the tree of log tree roots,
@@ -2852,6 +2954,9 @@ out_wake_log_root:
        atomic_set(&log_root_tree->log_commit[index2], 0);
        mutex_unlock(&log_root_tree->log_mutex);
 
+       /*
+        * The barrier before waitqueue_active is implied by mutex_unlock
+        */
        if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
                wake_up(&log_root_tree->log_commit_wait[index2]);
 out:
@@ -2863,6 +2968,9 @@ out:
        atomic_set(&root->log_commit[index1], 0);
        mutex_unlock(&root->log_mutex);
 
+       /*
+        * The barrier before waitqueue_active is implied by mutex_unlock
+        */
        if (waitqueue_active(&root->log_commit_wait[index1]))
                wake_up(&root->log_commit_wait[index1]);
        return ret;
@@ -3881,12 +3989,6 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
                                     &ordered->flags))
                        continue;
 
-               if (ordered->csum_bytes_left) {
-                       btrfs_start_ordered_extent(inode, ordered, 0);
-                       wait_event(ordered->wait,
-                                  ordered->csum_bytes_left == 0);
-               }
-
                list_for_each_entry(sum, &ordered->list, list) {
                        ret = btrfs_csum_file_blocks(trans, log, sum);
                        if (ret)
@@ -4123,6 +4225,187 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
        return 0;
 }
 
+/*
+ * At the moment we always log all xattrs. This is to figure out at log replay
+ * time which xattrs must have their deletion replayed. If a xattr is missing
+ * in the log tree and exists in the fs/subvol tree, we delete it. This is
+ * because if a xattr is deleted, the inode is fsynced and a power failure
+ * happens, causing the log to be replayed the next time the fs is mounted,
+ * we want the xattr to not exist anymore (same behaviour as other filesystems
+ * with a journal, ext3/4, xfs, f2fs, etc).
+ */
+static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct inode *inode,
+                               struct btrfs_path *path,
+                               struct btrfs_path *dst_path)
+{
+       int ret;
+       struct btrfs_key key;
+       const u64 ino = btrfs_ino(inode);
+       int ins_nr = 0;
+       int start_slot = 0;
+
+       key.objectid = ino;
+       key.type = BTRFS_XATTR_ITEM_KEY;
+       key.offset = 0;
+
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0)
+               return ret;
+
+       while (true) {
+               int slot = path->slots[0];
+               struct extent_buffer *leaf = path->nodes[0];
+               int nritems = btrfs_header_nritems(leaf);
+
+               if (slot >= nritems) {
+                       if (ins_nr > 0) {
+                               u64 last_extent = 0;
+
+                               ret = copy_items(trans, inode, dst_path, path,
+                                                &last_extent, start_slot,
+                                                ins_nr, 1, 0);
+                               /* can't be 1, extent items aren't processed */
+                               ASSERT(ret <= 0);
+                               if (ret < 0)
+                                       return ret;
+                               ins_nr = 0;
+                       }
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0)
+                               return ret;
+                       else if (ret > 0)
+                               break;
+                       continue;
+               }
+
+               btrfs_item_key_to_cpu(leaf, &key, slot);
+               if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
+                       break;
+
+               if (ins_nr == 0)
+                       start_slot = slot;
+               ins_nr++;
+               path->slots[0]++;
+               cond_resched();
+       }
+       if (ins_nr > 0) {
+               u64 last_extent = 0;
+
+               ret = copy_items(trans, inode, dst_path, path,
+                                &last_extent, start_slot,
+                                ins_nr, 1, 0);
+               /* can't be 1, extent items aren't processed */
+               ASSERT(ret <= 0);
+               if (ret < 0)
+                       return ret;
+       }
+
+       return 0;
+}
+
+/*
+ * If the no holes feature is enabled we need to make sure any hole between the
+ * last extent and the i_size of our inode is explicitly marked in the log. This
+ * is to make sure that doing something like:
+ *
+ *      1) create file with 128KB of data
+ *      2) truncate file to 64KB
+ *      3) truncate file to 256KB
+ *      4) fsync file
+ *      5) <crash/power failure>
+ *      6) mount fs and trigger log replay
+ *
+ * This will give us a file with a size of 256KB, where the first 64KB of data
+ * match what the file had in its first 64KB at step 1 and the last 192KB of the
+ * file correspond to a hole. The presence of explicit holes in a log tree is
+ * what guarantees that log replay will remove/adjust file extent items in the
+ * fs/subvol tree.
+ *
+ * Here we do not need to care about holes between extents; that is already
+ * done by copy_items(). We also only need to do this in the full sync path,
+ * where we look up extents from the fs/subvol tree only. In the fast path
+ * case, we look up the list of modified extent maps and if any represents a
+ * hole, we insert a corresponding extent representing a hole in the log tree.
+ */
+static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
+                                  struct btrfs_root *root,
+                                  struct inode *inode,
+                                  struct btrfs_path *path)
+{
+       int ret;
+       struct btrfs_key key;
+       u64 hole_start;
+       u64 hole_size;
+       struct extent_buffer *leaf;
+       struct btrfs_root *log = root->log_root;
+       const u64 ino = btrfs_ino(inode);
+       const u64 i_size = i_size_read(inode);
+
+       if (!btrfs_fs_incompat(root->fs_info, NO_HOLES))
+               return 0;
+
+       key.objectid = ino;
+       key.type = BTRFS_EXTENT_DATA_KEY;
+       key.offset = (u64)-1;
+
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       ASSERT(ret != 0);
+       if (ret < 0)
+               return ret;
+
+       ASSERT(path->slots[0] > 0);
+       path->slots[0]--;
+       leaf = path->nodes[0];
+       btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+       if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+               /* inode does not have any extents */
+               hole_start = 0;
+               hole_size = i_size;
+       } else {
+               struct btrfs_file_extent_item *extent;
+               u64 len;
+
+               /*
+                * If there's an extent beyond i_size, an explicit hole was
+                * already inserted by copy_items().
+                */
+               if (key.offset >= i_size)
+                       return 0;
+
+               extent = btrfs_item_ptr(leaf, path->slots[0],
+                                       struct btrfs_file_extent_item);
+
+               if (btrfs_file_extent_type(leaf, extent) ==
+                   BTRFS_FILE_EXTENT_INLINE) {
+                       len = btrfs_file_extent_inline_len(leaf,
+                                                          path->slots[0],
+                                                          extent);
+                       ASSERT(len == i_size);
+                       return 0;
+               }
+
+               len = btrfs_file_extent_num_bytes(leaf, extent);
+               /* Last extent goes beyond i_size, no need to log a hole. */
+               if (key.offset + len > i_size)
+                       return 0;
+               hole_start = key.offset + len;
+               hole_size = i_size - hole_start;
+       }
+       btrfs_release_path(path);
+
+       /* Last extent ends at i_size. */
+       if (hole_size == 0)
+               return 0;
+
+       hole_size = ALIGN(hole_size, root->sectorsize);
+       ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
+                                      hole_size, 0, hole_size, 0, 0, 0);
+       return ret;
+}
+
 /* log a single inode in the tree log.
  * At least one parent directory for this inode must exist in the tree
  * or be logged already.
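
For the truncate sequence in the comment introducing btrfs_log_trailing_hole()
above (128KB of data, truncate to 64KB, then to 256KB, then fsync), the last
extent ends at 64KB while i_size is 256KB, so the function must log an explicit
hole of 192KB starting at offset 64KB. A quick stand-alone sketch of that hole
computation, with illustrative values only rather than the btrfs helpers:

#include <assert.h>

int main(void)
{
        const unsigned long long K = 1024;

        unsigned long long i_size        = 256 * K; /* size after the second truncate */
        unsigned long long extent_offset = 0;       /* key.offset of the last extent */
        unsigned long long extent_len    = 64 * K;  /* data kept by the truncate to 64KB */

        unsigned long long hole_start = extent_offset + extent_len;
        unsigned long long hole_size  = i_size - hole_start;

        assert(hole_start == 64 * K);
        assert(hole_size == 192 * K); /* the explicit hole the log must contain */
        return 0;
}
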
@@ -4295,6 +4578,25 @@ again:
                if (min_key.type == BTRFS_INODE_ITEM_KEY)
                        need_log_inode_item = false;
 
+               /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
+               if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
+                       if (ins_nr == 0)
+                               goto next_slot;
+                       ret = copy_items(trans, inode, dst_path, path,
+                                        &last_extent, ins_start_slot,
+                                        ins_nr, inode_only, logged_isize);
+                       if (ret < 0) {
+                               err = ret;
+                               goto out_unlock;
+                       }
+                       ins_nr = 0;
+                       if (ret) {
+                               btrfs_release_path(path);
+                               continue;
+                       }
+                       goto next_slot;
+               }
+
                src = path->nodes[0];
                if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
                        ins_nr++;
@@ -4362,6 +4664,18 @@ next_slot:
                ins_nr = 0;
        }
 
+       btrfs_release_path(path);
+       btrfs_release_path(dst_path);
+       err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
+       if (err)
+               goto out_unlock;
+       if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
+               btrfs_release_path(path);
+               btrfs_release_path(dst_path);
+               err = btrfs_log_trailing_hole(trans, root, inode, path);
+               if (err)
+                       goto out_unlock;
+       }
 log_extents:
        btrfs_release_path(path);
        btrfs_release_path(dst_path);
@@ -4698,6 +5012,94 @@ next_dir_inode:
        return ret;
 }
 
+static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
+                                struct inode *inode,
+                                struct btrfs_log_ctx *ctx)
+{
+       int ret;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       const u64 ino = btrfs_ino(inode);
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+       path->skip_locking = 1;
+       path->search_commit_root = 1;
+
+       key.objectid = ino;
+       key.type = BTRFS_INODE_REF_KEY;
+       key.offset = 0;
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
+
+       while (true) {
+               struct extent_buffer *leaf = path->nodes[0];
+               int slot = path->slots[0];
+               u32 cur_offset = 0;
+               u32 item_size;
+               unsigned long ptr;
+
+               if (slot >= btrfs_header_nritems(leaf)) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0)
+                               goto out;
+                       else if (ret > 0)
+                               break;
+                       continue;
+               }
+
+               btrfs_item_key_to_cpu(leaf, &key, slot);
+               /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
+               if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
+                       break;
+
+               item_size = btrfs_item_size_nr(leaf, slot);
+               ptr = btrfs_item_ptr_offset(leaf, slot);
+               while (cur_offset < item_size) {
+                       struct btrfs_key inode_key;
+                       struct inode *dir_inode;
+
+                       inode_key.type = BTRFS_INODE_ITEM_KEY;
+                       inode_key.offset = 0;
+
+                       if (key.type == BTRFS_INODE_EXTREF_KEY) {
+                               struct btrfs_inode_extref *extref;
+
+                               extref = (struct btrfs_inode_extref *)
+                                       (ptr + cur_offset);
+                               inode_key.objectid = btrfs_inode_extref_parent(
+                                       leaf, extref);
+                               cur_offset += sizeof(*extref);
+                               cur_offset += btrfs_inode_extref_name_len(leaf,
+                                       extref);
+                       } else {
+                               inode_key.objectid = key.offset;
+                               cur_offset = item_size;
+                       }
+
+                       dir_inode = btrfs_iget(root->fs_info->sb, &inode_key,
+                                              root, NULL);
+                       /* If parent inode was deleted, skip it. */
+                       if (IS_ERR(dir_inode))
+                               continue;
+
+                       ret = btrfs_log_inode(trans, root, dir_inode,
+                                             LOG_INODE_ALL, 0, LLONG_MAX, ctx);
+                       iput(dir_inode);
+                       if (ret)
+                               goto out;
+               }
+               path->slots[0]++;
+       }
+       ret = 0;
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
 /*
  * helper function around btrfs_log_inode to make sure newly created
  * parent directories also end up in the log.  A minimal inode and backref
@@ -4717,9 +5119,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
        struct dentry *old_parent = NULL;
        int ret = 0;
        u64 last_committed = root->fs_info->last_trans_committed;
-       const struct dentry * const first_parent = parent;
-       const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
-                                last_committed);
        bool log_dentries = false;
        struct inode *orig_inode = inode;
 
@@ -4780,6 +5179,53 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
        if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
                log_dentries = true;
 
+       /*
+        * On unlink we must make sure all our current and old parent directory
+        * inodes are fully logged. This is to prevent leaving dangling
+        * directory index entries in directories that were our parents but are
+        * not anymore. Not doing this results in the old parent directory being
+        * impossible to delete after log replay (rmdir will always fail with
+        * error -ENOTEMPTY).
+        *
+        * Example 1:
+        *
+        * mkdir testdir
+        * touch testdir/foo
+        * ln testdir/foo testdir/bar
+        * sync
+        * unlink testdir/bar
+        * xfs_io -c fsync testdir/foo
+        * <power failure>
+        * mount fs, triggers log replay
+        *
+        * If we don't log the parent directory (testdir), after log replay the
+        * directory still has an entry pointing to the file inode using the bar
+        * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
+        * the file inode has a link count of 1.
+        *
+        * Example 2:
+        *
+        * mkdir testdir
+        * touch foo
+        * ln foo testdir/foo2
+        * ln foo testdir/foo3
+        * sync
+        * unlink testdir/foo3
+        * xfs_io -c fsync foo
+        * <power failure>
+        * mount fs, triggers log replay
+        *
+        * Similar to the first example, after log replay the parent directory
+        * testdir still has an entry pointing to the file inode using the name
+        * foo3, but the file inode does not have a matching BTRFS_INODE_REF_KEY
+        * item and has a link count of 2.
+        */
+       if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
+               ret = btrfs_log_all_parents(trans, orig_inode, ctx);
+               if (ret)
+                       goto end_trans;
+       }
+
        while (1) {
                if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
                        break;
@@ -4788,23 +5234,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                if (root != BTRFS_I(inode)->root)
                        break;
 
-               /*
-                * On unlink we must make sure our immediate parent directory
-                * inode is fully logged. This is to prevent leaving dangling
-                * directory index entries and a wrong directory inode's i_size.
-                * Not doing so can result in a directory being impossible to
-                * delete after log replay (rmdir will always fail with error
-                * -ENOTEMPTY).
-                */
-               if (did_unlink && parent == first_parent)
-                       inode_only = LOG_INODE_ALL;
-               else
-                       inode_only = LOG_INODE_EXISTS;
-
-               if (BTRFS_I(inode)->generation >
-                   root->fs_info->last_trans_committed ||
-                   inode_only == LOG_INODE_ALL) {
-                       ret = btrfs_log_inode(trans, root, inode, inode_only,
+               if (BTRFS_I(inode)->generation > last_committed) {
+                       ret = btrfs_log_inode(trans, root, inode,
+                                             LOG_INODE_EXISTS,
                                              0, LLONG_MAX, ctx);
                        if (ret)
                                goto end_trans;
@@ -4892,7 +5324,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
 
        ret = walk_log_tree(trans, log_root_tree, &wc);
        if (ret) {
-               btrfs_error(fs_info, ret, "Failed to pin buffers while "
+               btrfs_std_error(fs_info, ret, "Failed to pin buffers while "
                            "recovering log root tree.");
                goto error;
        }
@@ -4906,7 +5338,7 @@ again:
                ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
 
                if (ret < 0) {
-                       btrfs_error(fs_info, ret,
+                       btrfs_std_error(fs_info, ret,
                                    "Couldn't find tree log root.");
                        goto error;
                }
@@ -4924,7 +5356,7 @@ again:
                log = btrfs_read_fs_root(log_root_tree, &found_key);
                if (IS_ERR(log)) {
                        ret = PTR_ERR(log);
-                       btrfs_error(fs_info, ret,
+                       btrfs_std_error(fs_info, ret,
                                    "Couldn't read tree log root.");
                        goto error;
                }
@@ -4939,7 +5371,7 @@ again:
                        free_extent_buffer(log->node);
                        free_extent_buffer(log->commit_root);
                        kfree(log);
-                       btrfs_error(fs_info, ret, "Couldn't read target root "
+                       btrfs_std_error(fs_info, ret, "Couldn't read target root "
                                    "for tree log recovery.");
                        goto error;
                }