X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=kernel%2Ffs%2Fbtrfs%2Ftree-log.c;h=323e12cc9d2f522388fe929ed66d4dd946b5881d;hb=e09b41010ba33a20a87472ee821fa407a5b8da36;hp=d04968374e9d8bd5b118024d53ea0a5bfa6d0bd2;hpb=9ca8dbcc65cfc63d6f5ef3312a33184e1d726e00;p=kvmfornfv.git diff --git a/kernel/fs/btrfs/tree-log.c b/kernel/fs/btrfs/tree-log.c index d04968374..323e12cc9 100644 --- a/kernel/fs/btrfs/tree-log.c +++ b/kernel/fs/btrfs/tree-log.c @@ -140,55 +140,46 @@ static int start_log_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_log_ctx *ctx) { - int index; - int ret; + int ret = 0; mutex_lock(&root->log_mutex); + if (root->log_root) { if (btrfs_need_log_full_commit(root->fs_info, trans)) { ret = -EAGAIN; goto out; } + if (!root->log_start_pid) { - root->log_start_pid = current->pid; clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); + root->log_start_pid = current->pid; } else if (root->log_start_pid != current->pid) { set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } + } else { + mutex_lock(&root->fs_info->tree_log_mutex); + if (!root->fs_info->log_root_tree) + ret = btrfs_init_log_root_tree(trans, root->fs_info); + mutex_unlock(&root->fs_info->tree_log_mutex); + if (ret) + goto out; - atomic_inc(&root->log_batch); - atomic_inc(&root->log_writers); - if (ctx) { - index = root->log_transid % 2; - list_add_tail(&ctx->list, &root->log_ctxs[index]); - ctx->log_transid = root->log_transid; - } - mutex_unlock(&root->log_mutex); - return 0; - } - - ret = 0; - mutex_lock(&root->fs_info->tree_log_mutex); - if (!root->fs_info->log_root_tree) - ret = btrfs_init_log_root_tree(trans, root->fs_info); - mutex_unlock(&root->fs_info->tree_log_mutex); - if (ret) - goto out; - - if (!root->log_root) { ret = btrfs_add_log_tree(trans, root); if (ret) goto out; + + clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); + root->log_start_pid = current->pid; } - clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); - root->log_start_pid = current->pid; + atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); if (ctx) { - index = root->log_transid % 2; + int index = root->log_transid % 2; list_add_tail(&ctx->list, &root->log_ctxs[index]); ctx->log_transid = root->log_transid; } + out: mutex_unlock(&root->log_mutex); return ret; @@ -238,7 +229,9 @@ int btrfs_pin_log_trans(struct btrfs_root *root) void btrfs_end_log_trans(struct btrfs_root *root) { if (atomic_dec_and_test(&root->log_writers)) { - smp_mb(); + /* + * Implicit memory barrier after atomic_dec_and_test + */ if (waitqueue_active(&root->log_writer_wait)) wake_up(&root->log_writer_wait); } @@ -700,7 +693,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, 0, root->root_key.objectid, - key->objectid, offset, 0); + key->objectid, offset); if (ret) goto out; } else { @@ -731,11 +724,65 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, &ordered_sums, 0); if (ret) goto out; + /* + * Now delete all existing cums in the csum root that + * cover our range. We do this because we can have an + * extent that is completely referenced by one file + * extent item and partially referenced by another + * file extent item (like after using the clone or + * extent_same ioctls). In this case if we end up doing + * the replay of the one that partially references the + * extent first, and we do not do the csum deletion + * below, we can get 2 csum items in the csum tree that + * overlap each other. For example, imagine our log has + * the two following file extent items: + * + * key (257 EXTENT_DATA 409600) + * extent data disk byte 12845056 nr 102400 + * extent data offset 20480 nr 20480 ram 102400 + * + * key (257 EXTENT_DATA 819200) + * extent data disk byte 12845056 nr 102400 + * extent data offset 0 nr 102400 ram 102400 + * + * Where the second one fully references the 100K extent + * that starts at disk byte 12845056, and the log tree + * has a single csum item that covers the entire range + * of the extent: + * + * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 + * + * After the first file extent item is replayed, the + * csum tree gets the following csum item: + * + * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 + * + * Which covers the 20K sub-range starting at offset 20K + * of our extent. Now when we replay the second file + * extent item, if we do not delete existing csum items + * that cover any of its blocks, we end up getting two + * csum items in our csum tree that overlap each other: + * + * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 + * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 + * + * Which is a problem, because after this anyone trying + * to lookup up for the checksum of any block of our + * extent starting at an offset of 40K or higher, will + * end up looking at the second csum item only, which + * does not contain the checksum for any block starting + * at offset 40K or higher of our extent. + */ while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums; sums = list_entry(ordered_sums.next, struct btrfs_ordered_sum, list); + if (!ret) + ret = btrfs_del_csums(trans, + root->fs_info->csum_root, + sums->bytenr, + sums->len); if (!ret) ret = btrfs_csum_file_blocks(trans, root->fs_info->csum_root, @@ -1549,9 +1596,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, */ static noinline int insert_one_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, u64 dirid, u64 index, - char *name, int name_len, u8 type, + char *name, int name_len, struct btrfs_key *location) { struct inode *inode; @@ -1613,6 +1659,9 @@ static bool name_in_log_ref(struct btrfs_root *log_root, * not exist in the FS, it is skipped. fsyncs on directories * do not force down inodes inside that directory, just changes to the * names or unlinks in a directory. + * + * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a + * non-existing inode) and 1 if the name was replayed. */ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -1631,6 +1680,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, int exists; int ret = 0; bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); + bool name_added = false; dir = read_one_inode(root, key->objectid); if (!dir) @@ -1708,6 +1758,8 @@ out: } kfree(name); iput(dir); + if (!ret && name_added) + ret = 1; return ret; insert: @@ -1719,10 +1771,12 @@ insert: goto out; } btrfs_release_path(path); - ret = insert_one_name(trans, root, path, key->objectid, key->offset, - name, name_len, log_type, &log_key); + ret = insert_one_name(trans, root, key->objectid, key->offset, + name, name_len, &log_key); if (ret && ret != -ENOENT && ret != -EEXIST) goto out; + if (!ret) + name_added = true; update_size = false; ret = 0; goto out; @@ -1740,12 +1794,13 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - int ret; + int ret = 0; u32 item_size = btrfs_item_size_nr(eb, slot); struct btrfs_dir_item *di; int name_len; unsigned long ptr; unsigned long ptr_end; + struct btrfs_path *fixup_path = NULL; ptr = btrfs_item_ptr_offset(eb, slot); ptr_end = ptr + item_size; @@ -1755,12 +1810,59 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, return -EIO; name_len = btrfs_dir_name_len(eb, di); ret = replay_one_name(trans, root, path, eb, di, key); - if (ret) - return ret; + if (ret < 0) + break; ptr = (unsigned long)(di + 1); ptr += name_len; + + /* + * If this entry refers to a non-directory (directories can not + * have a link count > 1) and it was added in the transaction + * that was not committed, make sure we fixup the link count of + * the inode it the entry points to. Otherwise something like + * the following would result in a directory pointing to an + * inode with a wrong link that does not account for this dir + * entry: + * + * mkdir testdir + * touch testdir/foo + * touch testdir/bar + * sync + * + * ln testdir/bar testdir/bar_link + * ln testdir/foo testdir/foo_link + * xfs_io -c "fsync" testdir/bar + * + * + * + * mount fs, log replay happens + * + * File foo would remain with a link count of 1 when it has two + * entries pointing to it in the directory testdir. This would + * make it impossible to ever delete the parent directory has + * it would result in stale dentries that can never be deleted. + */ + if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { + struct btrfs_key di_key; + + if (!fixup_path) { + fixup_path = btrfs_alloc_path(); + if (!fixup_path) { + ret = -ENOMEM; + break; + } + } + + btrfs_dir_item_key_to_cpu(eb, di, &di_key); + ret = link_to_fixup_dir(trans, root, fixup_path, + di_key.objectid); + if (ret) + break; + } + ret = 0; } - return 0; + btrfs_free_path(fixup_path); + return ret; } /* @@ -2535,8 +2637,7 @@ static int update_log_root(struct btrfs_trans_handle *trans, return ret; } -static void wait_log_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int transid) +static void wait_log_commit(struct btrfs_root *root, int transid) { DEFINE_WAIT(wait); int index = transid % 2; @@ -2561,8 +2662,7 @@ static void wait_log_commit(struct btrfs_trans_handle *trans, atomic_read(&root->log_commit[index])); } -static void wait_for_writer(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static void wait_for_writer(struct btrfs_root *root) { DEFINE_WAIT(wait); @@ -2642,7 +2742,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index1 = log_transid % 2; if (atomic_read(&root->log_commit[index1])) { - wait_log_commit(trans, root, log_transid); + wait_log_commit(root, log_transid); mutex_unlock(&root->log_mutex); return ctx->log_ret; } @@ -2651,7 +2751,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) - wait_log_commit(trans, root, log_transid - 1); + wait_log_commit(root, log_transid - 1); while (1) { int batch = atomic_read(&root->log_batch); @@ -2662,7 +2762,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, schedule_timeout_uninterruptible(1); mutex_lock(&root->log_mutex); } - wait_for_writer(trans, root); + wait_for_writer(root); if (batch == atomic_read(&root->log_batch)) break; } @@ -2722,7 +2822,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_lock(&log_root_tree->log_mutex); if (atomic_dec_and_test(&log_root_tree->log_writers)) { - smp_mb(); + /* + * Implicit memory barrier after atomic_dec_and_test + */ if (waitqueue_active(&log_root_tree->log_writer_wait)) wake_up(&log_root_tree->log_writer_wait); } @@ -2759,7 +2861,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_wait_logged_extents(trans, log, log_transid); - wait_log_commit(trans, log_root_tree, + wait_log_commit(log_root_tree, root_log_ctx.log_transid); mutex_unlock(&log_root_tree->log_mutex); if (!ret) @@ -2770,11 +2872,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, atomic_set(&log_root_tree->log_commit[index2], 1); if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { - wait_log_commit(trans, log_root_tree, + wait_log_commit(log_root_tree, root_log_ctx.log_transid - 1); } - wait_for_writer(trans, log_root_tree); + wait_for_writer(log_root_tree); /* * now that we've moved on to the tree of log tree roots, @@ -2852,6 +2954,9 @@ out_wake_log_root: atomic_set(&log_root_tree->log_commit[index2], 0); mutex_unlock(&log_root_tree->log_mutex); + /* + * The barrier before waitqueue_active is implied by mutex_unlock + */ if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) wake_up(&log_root_tree->log_commit_wait[index2]); out: @@ -2863,6 +2968,9 @@ out: atomic_set(&root->log_commit[index1], 0); mutex_unlock(&root->log_mutex); + /* + * The barrier before waitqueue_active is implied by mutex_unlock + */ if (waitqueue_active(&root->log_commit_wait[index1])) wake_up(&root->log_commit_wait[index1]); return ret; @@ -3881,12 +3989,6 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, &ordered->flags)) continue; - if (ordered->csum_bytes_left) { - btrfs_start_ordered_extent(inode, ordered, 0); - wait_event(ordered->wait, - ordered->csum_bytes_left == 0); - } - list_for_each_entry(sum, &ordered->list, list) { ret = btrfs_csum_file_blocks(trans, log, sum); if (ret) @@ -4123,6 +4225,187 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode, return 0; } +/* + * At the moment we always log all xattrs. This is to figure out at log replay + * time which xattrs must have their deletion replayed. If a xattr is missing + * in the log tree and exists in the fs/subvol tree, we delete it. This is + * because if a xattr is deleted, the inode is fsynced and a power failure + * happens, causing the log to be replayed the next time the fs is mounted, + * we want the xattr to not exist anymore (same behaviour as other filesystems + * with a journal, ext3/4, xfs, f2fs, etc). + */ +static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path) +{ + int ret; + struct btrfs_key key; + const u64 ino = btrfs_ino(inode); + int ins_nr = 0; + int start_slot = 0; + + key.objectid = ino; + key.type = BTRFS_XATTR_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + while (true) { + int slot = path->slots[0]; + struct extent_buffer *leaf = path->nodes[0]; + int nritems = btrfs_header_nritems(leaf); + + if (slot >= nritems) { + if (ins_nr > 0) { + u64 last_extent = 0; + + ret = copy_items(trans, inode, dst_path, path, + &last_extent, start_slot, + ins_nr, 1, 0); + /* can't be 1, extent items aren't processed */ + ASSERT(ret <= 0); + if (ret < 0) + return ret; + ins_nr = 0; + } + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) + break; + + if (ins_nr == 0) + start_slot = slot; + ins_nr++; + path->slots[0]++; + cond_resched(); + } + if (ins_nr > 0) { + u64 last_extent = 0; + + ret = copy_items(trans, inode, dst_path, path, + &last_extent, start_slot, + ins_nr, 1, 0); + /* can't be 1, extent items aren't processed */ + ASSERT(ret <= 0); + if (ret < 0) + return ret; + } + + return 0; +} + +/* + * If the no holes feature is enabled we need to make sure any hole between the + * last extent and the i_size of our inode is explicitly marked in the log. This + * is to make sure that doing something like: + * + * 1) create file with 128Kb of data + * 2) truncate file to 64Kb + * 3) truncate file to 256Kb + * 4) fsync file + * 5) + * 6) mount fs and trigger log replay + * + * Will give us a file with a size of 256Kb, the first 64Kb of data match what + * the file had in its first 64Kb of data at step 1 and the last 192Kb of the + * file correspond to a hole. The presence of explicit holes in a log tree is + * what guarantees that log replay will remove/adjust file extent items in the + * fs/subvol tree. + * + * Here we do not need to care about holes between extents, that is already done + * by copy_items(). We also only need to do this in the full sync path, where we + * lookup for extents from the fs/subvol tree only. In the fast path case, we + * lookup the list of modified extent maps and if any represents a hole, we + * insert a corresponding extent representing a hole in the log tree. + */ +static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path) +{ + int ret; + struct btrfs_key key; + u64 hole_start; + u64 hole_size; + struct extent_buffer *leaf; + struct btrfs_root *log = root->log_root; + const u64 ino = btrfs_ino(inode); + const u64 i_size = i_size_read(inode); + + if (!btrfs_fs_incompat(root->fs_info, NO_HOLES)) + return 0; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ASSERT(ret != 0); + if (ret < 0) + return ret; + + ASSERT(path->slots[0] > 0); + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { + /* inode does not have any extents */ + hole_start = 0; + hole_size = i_size; + } else { + struct btrfs_file_extent_item *extent; + u64 len; + + /* + * If there's an extent beyond i_size, an explicit hole was + * already inserted by copy_items(). + */ + if (key.offset >= i_size) + return 0; + + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, extent) == + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(leaf, + path->slots[0], + extent); + ASSERT(len == i_size); + return 0; + } + + len = btrfs_file_extent_num_bytes(leaf, extent); + /* Last extent goes beyond i_size, no need to log a hole. */ + if (key.offset + len > i_size) + return 0; + hole_start = key.offset + len; + hole_size = i_size - hole_start; + } + btrfs_release_path(path); + + /* Last extent ends at i_size. */ + if (hole_size == 0) + return 0; + + hole_size = ALIGN(hole_size, root->sectorsize); + ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, + hole_size, 0, hole_size, 0, 0, 0); + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -4161,6 +4444,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; u64 logged_isize = 0; + bool need_log_inode_item = true; path = btrfs_alloc_path(); if (!path) @@ -4269,11 +4553,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } else { if (inode_only == LOG_INODE_ALL) fast_search = true; - ret = log_inode_item(trans, log, dst_path, inode); - if (ret) { - err = ret; - goto out_unlock; - } goto log_extents; } @@ -4296,6 +4575,28 @@ again: if (min_key.type > max_key.type) break; + if (min_key.type == BTRFS_INODE_ITEM_KEY) + need_log_inode_item = false; + + /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ + if (min_key.type == BTRFS_XATTR_ITEM_KEY) { + if (ins_nr == 0) + goto next_slot; + ret = copy_items(trans, inode, dst_path, path, + &last_extent, ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret < 0) { + err = ret; + goto out_unlock; + } + ins_nr = 0; + if (ret) { + btrfs_release_path(path); + continue; + } + goto next_slot; + } + src = path->nodes[0]; if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { ins_nr++; @@ -4363,9 +4664,26 @@ next_slot: ins_nr = 0; } + btrfs_release_path(path); + btrfs_release_path(dst_path); + err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); + if (err) + goto out_unlock; + if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { + btrfs_release_path(path); + btrfs_release_path(dst_path); + err = btrfs_log_trailing_hole(trans, root, inode, path); + if (err) + goto out_unlock; + } log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); + if (need_log_inode_item) { + err = log_inode_item(trans, log, dst_path, inode); + if (err) + goto out_unlock; + } if (fast_search) { /* * Some ordered extents started by fsync might have completed @@ -4694,6 +5012,94 @@ next_dir_inode: return ret; } +static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_log_ctx *ctx) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_root *root = BTRFS_I(inode)->root; + const u64 ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->skip_locking = 1; + path->search_commit_root = 1; + + key.objectid = ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + u32 cur_offset = 0; + u32 item_size; + unsigned long ptr; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ + if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) + break; + + item_size = btrfs_item_size_nr(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); + while (cur_offset < item_size) { + struct btrfs_key inode_key; + struct inode *dir_inode; + + inode_key.type = BTRFS_INODE_ITEM_KEY; + inode_key.offset = 0; + + if (key.type == BTRFS_INODE_EXTREF_KEY) { + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *) + (ptr + cur_offset); + inode_key.objectid = btrfs_inode_extref_parent( + leaf, extref); + cur_offset += sizeof(*extref); + cur_offset += btrfs_inode_extref_name_len(leaf, + extref); + } else { + inode_key.objectid = key.offset; + cur_offset = item_size; + } + + dir_inode = btrfs_iget(root->fs_info->sb, &inode_key, + root, NULL); + /* If parent inode was deleted, skip it. */ + if (IS_ERR(dir_inode)) + continue; + + ret = btrfs_log_inode(trans, root, dir_inode, + LOG_INODE_ALL, 0, LLONG_MAX, ctx); + iput(dir_inode); + if (ret) + goto out; + } + path->slots[0]++; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. A minimal inode and backref @@ -4713,9 +5119,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct dentry *old_parent = NULL; int ret = 0; u64 last_committed = root->fs_info->last_trans_committed; - const struct dentry * const first_parent = parent; - const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans > - last_committed); bool log_dentries = false; struct inode *orig_inode = inode; @@ -4776,6 +5179,53 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries) log_dentries = true; + /* + * On unlink we must make sure all our current and old parent directores + * inodes are fully logged. This is to prevent leaving dangling + * directory index entries in directories that were our parents but are + * not anymore. Not doing this results in old parent directory being + * impossible to delete after log replay (rmdir will always fail with + * error -ENOTEMPTY). + * + * Example 1: + * + * mkdir testdir + * touch testdir/foo + * ln testdir/foo testdir/bar + * sync + * unlink testdir/bar + * xfs_io -c fsync testdir/foo + * + * mount fs, triggers log replay + * + * If we don't log the parent directory (testdir), after log replay the + * directory still has an entry pointing to the file inode using the bar + * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and + * the file inode has a link count of 1. + * + * Example 2: + * + * mkdir testdir + * touch foo + * ln foo testdir/foo2 + * ln foo testdir/foo3 + * sync + * unlink testdir/foo3 + * xfs_io -c fsync foo + * + * mount fs, triggers log replay + * + * Similar as the first example, after log replay the parent directory + * testdir still has an entry pointing to the inode file with name foo3 + * but the file inode does not have a matching BTRFS_INODE_REF_KEY item + * and has a link count of 2. + */ + if (BTRFS_I(inode)->last_unlink_trans > last_committed) { + ret = btrfs_log_all_parents(trans, orig_inode, ctx); + if (ret) + goto end_trans; + } + while (1) { if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) break; @@ -4784,23 +5234,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (root != BTRFS_I(inode)->root) break; - /* - * On unlink we must make sure our immediate parent directory - * inode is fully logged. This is to prevent leaving dangling - * directory index entries and a wrong directory inode's i_size. - * Not doing so can result in a directory being impossible to - * delete after log replay (rmdir will always fail with error - * -ENOTEMPTY). - */ - if (did_unlink && parent == first_parent) - inode_only = LOG_INODE_ALL; - else - inode_only = LOG_INODE_EXISTS; - - if (BTRFS_I(inode)->generation > - root->fs_info->last_trans_committed || - inode_only == LOG_INODE_ALL) { - ret = btrfs_log_inode(trans, root, inode, inode_only, + if (BTRFS_I(inode)->generation > last_committed) { + ret = btrfs_log_inode(trans, root, inode, + LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); if (ret) goto end_trans; @@ -4888,7 +5324,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = walk_log_tree(trans, log_root_tree, &wc); if (ret) { - btrfs_error(fs_info, ret, "Failed to pin buffers while " + btrfs_std_error(fs_info, ret, "Failed to pin buffers while " "recovering log root tree."); goto error; } @@ -4902,7 +5338,7 @@ again: ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); if (ret < 0) { - btrfs_error(fs_info, ret, + btrfs_std_error(fs_info, ret, "Couldn't find tree log root."); goto error; } @@ -4920,7 +5356,7 @@ again: log = btrfs_read_fs_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); - btrfs_error(fs_info, ret, + btrfs_std_error(fs_info, ret, "Couldn't read tree log root."); goto error; } @@ -4935,7 +5371,7 @@ again: free_extent_buffer(log->node); free_extent_buffer(log->commit_root); kfree(log); - btrfs_error(fs_info, ret, "Couldn't read target root " + btrfs_std_error(fs_info, ret, "Couldn't read target root " "for tree log recovery."); goto error; }