These changes are the raw update to linux-4.4.6-rt14. Kernel sources
[kvmfornfv.git] / kernel / fs / btrfs / send.c
index a1216f9..63a6152 100644 (file)
@@ -243,6 +243,7 @@ struct waiting_dir_move {
         * after this directory is moved, we can try to rmdir the ino rmdir_ino.
         */
        u64 rmdir_ino;
+       bool orphanized;
 };
 
 struct orphan_dir_info {
@@ -1158,6 +1159,9 @@ struct backref_ctx {
        /* may be truncated in case it's the last extent in a file */
        u64 extent_len;
 
+       /* data offset in the file extent item */
+       u64 data_offset;
+
        /* Just to check for bugs in backref resolving */
        int found_itself;
 };
@@ -1221,7 +1225,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
        if (ret < 0)
                return ret;
 
-       if (offset + bctx->extent_len > i_size)
+       if (offset + bctx->data_offset + bctx->extent_len > i_size)
                return 0;
 
        /*
@@ -1363,6 +1367,19 @@ static int find_extent_clone(struct send_ctx *sctx,
        backref_ctx->cur_offset = data_offset;
        backref_ctx->found_itself = 0;
        backref_ctx->extent_len = num_bytes;
+       /*
+        * For non-compressed extents iterate_extent_inodes() gives us extent
+        * offsets that already take into account the data offset, but not for
+        * compressed extents, since the offset is logical and not relative to
+        * the physical extent locations. We must take this into account to
+        * avoid sending clone offsets that go beyond the source file's size,
+        * which would result in the clone ioctl failing with -EINVAL on the
+        * receiving end.
+        */
+       if (compressed == BTRFS_COMPRESS_NONE)
+               backref_ctx->data_offset = 0;
+       else
+               backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi);
 
        /*
         * The last extent of a file may be too large due to page alignment.
@@ -1417,16 +1434,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
        }
 
        if (cur_clone_root) {
-               if (compressed != BTRFS_COMPRESS_NONE) {
-                       /*
-                        * Offsets given by iterate_extent_inodes() are relative
-                        * to the start of the extent, we need to add logical
-                        * offset from the file extent item.
-                        * (See why at backref.c:check_extent_in_eb())
-                        */
-                       cur_clone_root->offset += btrfs_file_extent_offset(eb,
-                                                                          fi);
-               }
                *found = cur_clone_root;
                ret = 0;
        } else {
@@ -1462,7 +1469,21 @@ static int read_symlink(struct btrfs_root *root,
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
-       BUG_ON(ret);
+       if (ret) {
+               /*
+                * An empty symlink inode. Can happen in rare error paths when
+                * creating a symlink (transaction committed before the inode
+                * eviction handler removed the symlink inode items and a crash
+                * happened in between or the subvol was snapshoted in between).
+                * Print an informative message to dmesg/syslog so that the user
+                * can delete the symlink.
+                */
+               btrfs_err(root->fs_info,
+                         "Found empty symlink inode %llu at root %llu",
+                         ino, root->root_key.objectid);
+               ret = -EIO;
+               goto out;
+       }
 
        ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
                        struct btrfs_file_extent_item);
@@ -1900,8 +1921,15 @@ static int did_overwrite_ref(struct send_ctx *sctx,
                goto out;
        }
 
-       /* we know that it is or will be overwritten. check this now */
-       if (ow_inode < sctx->send_progress)
+       /*
+        * We know that it is or will be overwritten. Check this now.
+        * The current inode being processed might have been the one that caused
+        * inode 'ino' to be orphanized, therefore check if ow_inode matches
+        * the current inode being processed.
+        */
+       if ((ow_inode < sctx->send_progress) ||
+           (ino != sctx->cur_ino && ow_inode == sctx->cur_ino &&
+            gen == sctx->cur_inode_gen))
                ret = 1;
        else
                ret = 0;
@@ -2223,6 +2251,8 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
        fs_path_reset(dest);
 
        while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
+               struct waiting_dir_move *wdm;
+
                fs_path_reset(name);
 
                if (is_waiting_for_rm(sctx, ino)) {
@@ -2233,7 +2263,11 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
                        break;
                }
 
-               if (is_waiting_for_move(sctx, ino)) {
+               wdm = get_waiting_dir_move(sctx, ino);
+               if (wdm && wdm->orphanized) {
+                       ret = gen_unique_name(sctx, ino, gen, name);
+                       stop = 1;
+               } else if (wdm) {
                        ret = get_first_ref(sctx->parent_root, ino,
                                            &parent_inode, &parent_gen, name);
                } else {
@@ -2323,13 +2357,23 @@ static int send_subvol_begin(struct send_ctx *sctx)
        }
 
        TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
-       TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
-                       sctx->send_root->root_item.uuid);
+
+       if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid))
+               TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
+                           sctx->send_root->root_item.received_uuid);
+       else
+               TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
+                           sctx->send_root->root_item.uuid);
+
        TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
                    le64_to_cpu(sctx->send_root->root_item.ctransid));
        if (parent_root) {
-               TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
-                               sctx->parent_root->root_item.uuid);
+               if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid))
+                       TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+                                    parent_root->root_item.received_uuid);
+               else
+                       TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+                                    parent_root->root_item.uuid);
                TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
                            le64_to_cpu(sctx->parent_root->root_item.ctransid));
        }
@@ -2530,7 +2574,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
        } else if (S_ISSOCK(mode)) {
                cmd = BTRFS_SEND_C_MKSOCK;
        } else {
-               printk(KERN_WARNING "btrfs: unexpected inode type %o",
+               btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
                                (int)(mode & S_IFMT));
                ret = -ENOTSUPP;
                goto out;
@@ -2923,7 +2967,7 @@ static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
        return entry != NULL;
 }
 
-static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
 {
        struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
        struct rb_node *parent = NULL;
@@ -2934,6 +2978,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
                return -ENOMEM;
        dm->ino = ino;
        dm->rmdir_ino = 0;
+       dm->orphanized = orphanized;
 
        while (*p) {
                parent = *p;
@@ -3030,7 +3075,7 @@ static int add_pending_dir_move(struct send_ctx *sctx,
                        goto out;
        }
 
-       ret = add_waiting_dir_move(sctx, pm->ino);
+       ret = add_waiting_dir_move(sctx, pm->ino, is_orphan);
        if (ret)
                goto out;
 
@@ -3353,8 +3398,40 @@ out:
        return ret;
 }
 
+/*
+ * Check if ino ino1 is an ancestor of inode ino2 in the given root.
+ * Return 1 if true, 0 if false and < 0 on error.
+ */
+static int is_ancestor(struct btrfs_root *root,
+                      const u64 ino1,
+                      const u64 ino1_gen,
+                      const u64 ino2,
+                      struct fs_path *fs_path)
+{
+       u64 ino = ino2;
+
+       while (ino > BTRFS_FIRST_FREE_OBJECTID) {
+               int ret;
+               u64 parent;
+               u64 parent_gen;
+
+               fs_path_reset(fs_path);
+               ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path);
+               if (ret < 0) {
+                       if (ret == -ENOENT && ino == ino2)
+                               ret = 0;
+                       return ret;
+               }
+               if (parent == ino1)
+                       return parent_gen == ino1_gen ? 1 : 0;
+               ino = parent;
+       }
+       return 0;
+}
+
 static int wait_for_parent_move(struct send_ctx *sctx,
-                               struct recorded_ref *parent_ref)
+                               struct recorded_ref *parent_ref,
+                               const bool is_orphan)
 {
        int ret = 0;
        u64 ino = parent_ref->dir;
@@ -3374,11 +3451,24 @@ static int wait_for_parent_move(struct send_ctx *sctx,
         * Our current directory inode may not yet be renamed/moved because some
         * ancestor (immediate or not) has to be renamed/moved first. So find if
         * such ancestor exists and make sure our own rename/move happens after
-        * that ancestor is processed.
+        * that ancestor is processed to avoid path build infinite loops (done
+        * at get_cur_path()).
         */
        while (ino > BTRFS_FIRST_FREE_OBJECTID) {
                if (is_waiting_for_move(sctx, ino)) {
-                       ret = 1;
+                       /*
+                        * If the current inode is an ancestor of ino in the
+                        * parent root, we need to delay the rename of the
+                        * current inode, otherwise don't delayed the rename
+                        * because we can end up with a circular dependency
+                        * of renames, resulting in some directories never
+                        * getting the respective rename operations issued in
+                        * the send stream or getting into infinite path build
+                        * loops.
+                        */
+                       ret = is_ancestor(sctx->parent_root,
+                                         sctx->cur_ino, sctx->cur_inode_gen,
+                                         ino, path_before);
                        break;
                }
 
@@ -3420,7 +3510,7 @@ out:
                                           ino,
                                           &sctx->new_refs,
                                           &sctx->deleted_refs,
-                                          false);
+                                          is_orphan);
                if (!ret)
                        ret = 1;
        }
@@ -3589,6 +3679,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
                        }
                }
 
+               if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root &&
+                   can_rename) {
+                       ret = wait_for_parent_move(sctx, cur, is_orphan);
+                       if (ret < 0)
+                               goto out;
+                       if (ret == 1) {
+                               can_rename = false;
+                               *pending_move = 1;
+                       }
+               }
+
                /*
                 * link/move the ref to the new place. If we have an orphan
                 * inode, move it and update valid_path. If not, link or move
@@ -3609,18 +3710,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
                                 * dirs, we always have one new and one deleted
                                 * ref. The deleted ref is ignored later.
                                 */
-                               ret = wait_for_parent_move(sctx, cur);
-                               if (ret < 0)
-                                       goto out;
-                               if (ret) {
-                                       *pending_move = 1;
-                               } else {
-                                       ret = send_rename(sctx, valid_path,
-                                                         cur->full_path);
-                                       if (!ret)
-                                               ret = fs_path_copy(valid_path,
-                                                              cur->full_path);
-                               }
+                               ret = send_rename(sctx, valid_path,
+                                                 cur->full_path);
+                               if (!ret)
+                                       ret = fs_path_copy(valid_path,
+                                                          cur->full_path);
                                if (ret < 0)
                                        goto out;
                        } else {
@@ -4508,8 +4602,21 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
        if (ret < 0)
                goto out;
 
-       TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
-                       clone_root->root->root_item.uuid);
+       /*
+        * If the parent we're using has a received_uuid set then use that as
+        * our clone source as that is what we will look for when doing a
+        * receive.
+        *
+        * This covers the case that we create a snapshot off of a received
+        * subvolume and then use that as the parent and try to receive on a
+        * different host.
+        */
+       if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid))
+               TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+                            clone_root->root->root_item.received_uuid);
+       else
+               TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+                            clone_root->root->root_item.uuid);
        TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
                    le64_to_cpu(clone_root->root->root_item.ctransid));
        TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
@@ -4590,6 +4697,171 @@ tlv_put_failure:
        return ret;
 }
 
+static int send_extent_data(struct send_ctx *sctx,
+                           const u64 offset,
+                           const u64 len)
+{
+       u64 sent = 0;
+
+       if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
+               return send_update_extent(sctx, offset, len);
+
+       while (sent < len) {
+               u64 size = len - sent;
+               int ret;
+
+               if (size > BTRFS_SEND_READ_SIZE)
+                       size = BTRFS_SEND_READ_SIZE;
+               ret = send_write(sctx, offset + sent, size);
+               if (ret < 0)
+                       return ret;
+               if (!ret)
+                       break;
+               sent += ret;
+       }
+       return 0;
+}
+
+static int clone_range(struct send_ctx *sctx,
+                      struct clone_root *clone_root,
+                      const u64 disk_byte,
+                      u64 data_offset,
+                      u64 offset,
+                      u64 len)
+{
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       int ret;
+
+       path = alloc_path_for_send();
+       if (!path)
+               return -ENOMEM;
+
+       /*
+        * We can't send a clone operation for the entire range if we find
+        * extent items in the respective range in the source file that
+        * refer to different extents or if we find holes.
+        * So check for that and do a mix of clone and regular write/copy
+        * operations if needed.
+        *
+        * Example:
+        *
+        * mkfs.btrfs -f /dev/sda
+        * mount /dev/sda /mnt
+        * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
+        * cp --reflink=always /mnt/foo /mnt/bar
+        * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
+        * btrfs subvolume snapshot -r /mnt /mnt/snap
+        *
+        * If when we send the snapshot and we are processing file bar (which
+        * has a higher inode number than foo) we blindly send a clone operation
+        * for the [0, 100K[ range from foo to bar, the receiver ends up getting
+        * a file bar that matches the content of file foo - iow, doesn't match
+        * the content from bar in the original filesystem.
+        */
+       key.objectid = clone_root->ino;
+       key.type = BTRFS_EXTENT_DATA_KEY;
+       key.offset = clone_root->offset;
+       ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
+       if (ret > 0 && path->slots[0] > 0) {
+               btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+               if (key.objectid == clone_root->ino &&
+                   key.type == BTRFS_EXTENT_DATA_KEY)
+                       path->slots[0]--;
+       }
+
+       while (true) {
+               struct extent_buffer *leaf = path->nodes[0];
+               int slot = path->slots[0];
+               struct btrfs_file_extent_item *ei;
+               u8 type;
+               u64 ext_len;
+               u64 clone_len;
+
+               if (slot >= btrfs_header_nritems(leaf)) {
+                       ret = btrfs_next_leaf(clone_root->root, path);
+                       if (ret < 0)
+                               goto out;
+                       else if (ret > 0)
+                               break;
+                       continue;
+               }
+
+               btrfs_item_key_to_cpu(leaf, &key, slot);
+
+               /*
+                * We might have an implicit trailing hole (NO_HOLES feature
+                * enabled). We deal with it after leaving this loop.
+                */
+               if (key.objectid != clone_root->ino ||
+                   key.type != BTRFS_EXTENT_DATA_KEY)
+                       break;
+
+               ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+               type = btrfs_file_extent_type(leaf, ei);
+               if (type == BTRFS_FILE_EXTENT_INLINE) {
+                       ext_len = btrfs_file_extent_inline_len(leaf, slot, ei);
+                       ext_len = PAGE_CACHE_ALIGN(ext_len);
+               } else {
+                       ext_len = btrfs_file_extent_num_bytes(leaf, ei);
+               }
+
+               if (key.offset + ext_len <= clone_root->offset)
+                       goto next;
+
+               if (key.offset > clone_root->offset) {
+                       /* Implicit hole, NO_HOLES feature enabled. */
+                       u64 hole_len = key.offset - clone_root->offset;
+
+                       if (hole_len > len)
+                               hole_len = len;
+                       ret = send_extent_data(sctx, offset, hole_len);
+                       if (ret < 0)
+                               goto out;
+
+                       len -= hole_len;
+                       if (len == 0)
+                               break;
+                       offset += hole_len;
+                       clone_root->offset += hole_len;
+                       data_offset += hole_len;
+               }
+
+               if (key.offset >= clone_root->offset + len)
+                       break;
+
+               clone_len = min_t(u64, ext_len, len);
+
+               if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
+                   btrfs_file_extent_offset(leaf, ei) == data_offset)
+                       ret = send_clone(sctx, offset, clone_len, clone_root);
+               else
+                       ret = send_extent_data(sctx, offset, clone_len);
+
+               if (ret < 0)
+                       goto out;
+
+               len -= clone_len;
+               if (len == 0)
+                       break;
+               offset += clone_len;
+               clone_root->offset += clone_len;
+               data_offset += clone_len;
+next:
+               path->slots[0]++;
+       }
+
+       if (len > 0)
+               ret = send_extent_data(sctx, offset, len);
+       else
+               ret = 0;
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
 static int send_write_or_clone(struct send_ctx *sctx,
                               struct btrfs_path *path,
                               struct btrfs_key *key,
@@ -4598,9 +4870,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
        int ret = 0;
        struct btrfs_file_extent_item *ei;
        u64 offset = key->offset;
-       u64 pos = 0;
        u64 len;
-       u32 l;
        u8 type;
        u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
 
@@ -4628,22 +4898,15 @@ static int send_write_or_clone(struct send_ctx *sctx,
        }
 
        if (clone_root && IS_ALIGNED(offset + len, bs)) {
-               ret = send_clone(sctx, offset, len, clone_root);
-       } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
-               ret = send_update_extent(sctx, offset, len);
+               u64 disk_byte;
+               u64 data_offset;
+
+               disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
+               data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
+               ret = clone_range(sctx, clone_root, disk_byte, data_offset,
+                                 offset, len);
        } else {
-               while (pos < len) {
-                       l = len - pos;
-                       if (l > BTRFS_SEND_READ_SIZE)
-                               l = BTRFS_SEND_READ_SIZE;
-                       ret = send_write(sctx, pos + offset, l);
-                       if (ret < 0)
-                               goto out;
-                       if (!ret)
-                               break;
-                       pos += ret;
-               }
-               ret = 0;
+               ret = send_extent_data(sctx, offset, len);
        }
 out:
        return ret;