These changes are the raw update to linux-4.4.6-rt14 kernel sources.
[kvmfornfv.git] kernel/drivers/block/loop.c
index cef6fa8..423f4ca 100644
@@ -164,6 +164,62 @@ static loff_t get_loop_size(struct loop_device *lo, struct file *file)
        return get_size(lo->lo_offset, lo->lo_sizelimit, file);
 }
 
+static void __loop_update_dio(struct loop_device *lo, bool dio)
+{
+       struct file *file = lo->lo_backing_file;
+       struct address_space *mapping = file->f_mapping;
+       struct inode *inode = mapping->host;
+       unsigned short sb_bsize = 0;
+       unsigned dio_align = 0;
+       bool use_dio;
+
+       if (inode->i_sb->s_bdev) {
+               sb_bsize = bdev_logical_block_size(inode->i_sb->s_bdev);
+               dio_align = sb_bsize - 1;
+       }
+
+       /*
+        * We support direct I/O only if lo_offset is aligned with the
+        * logical I/O size of the backing device, the logical block
+        * size of the loop device is at least the backing device's,
+        * and the loop device needs no transfer transformation.
+        *
+        * TODO: the above condition may be relaxed in the future, and
+        * direct I/O may then be switched at runtime, because most
+        * requests in sane applications should be PAGE_SIZE aligned.
+        */
+       if (dio) {
+               if (queue_logical_block_size(lo->lo_queue) >= sb_bsize &&
+                               !(lo->lo_offset & dio_align) &&
+                               mapping->a_ops->direct_IO &&
+                               !lo->transfer)
+                       use_dio = true;
+               else
+                       use_dio = false;
+       } else {
+               use_dio = false;
+       }
+
+       if (lo->use_dio == use_dio)
+               return;
+
+       /* flush dirty pages before changing direct IO */
+       vfs_fsync(file, 0);
+
+       /*
+        * LO_FLAGS_DIRECT_IO is handled like LO_FLAGS_READ_ONLY: both
+        * are set from the kernel, and losetup picks them up via
+        * ioctl(LOOP_GET_STATUS)
+        */
+       blk_mq_freeze_queue(lo->lo_queue);
+       lo->use_dio = use_dio;
+       if (use_dio)
+               lo->lo_flags |= LO_FLAGS_DIRECT_IO;
+       else
+               lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
+       blk_mq_unfreeze_queue(lo->lo_queue);
+}
+
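
A note on the condition documented above: direct I/O stays available only
while lo_offset is a multiple of the backing device's logical block size.
A hedged userspace sketch (device path, argument handling, and output are
illustrative, not part of this patch) that queries that size through the
standard BLKSSZGET ioctl before choosing an offset:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/fs.h>       /* BLKSSZGET */

    int main(int argc, char **argv)
    {
        int fd, bsize = 0;

        fd = open(argc > 1 ? argv[1] : "/dev/sda", O_RDONLY);
        if (fd < 0 || ioctl(fd, BLKSSZGET, &bsize) < 0) {
            perror("BLKSSZGET");
            return 1;
        }
        /* Offsets that are multiples of bsize pass the
         * !(lo->lo_offset & dio_align) check in __loop_update_dio(). */
        printf("logical block size: %d\n", bsize);
        close(fd);
        return 0;
    }
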
 static int
 figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit)
 {
@@ -389,6 +445,89 @@ static int lo_req_flush(struct loop_device *lo, struct request *rq)
        return ret;
 }
 
+static inline void handle_partial_read(struct loop_cmd *cmd, long bytes)
+{
+       if (bytes < 0 || (cmd->rq->cmd_flags & REQ_WRITE))
+               return;
+
+       if (unlikely(bytes < blk_rq_bytes(cmd->rq))) {
+               struct bio *bio = cmd->rq->bio;
+
+               bio_advance(bio, bytes);
+               zero_fill_bio(bio);
+       }
+}
+
+static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
+{
+       struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
+       struct request *rq = cmd->rq;
+
+       handle_partial_read(cmd, ret);
+
+       if (ret > 0)
+               ret = 0;
+       else if (ret < 0)
+               ret = -EIO;
+
+       blk_mq_complete_request(rq, ret);
+}
+
+static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
+                    loff_t pos, bool rw)
+{
+       struct iov_iter iter;
+       struct bio_vec *bvec;
+       struct bio *bio = cmd->rq->bio;
+       struct file *file = lo->lo_backing_file;
+       int ret;
+
+       /* nomerge for loop request queue */
+       WARN_ON(cmd->rq->bio != cmd->rq->biotail);
+
+       bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+       iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
+                     bio_segments(bio), blk_rq_bytes(cmd->rq));
+
+       cmd->iocb.ki_pos = pos;
+       cmd->iocb.ki_filp = file;
+       cmd->iocb.ki_complete = lo_rw_aio_complete;
+       cmd->iocb.ki_flags = IOCB_DIRECT;
+
+       if (rw == WRITE)
+               ret = file->f_op->write_iter(&cmd->iocb, &iter);
+       else
+               ret = file->f_op->read_iter(&cmd->iocb, &iter);
+
+       if (ret != -EIOCBQUEUED)
+               cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
+       return 0;
+}
+
+static inline int lo_rw_simple(struct loop_device *lo,
+               struct request *rq, loff_t pos, bool rw)
+{
+       struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+       if (cmd->use_aio)
+               return lo_rw_aio(lo, cmd, pos, rw);
+
+       /*
+        * lo_write_simple and lo_read_simple should have been covered
+        * by an io-submit-style function like lo_rw_aio(). One blocker
+        * is that lo_read_simple() needs to call flush_dcache_page()
+        * after a page is written from the kernel, which isn't easy to
+        * handle in an io-submit-style function that submits all
+        * segments of the request at once. Direct read I/O doesn't
+        * need flush_dcache_page() at all.
+        */
+       if (rw == WRITE)
+               return lo_write_simple(lo, rq, pos);
+       else
+               return lo_read_simple(lo, rq, pos);
+}
+
 static int do_req_filebacked(struct loop_device *lo, struct request *rq)
 {
        loff_t pos;
@@ -404,13 +543,13 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
                else if (lo->transfer)
                        ret = lo_write_transfer(lo, rq, pos);
                else
-                       ret = lo_write_simple(lo, rq, pos);
+                       ret = lo_rw_simple(lo, rq, pos, WRITE);
 
        } else {
                if (lo->transfer)
                        ret = lo_read_transfer(lo, rq, pos);
                else
-                       ret = lo_read_simple(lo, rq, pos);
+                       ret = lo_rw_simple(lo, rq, pos, READ);
        }
 
        return ret;
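
The kiocb-based path above submits the whole request through the backing
file's ->read_iter()/->write_iter(), and ki_complete fires out of line when
the I/O finishes; handle_partial_read() then zero-fills whatever a short
read past EOF left uncovered. A hedged userspace analogue with libaio
(file name, the 4096-byte size, and -laio linkage are assumptions for
illustration, not the driver's API):

    /* Submit one O_DIRECT read asynchronously and zero-fill the tail of
     * a short read, mirroring the lo_rw_aio()/handle_partial_read() pair. */
    #define _GNU_SOURCE             /* for O_DIRECT */
    #include <libaio.h>
    #include <fcntl.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
        io_context_t ctx = 0;
        struct iocb cb, *cbs[1] = { &cb };
        struct io_event ev;
        void *buf;
        int fd = open("backing.img", O_RDONLY | O_DIRECT);

        if (fd < 0 || io_setup(1, &ctx) < 0)
            return 1;
        if (posix_memalign(&buf, 4096, 4096))
            return 1;

        io_prep_pread(&cb, fd, buf, 4096, 0);
        if (io_submit(ctx, 1, cbs) != 1)
            return 1;
        if (io_getevents(ctx, 1, 1, &ev, NULL) != 1)
            return 1;

        /* Short read: zero the remainder, as the driver does for
         * requests extending past end of file. */
        if ((long)ev.res >= 0 && (long)ev.res < 4096)
            memset((char *)buf + ev.res, 0, 4096 - ev.res);

        io_destroy(ctx);
        return 0;
    }
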
@@ -421,6 +560,12 @@ struct switch_request {
        struct completion wait;
 };
 
+static inline void loop_update_dio(struct loop_device *lo)
+{
+       __loop_update_dio(lo, io_is_direct(lo->lo_backing_file) |
+                       lo->use_dio);
+}
+
 /*
  * Do the actual switch; called from the BIO completion routine
  */
@@ -441,6 +586,7 @@ static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
                mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
        lo->old_gfp_mask = mapping_gfp_mask(mapping);
        mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
+       loop_update_dio(lo);
 }
 
 /*
@@ -474,6 +620,28 @@ static int loop_flush(struct loop_device *lo)
        return loop_switch(lo, NULL);
 }
 
+static void loop_reread_partitions(struct loop_device *lo,
+                                  struct block_device *bdev)
+{
+       int rc;
+
+       /*
+        * bd_mutex is already held in the release path, so don't
+        * acquire it again if this function is called from there.
+        *
+        * If the partition reread isn't from the release path,
+        * lo_refcnt must be at least one, and it can only drop to
+        * zero when the current holder is released.
+        */
+       if (!atomic_read(&lo->lo_refcnt))
+               rc = __blkdev_reread_part(bdev);
+       else
+               rc = blkdev_reread_part(bdev);
+       if (rc)
+               pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n",
+                       __func__, lo->lo_number, lo->lo_file_name, rc);
+}
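
The rescan performed here is the same operation userspace can request
directly; a hedged sketch (device path illustrative) of triggering it with
the long-standing BLKRRPART ioctl:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/fs.h>       /* BLKRRPART */

    int main(void)
    {
        int fd = open("/dev/loop0", O_RDONLY);

        if (fd < 0 || ioctl(fd, BLKRRPART, NULL) < 0) {
            perror("BLKRRPART");
            return 1;
        }
        close(fd);
        return 0;
    }
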
+
 /*
  * loop_change_fd switched the backing store of a loopback device to
  * a new file. This is useful for operating system installers to free up
@@ -522,7 +690,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 
        fput(old_file);
        if (lo->lo_flags & LO_FLAGS_PARTSCAN)
-               ioctl_by_bdev(bdev, BLKRRPART, 0);
+               loop_reread_partitions(lo, bdev);
        return 0;
 
  out_putf:
@@ -566,7 +734,7 @@ static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
 
        spin_lock_irq(&lo->lo_lock);
        if (lo->lo_backing_file)
-               p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1);
+               p = file_path(lo->lo_backing_file, buf, PAGE_SIZE - 1);
        spin_unlock_irq(&lo->lo_lock);
 
        if (IS_ERR_OR_NULL(p))
@@ -605,11 +773,19 @@ static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf)
        return sprintf(buf, "%s\n", partscan ? "1" : "0");
 }
 
+static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf)
+{
+       int dio = (lo->lo_flags & LO_FLAGS_DIRECT_IO);
+
+       return sprintf(buf, "%s\n", dio ? "1" : "0");
+}
+
 LOOP_ATTR_RO(backing_file);
 LOOP_ATTR_RO(offset);
 LOOP_ATTR_RO(sizelimit);
 LOOP_ATTR_RO(autoclear);
 LOOP_ATTR_RO(partscan);
+LOOP_ATTR_RO(dio);
 
 static struct attribute *loop_attrs[] = {
        &loop_attr_backing_file.attr,
@@ -617,6 +793,7 @@ static struct attribute *loop_attrs[] = {
        &loop_attr_sizelimit.attr,
        &loop_attr_autoclear.attr,
        &loop_attr_partscan.attr,
+       &loop_attr_dio.attr,
        NULL,
 };
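
With the attribute wired up, the flag is visible at
/sys/block/loopN/loop/dio. A hedged read-back sketch (device number
illustrative):

    #include <stdio.h>

    int main(void)
    {
        char buf[8] = "";
        FILE *f = fopen("/sys/block/loop0/loop/dio", "r");

        if (!f || !fgets(buf, sizeof(buf), f)) {
            perror("loop dio attribute");
            return 1;
        }
        printf("direct I/O enabled: %s", buf);  /* "1\n" or "0\n" */
        fclose(f);
        return 0;
    }
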
 
@@ -653,7 +830,7 @@ static void loop_config_discard(struct loop_device *lo)
            lo->lo_encrypt_key_size) {
                q->limits.discard_granularity = 0;
                q->limits.discard_alignment = 0;
-               q->limits.max_discard_sectors = 0;
+               blk_queue_max_discard_sectors(q, 0);
                q->limits.discard_zeroes_data = 0;
                queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
                return;
@@ -661,11 +838,28 @@ static void loop_config_discard(struct loop_device *lo)
 
        q->limits.discard_granularity = inode->i_sb->s_blocksize;
        q->limits.discard_alignment = 0;
-       q->limits.max_discard_sectors = UINT_MAX >> 9;
+       blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
        q->limits.discard_zeroes_data = 1;
        queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
 }
 
+static void loop_unprepare_queue(struct loop_device *lo)
+{
+       flush_kthread_worker(&lo->worker);
+       kthread_stop(lo->worker_task);
+}
+
+static int loop_prepare_queue(struct loop_device *lo)
+{
+       init_kthread_worker(&lo->worker);
+       lo->worker_task = kthread_run(kthread_worker_fn,
+                       &lo->worker, "loop%d", lo->lo_number);
+       if (IS_ERR(lo->worker_task))
+               return -ENOMEM;
+       set_user_nice(lo->worker_task, MIN_NICE);
+       return 0;
+}
+
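
The move from a per-device workqueue to a dedicated kthread_worker
serializes all I/O for the device on one thread and lets it run at
MIN_NICE. A hedged, self-contained sketch of the same pre-4.9
kthread_worker API used above (module name and work payload are
illustrative):

    #include <linux/module.h>
    #include <linux/kthread.h>
    #include <linux/sched.h>
    #include <linux/err.h>

    static struct kthread_worker demo_worker;
    static struct task_struct *demo_task;
    static struct kthread_work demo_work;

    static void demo_work_fn(struct kthread_work *work)
    {
        pr_info("demo: work ran in %s\n", current->comm);
    }

    static int __init demo_init(void)
    {
        init_kthread_worker(&demo_worker);
        demo_task = kthread_run(kthread_worker_fn, &demo_worker,
                                "demo_worker");
        if (IS_ERR(demo_task))
            return PTR_ERR(demo_task);

        init_kthread_work(&demo_work, demo_work_fn);
        queue_kthread_work(&demo_worker, &demo_work);
        return 0;
    }

    static void __exit demo_exit(void)
    {
        flush_kthread_worker(&demo_worker);
        kthread_stop(demo_task);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");
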
 static int loop_set_fd(struct loop_device *lo, fmode_t mode,
                       struct block_device *bdev, unsigned int arg)
 {
@@ -723,17 +917,15 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
        size = get_loop_size(lo, file);
        if ((loff_t)(sector_t)size != size)
                goto out_putf;
-       error = -ENOMEM;
-       lo->wq = alloc_workqueue("kloopd%d",
-                       WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 16,
-                       lo->lo_number);
-       if (!lo->wq)
+       error = loop_prepare_queue(lo);
+       if (error)
                goto out_putf;
 
        error = 0;
 
        set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
 
+       lo->use_dio = false;
        lo->lo_blocksize = lo_blocksize;
        lo->lo_device = bdev;
        lo->lo_flags = lo_flags;
@@ -747,6 +939,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
        if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
                blk_queue_flush(lo->lo_queue, REQ_FLUSH);
 
+       loop_update_dio(lo);
        set_capacity(lo->lo_disk, size);
        bd_set_size(bdev, size << 9);
        loop_sysfs_init(lo);
@@ -759,7 +952,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
        if (part_shift)
                lo->lo_flags |= LO_FLAGS_PARTSCAN;
        if (lo->lo_flags & LO_FLAGS_PARTSCAN)
-               ioctl_by_bdev(bdev, BLKRRPART, 0);
+               loop_reread_partitions(lo, bdev);
 
        /* Grab the block_device to prevent its destruction after we
         * put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev).
@@ -831,7 +1024,7 @@ static int loop_clr_fd(struct loop_device *lo)
         * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
         * command to fail with EBUSY.
         */
-       if (lo->lo_refcnt > 1) {
+       if (atomic_read(&lo->lo_refcnt) > 1) {
                lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
                mutex_unlock(&lo->lo_ctl_mutex);
                return 0;
@@ -840,6 +1033,9 @@ static int loop_clr_fd(struct loop_device *lo)
        if (filp == NULL)
                return -EINVAL;
 
+       /* freeze request queue during the transition */
+       blk_mq_freeze_queue(lo->lo_queue);
+
        spin_lock_irq(&lo->lo_lock);
        lo->lo_state = Lo_rundown;
        lo->lo_backing_file = NULL;
@@ -871,13 +1067,14 @@ static int loop_clr_fd(struct loop_device *lo)
        lo->lo_state = Lo_unbound;
        /* This is safe: open() is still holding a reference. */
        module_put(THIS_MODULE);
+       blk_mq_unfreeze_queue(lo->lo_queue);
+
        if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev)
-               ioctl_by_bdev(bdev, BLKRRPART, 0);
+               loop_reread_partitions(lo, bdev);
        lo->lo_flags = 0;
        if (!part_shift)
                lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
-       destroy_workqueue(lo->wq);
-       lo->wq = NULL;
+       loop_unprepare_queue(lo);
        mutex_unlock(&lo->lo_ctl_mutex);
        /*
         * Need not hold lo_ctl_mutex to fput backing file.
@@ -949,7 +1146,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
             !(lo->lo_flags & LO_FLAGS_PARTSCAN)) {
                lo->lo_flags |= LO_FLAGS_PARTSCAN;
                lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN;
-               ioctl_by_bdev(lo->lo_device, BLKRRPART, 0);
+               loop_reread_partitions(lo, lo->lo_device);
        }
 
        lo->lo_encrypt_key_size = info->lo_encrypt_key_size;
@@ -961,6 +1158,9 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
                lo->lo_key_owner = uid;
        }
 
+       /* update dio if lo_offset or transfer is changed */
+       __loop_update_dio(lo, lo->use_dio);
+
        return 0;
 }
 
@@ -1111,6 +1311,20 @@ static int loop_set_capacity(struct loop_device *lo, struct block_device *bdev)
        return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit);
 }
 
+static int loop_set_dio(struct loop_device *lo, unsigned long arg)
+{
+       int error = -ENXIO;
+       if (lo->lo_state != Lo_bound)
+               goto out;
+
+       __loop_update_dio(lo, !!arg);
+       if (lo->use_dio == !!arg)
+               return 0;
+       error = -EINVAL;
+ out:
+       return error;
+}
+
 static int lo_ioctl(struct block_device *bdev, fmode_t mode,
        unsigned int cmd, unsigned long arg)
 {
@@ -1154,6 +1368,11 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
                if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
                        err = loop_set_capacity(lo, bdev);
                break;
+       case LOOP_SET_DIRECT_IO:
+               err = -EPERM;
+               if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
+                       err = loop_set_dio(lo, arg);
+               break;
        default:
                err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
        }
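
Once a device is bound, the new case can be exercised from userspace; a
hedged sketch (device path illustrative; the fallback number matches the
LOOP_SET_DIRECT_IO value this series adds to <linux/loop.h>):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/loop.h>

    #ifndef LOOP_SET_DIRECT_IO
    #define LOOP_SET_DIRECT_IO 0x4C08   /* fallback for older headers */
    #endif

    int main(void)
    {
        int fd = open("/dev/loop0", O_RDWR);

        /* 1 requests direct I/O; the driver answers -EINVAL if the
         * alignment or transfer constraints rule it out. */
        if (fd < 0 || ioctl(fd, LOOP_SET_DIRECT_IO, 1UL) < 0) {
            perror("LOOP_SET_DIRECT_IO");
            return 1;
        }
        close(fd);
        return 0;
    }
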
@@ -1330,9 +1549,7 @@ static int lo_open(struct block_device *bdev, fmode_t mode)
                goto out;
        }
 
-       mutex_lock(&lo->lo_ctl_mutex);
-       lo->lo_refcnt++;
-       mutex_unlock(&lo->lo_ctl_mutex);
+       atomic_inc(&lo->lo_refcnt);
 out:
        mutex_unlock(&loop_index_mutex);
        return err;
@@ -1343,11 +1560,10 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
        struct loop_device *lo = disk->private_data;
        int err;
 
-       mutex_lock(&lo->lo_ctl_mutex);
-
-       if (--lo->lo_refcnt)
-               goto out;
+       if (atomic_dec_return(&lo->lo_refcnt))
+               return;
 
+       mutex_lock(&lo->lo_ctl_mutex);
        if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
                /*
                 * In autoclear mode, stop the loop thread
@@ -1364,7 +1580,6 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
                loop_flush(lo);
        }
 
-out:
        mutex_unlock(&lo->lo_ctl_mutex);
 }
 
@@ -1438,23 +1653,13 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
        if (lo->lo_state != Lo_bound)
                return -EIO;
 
-       if (cmd->rq->cmd_flags & REQ_WRITE) {
-               struct loop_device *lo = cmd->rq->q->queuedata;
-               bool need_sched = true;
-
-               spin_lock_irq(&lo->lo_lock);
-               if (lo->write_started)
-                       need_sched = false;
-               else
-                       lo->write_started = true;
-               list_add_tail(&cmd->list, &lo->write_cmd_head);
-               spin_unlock_irq(&lo->lo_lock);
+       if (lo->use_dio && !(cmd->rq->cmd_flags & (REQ_FLUSH |
+                                       REQ_DISCARD)))
+               cmd->use_aio = true;
+       else
+               cmd->use_aio = false;
 
-               if (need_sched)
-                       queue_work(lo->wq, &lo->write_work);
-       } else {
-               queue_work(lo->wq, &cmd->read_work);
-       }
+       queue_kthread_work(&lo->worker, &cmd->work);
 
        return BLK_MQ_RQ_QUEUE_OK;
 }
@@ -1463,48 +1668,24 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 {
        const bool write = cmd->rq->cmd_flags & REQ_WRITE;
        struct loop_device *lo = cmd->rq->q->queuedata;
-       int ret = -EIO;
+       int ret = 0;
 
-       if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY))
+       if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) {
+               ret = -EIO;
                goto failed;
+       }
 
        ret = do_req_filebacked(lo, cmd->rq);
-
  failed:
-       if (ret)
-               cmd->rq->errors = -EIO;
-       blk_mq_complete_request(cmd->rq);
-}
-
-static void loop_queue_write_work(struct work_struct *work)
-{
-       struct loop_device *lo =
-               container_of(work, struct loop_device, write_work);
-       LIST_HEAD(cmd_list);
-
-       spin_lock_irq(&lo->lo_lock);
- repeat:
-       list_splice_init(&lo->write_cmd_head, &cmd_list);
-       spin_unlock_irq(&lo->lo_lock);
-
-       while (!list_empty(&cmd_list)) {
-               struct loop_cmd *cmd = list_first_entry(&cmd_list,
-                               struct loop_cmd, list);
-               list_del_init(&cmd->list);
-               loop_handle_cmd(cmd);
-       }
-
-       spin_lock_irq(&lo->lo_lock);
-       if (!list_empty(&lo->write_cmd_head))
-               goto repeat;
-       lo->write_started = false;
-       spin_unlock_irq(&lo->lo_lock);
+       /* complete non-aio request */
+       if (!cmd->use_aio || ret)
+               blk_mq_complete_request(cmd->rq, ret ? -EIO : 0);
 }
 
-static void loop_queue_read_work(struct work_struct *work)
+static void loop_queue_work(struct kthread_work *work)
 {
        struct loop_cmd *cmd =
-               container_of(work, struct loop_cmd, read_work);
+               container_of(work, struct loop_cmd, work);
 
        loop_handle_cmd(cmd);
 }
@@ -1516,7 +1697,7 @@ static int loop_init_request(void *data, struct request *rq,
        struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
 
        cmd->rq = rq;
-       INIT_WORK(&cmd->read_work, loop_queue_read_work);
+       init_kthread_work(&cmd->work, loop_queue_work);
 
        return 0;
 }
@@ -1572,8 +1753,11 @@ static int loop_add(struct loop_device **l, int i)
        }
        lo->lo_queue->queuedata = lo;
 
-       INIT_LIST_HEAD(&lo->write_cmd_head);
-       INIT_WORK(&lo->write_work, loop_queue_write_work);
+       /*
+        * It doesn't make sense to enable merging because the I/O
+        * submitted to the backing file is handled page by page.
+        */
+       queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
 
        disk = lo->lo_disk = alloc_disk(1 << part_shift);
        if (!disk)
@@ -1601,6 +1785,7 @@ static int loop_add(struct loop_device **l, int i)
                disk->flags |= GENHD_FL_NO_PART_SCAN;
        disk->flags |= GENHD_FL_EXT_DEVT;
        mutex_init(&lo->lo_ctl_mutex);
+       atomic_set(&lo->lo_refcnt, 0);
        lo->lo_number           = i;
        spin_lock_init(&lo->lo_lock);
        disk->major             = LOOP_MAJOR;
@@ -1718,7 +1903,7 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd,
                        mutex_unlock(&lo->lo_ctl_mutex);
                        break;
                }
-               if (lo->lo_refcnt > 0) {
+               if (atomic_read(&lo->lo_refcnt) > 0) {
                        ret = -EBUSY;
                        mutex_unlock(&lo->lo_ctl_mutex);
                        break;