These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
diff --git a/kernel/drivers/md/dm-thin.c b/kernel/drivers/md/dm-thin.c
index e22e6c8..a1cc797 100644
--- a/kernel/drivers/md/dm-thin.c
+++ b/kernel/drivers/md/dm-thin.c
@@ -112,22 +112,30 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
 /*
  * Key building.
  */
-static void build_data_key(struct dm_thin_device *td,
-                          dm_block_t b, struct dm_cell_key *key)
+enum lock_space {
+       VIRTUAL,
+       PHYSICAL
+};
+
+static void build_key(struct dm_thin_device *td, enum lock_space ls,
+                     dm_block_t b, dm_block_t e, struct dm_cell_key *key)
 {
-       key->virtual = 0;
+       key->virtual = (ls == VIRTUAL);
        key->dev = dm_thin_dev_id(td);
        key->block_begin = b;
-       key->block_end = b + 1ULL;
+       key->block_end = e;
+}
+
+static void build_data_key(struct dm_thin_device *td, dm_block_t b,
+                          struct dm_cell_key *key)
+{
+       build_key(td, PHYSICAL, b, b + 1llu, key);
 }
 
 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
                              struct dm_cell_key *key)
 {
-       key->virtual = 1;
-       key->dev = dm_thin_dev_id(td);
-       key->block_begin = b;
-       key->block_end = b + 1ULL;
+       build_key(td, VIRTUAL, b, b + 1llu, key);
 }
 
 /*----------------------------------------------------------------*/
@@ -313,6 +321,80 @@ struct thin_c {
 
 /*----------------------------------------------------------------*/
 
+/**
+ * __blkdev_issue_discard_async - queue a discard with async completion
+ * @bdev:      blockdev to issue discard for
+ * @sector:    start sector
+ * @nr_sects:  number of sectors to discard
+ * @gfp_mask:  memory allocation flags (for bio_alloc)
+ * @flags:     BLKDEV_IFL_* flags to control behaviour
+ * @parent_bio: parent discard bio that all sub discards get chained to
+ *
+ * Description:
+ *    Asynchronously issue a discard request for the sectors in question.
+ */
+static int __blkdev_issue_discard_async(struct block_device *bdev, sector_t sector,
+                                       sector_t nr_sects, gfp_t gfp_mask, unsigned long flags,
+                                       struct bio *parent_bio)
+{
+       struct request_queue *q = bdev_get_queue(bdev);
+       int type = REQ_WRITE | REQ_DISCARD;
+       struct bio *bio;
+
+       if (!q || !nr_sects)
+               return -ENXIO;
+
+       if (!blk_queue_discard(q))
+               return -EOPNOTSUPP;
+
+       if (flags & BLKDEV_DISCARD_SECURE) {
+               if (!blk_queue_secdiscard(q))
+                       return -EOPNOTSUPP;
+               type |= REQ_SECURE;
+       }
+
+       /*
+        * Required bio_put occurs in bio_endio thanks to bio_chain below
+        */
+       bio = bio_alloc(gfp_mask, 1);
+       if (!bio)
+               return -ENOMEM;
+
+       bio_chain(bio, parent_bio);
+
+       bio->bi_iter.bi_sector = sector;
+       bio->bi_bdev = bdev;
+       bio->bi_iter.bi_size = nr_sects << 9;
+
+       submit_bio(type, bio);
+
+       return 0;
+}
+
+static bool block_size_is_power_of_two(struct pool *pool)
+{
+       return pool->sectors_per_block_shift >= 0;
+}
+
+static sector_t block_to_sectors(struct pool *pool, dm_block_t b)
+{
+       return block_size_is_power_of_two(pool) ?
+               (b << pool->sectors_per_block_shift) :
+               (b * pool->sectors_per_block);
+}
+
+static int issue_discard(struct thin_c *tc, dm_block_t data_b, dm_block_t data_e,
+                        struct bio *parent_bio)
+{
+       sector_t s = block_to_sectors(tc->pool, data_b);
+       sector_t len = block_to_sectors(tc->pool, data_e - data_b);
+
+       return __blkdev_issue_discard_async(tc->pool_dev->bdev, s, len,
+                                           GFP_NOWAIT, 0, parent_bio);
+}
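As a rough illustration of the helpers above (numbers assumed, not taken from the patch): for a pool with 2048 sectors per block, issue_discard() on data blocks [5, 9) becomes a discard starting at sector 5 * 2048 = 10240 and covering 4 * 2048 = 8192 sectors, submitted via __blkdev_issue_discard_async() and chained to the parent bio, so the parent cannot complete until the sub-discard does.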
+
+/*----------------------------------------------------------------*/
+
 /*
  * wake_worker() is used when new work is queued and when pool_resume is
  * ready to continue deferred IO processing.
@@ -462,6 +544,7 @@ struct dm_thin_endio_hook {
        struct dm_deferred_entry *all_io_entry;
        struct dm_thin_new_mapping *overwrite_mapping;
        struct rb_node rb_node;
+       struct dm_bio_prison_cell *cell;
 };
 
 static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
@@ -474,8 +557,10 @@ static void error_bio_list(struct bio_list *bios, int error)
 {
        struct bio *bio;
 
-       while ((bio = bio_list_pop(bios)))
-               bio_endio(bio, error);
+       while ((bio = bio_list_pop(bios))) {
+               bio->bi_error = error;
+               bio_endio(bio);
+       }
 }
 
 static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error)
@@ -525,16 +610,21 @@ static void requeue_io(struct thin_c *tc)
        requeue_deferred_cells(tc);
 }
 
-static void error_retry_list(struct pool *pool)
+static void error_retry_list_with_code(struct pool *pool, int error)
 {
        struct thin_c *tc;
 
        rcu_read_lock();
        list_for_each_entry_rcu(tc, &pool->active_thins, list)
-               error_thin_bio_list(tc, &tc->retry_on_resume_list, -EIO);
+               error_thin_bio_list(tc, &tc->retry_on_resume_list, error);
        rcu_read_unlock();
 }
 
+static void error_retry_list(struct pool *pool)
+{
+       return error_retry_list_with_code(pool, -EIO);
+}
+
 /*
  * This section of code contains the logic for processing a thin device's IO.
  * Much of the code depends on pool object resources (lists, workqueues, etc)
@@ -542,11 +632,6 @@ static void error_retry_list(struct pool *pool)
  * target.
  */
 
-static bool block_size_is_power_of_two(struct pool *pool)
-{
-       return pool->sectors_per_block_shift >= 0;
-}
-
 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
 {
        struct pool *pool = tc->pool;
@@ -560,6 +645,34 @@ static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
        return block_nr;
 }
 
+/*
+ * Returns the _complete_ blocks that this bio covers.
+ */
+static void get_bio_block_range(struct thin_c *tc, struct bio *bio,
+                               dm_block_t *begin, dm_block_t *end)
+{
+       struct pool *pool = tc->pool;
+       sector_t b = bio->bi_iter.bi_sector;
+       sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT);
+
+       b += pool->sectors_per_block - 1ull; /* so we round up */
+
+       if (block_size_is_power_of_two(pool)) {
+               b >>= pool->sectors_per_block_shift;
+               e >>= pool->sectors_per_block_shift;
+       } else {
+               (void) sector_div(b, pool->sectors_per_block);
+               (void) sector_div(e, pool->sectors_per_block);
+       }
+
+       if (e < b)
+               /* Can happen if the bio is within a single block. */
+               e = b;
+
+       *begin = b;
+       *end = e;
+}
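To illustrate the rounding (example values assumed): with 128 sectors per block, a bio starting at sector 100 and spanning 800 sectors gives b = 100 and e = 900; rounding b up yields (100 + 127) >> 7 = 1 and truncating e yields 900 >> 7 = 7, so the complete blocks covered are the half-open range [1, 7) and the partial sectors at either edge are excluded.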
+
 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
 {
        struct pool *pool = tc->pool;
@@ -648,7 +761,7 @@ struct dm_thin_new_mapping {
        struct list_head list;
 
        bool pass_discard:1;
-       bool definitely_not_shared:1;
+       bool maybe_shared:1;
 
        /*
         * Track quiescing, copying and zeroing preparation actions.  When this
@@ -659,9 +772,9 @@ struct dm_thin_new_mapping {
 
        int err;
        struct thin_c *tc;
-       dm_block_t virt_block;
+       dm_block_t virt_begin, virt_end;
        dm_block_t data_block;
-       struct dm_bio_prison_cell *cell, *cell2;
+       struct dm_bio_prison_cell *cell;
 
        /*
         * If the bio covers the whole area of a block then we can avoid
@@ -701,12 +814,14 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
        complete_mapping_preparation(m);
 }
 
-static void overwrite_endio(struct bio *bio, int err)
+static void overwrite_endio(struct bio *bio)
 {
        struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
        struct dm_thin_new_mapping *m = h->overwrite_mapping;
 
-       m->err = err;
+       bio->bi_end_io = m->saved_bi_end_io;
+
+       m->err = bio->bi_error;
        complete_mapping_preparation(m);
 }
 
@@ -794,10 +909,6 @@ static void inc_remap_and_issue_cell(struct thin_c *tc,
 
 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 {
-       if (m->bio) {
-               m->bio->bi_end_io = m->saved_bi_end_io;
-               atomic_inc(&m->bio->bi_remaining);
-       }
        cell_error(m->tc->pool, m->cell);
        list_del(&m->list);
        mempool_free(m, m->tc->pool->mapping_pool);
@@ -807,15 +918,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 {
        struct thin_c *tc = m->tc;
        struct pool *pool = tc->pool;
-       struct bio *bio;
+       struct bio *bio = m->bio;
        int r;
 
-       bio = m->bio;
-       if (bio) {
-               bio->bi_end_io = m->saved_bi_end_io;
-               atomic_inc(&bio->bi_remaining);
-       }
-
        if (m->err) {
                cell_error(pool, m->cell);
                goto out;
@@ -826,7 +931,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
         * Any I/O for this block arriving after this point will get
         * remapped to it directly.
         */
-       r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
+       r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block);
        if (r) {
                metadata_operation_failed(pool, "dm_thin_insert_block", r);
                cell_error(pool, m->cell);
@@ -841,7 +946,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
         */
        if (bio) {
                inc_remap_and_issue_cell(tc, m->cell, m->data_block);
-               bio_endio(bio, 0);
+               bio_endio(bio);
        } else {
                inc_all_io_entry(tc->pool, m->cell->holder);
                remap_and_issue(tc, m->cell->holder, m->data_block);
@@ -853,50 +958,113 @@ out:
        mempool_free(m, pool->mapping_pool);
 }
 
-static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
+/*----------------------------------------------------------------*/
+
+static void free_discard_mapping(struct dm_thin_new_mapping *m)
 {
        struct thin_c *tc = m->tc;
+       if (m->cell)
+               cell_defer_no_holder(tc, m->cell);
+       mempool_free(m, tc->pool->mapping_pool);
+}
 
+static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
+{
        bio_io_error(m->bio);
+       free_discard_mapping(m);
+}
+
+static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
+{
+       bio_endio(m->bio);
+       free_discard_mapping(m);
+}
+
+static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
+{
+       int r;
+       struct thin_c *tc = m->tc;
+
+       r = dm_thin_remove_range(tc->td, m->cell->key.block_begin, m->cell->key.block_end);
+       if (r) {
+               metadata_operation_failed(tc->pool, "dm_thin_remove_range", r);
+               bio_io_error(m->bio);
+       } else
+               bio_endio(m->bio);
+
        cell_defer_no_holder(tc, m->cell);
-       cell_defer_no_holder(tc, m->cell2);
        mempool_free(m, tc->pool->mapping_pool);
 }
 
-static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
+static int passdown_double_checking_shared_status(struct dm_thin_new_mapping *m)
 {
+       /*
+        * We've already unmapped this range of blocks, but before we
+        * passdown we have to check that these blocks are now unused.
+        */
+       int r;
+       bool used = true;
        struct thin_c *tc = m->tc;
+       struct pool *pool = tc->pool;
+       dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
 
-       inc_all_io_entry(tc->pool, m->bio);
-       cell_defer_no_holder(tc, m->cell);
-       cell_defer_no_holder(tc, m->cell2);
+       while (b != end) {
+               /* find start of unmapped run */
+               for (; b < end; b++) {
+                       r = dm_pool_block_is_used(pool->pmd, b, &used);
+                       if (r)
+                               return r;
 
-       if (m->pass_discard)
-               if (m->definitely_not_shared)
-                       remap_and_issue(tc, m->bio, m->data_block);
-               else {
-                       bool used = false;
-                       if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
-                               bio_endio(m->bio, 0);
-                       else
-                               remap_and_issue(tc, m->bio, m->data_block);
+                       if (!used)
+                               break;
                }
-       else
-               bio_endio(m->bio, 0);
 
-       mempool_free(m, tc->pool->mapping_pool);
+               if (b == end)
+                       break;
+
+               /* find end of run */
+               for (e = b + 1; e != end; e++) {
+                       r = dm_pool_block_is_used(pool->pmd, e, &used);
+                       if (r)
+                               return r;
+
+                       if (used)
+                               break;
+               }
+
+               r = issue_discard(tc, b, e, m->bio);
+               if (r)
+                       return r;
+
+               b = e;
+       }
+
+       return 0;
 }
 
-static void process_prepared_discard(struct dm_thin_new_mapping *m)
+static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
 {
        int r;
        struct thin_c *tc = m->tc;
+       struct pool *pool = tc->pool;
 
-       r = dm_thin_remove_block(tc->td, m->virt_block);
+       r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
        if (r)
-               DMERR_LIMIT("dm_thin_remove_block() failed");
+               metadata_operation_failed(pool, "dm_thin_remove_range", r);
+
+       else if (m->maybe_shared)
+               r = passdown_double_checking_shared_status(m);
+       else
+               r = issue_discard(tc, m->data_block, m->data_block + (m->virt_end - m->virt_begin), m->bio);
 
-       process_prepared_discard_passdown(m);
+       /*
+        * Even if r is set, there could be sub discards in flight that we
+        * need to wait for.
+        */
+       m->bio->bi_error = r;
+       bio_endio(m->bio);
+       cell_defer_no_holder(tc, m->cell);
+       mempool_free(m, pool->mapping_pool);
 }
 
 static void process_prepared(struct pool *pool, struct list_head *head,
@@ -980,7 +1148,7 @@ static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
 }
 
 static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
-                                     dm_block_t data_block,
+                                     dm_block_t data_begin,
                                      struct dm_thin_new_mapping *m)
 {
        struct pool *pool = tc->pool;
@@ -990,7 +1158,7 @@ static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
        m->bio = bio;
        save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
        inc_all_io_entry(pool, bio);
-       remap_and_issue(tc, bio, data_block);
+       remap_and_issue(tc, bio, data_begin);
 }
 
 /*
@@ -1007,7 +1175,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
        struct dm_thin_new_mapping *m = get_next_mapping(pool);
 
        m->tc = tc;
-       m->virt_block = virt_block;
+       m->virt_begin = virt_block;
+       m->virt_end = virt_block + 1u;
        m->data_block = data_dest;
        m->cell = cell;
 
@@ -1086,7 +1255,8 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 
        atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
        m->tc = tc;
-       m->virt_block = virt_block;
+       m->virt_begin = virt_block;
+       m->virt_end = virt_block + 1u;
        m->data_block = data_block;
        m->cell = cell;
 
@@ -1095,16 +1265,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
         * zeroing pre-existing data, we can issue the bio immediately.
         * Otherwise we use kcopyd to zero the data first.
         */
-       if (!pool->pf.zero_new_blocks)
+       if (pool->pf.zero_new_blocks) {
+               if (io_overwrites_block(pool, bio))
+                       remap_and_issue_overwrite(tc, bio, data_block, m);
+               else
+                       ll_zero(tc, m, data_block * pool->sectors_per_block,
+                               (data_block + 1) * pool->sectors_per_block);
+       } else
                process_prepared_mapping(m);
-
-       else if (io_overwrites_block(pool, bio))
-               remap_and_issue_overwrite(tc, bio, data_block, m);
-
-       else
-               ll_zero(tc, m,
-                       data_block * pool->sectors_per_block,
-                       (data_block + 1) * pool->sectors_per_block);
 }
 
 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
@@ -1270,9 +1438,10 @@ static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
 {
        int error = should_error_unserviceable_bio(pool);
 
-       if (error)
-               bio_endio(bio, error);
-       else
+       if (error) {
+               bio->bi_error = error;
+               bio_endio(bio);
+       } else
                retry_on_resume(bio);
 }
 
@@ -1295,99 +1464,148 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
                retry_on_resume(bio);
 }
 
-static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+static void process_discard_cell_no_passdown(struct thin_c *tc,
+                                            struct dm_bio_prison_cell *virt_cell)
 {
-       int r;
-       struct bio *bio = cell->holder;
        struct pool *pool = tc->pool;
-       struct dm_bio_prison_cell *cell2;
-       struct dm_cell_key key2;
-       dm_block_t block = get_bio_block(tc, bio);
-       struct dm_thin_lookup_result lookup_result;
-       struct dm_thin_new_mapping *m;
+       struct dm_thin_new_mapping *m = get_next_mapping(pool);
 
-       if (tc->requeue_mode) {
-               cell_requeue(pool, cell);
-               return;
-       }
+       /*
+        * We don't need to lock the data blocks, since there's no
+        * passdown.  We only lock data blocks for allocation and breaking sharing.
+        */
+       m->tc = tc;
+       m->virt_begin = virt_cell->key.block_begin;
+       m->virt_end = virt_cell->key.block_end;
+       m->cell = virt_cell;
+       m->bio = virt_cell->holder;
 
-       r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
-       switch (r) {
-       case 0:
-               /*
-                * Check nobody is fiddling with this pool block.  This can
-                * happen if someone's in the process of breaking sharing
-                * on this block.
-                */
-               build_data_key(tc->td, lookup_result.block, &key2);
-               if (bio_detain(tc->pool, &key2, bio, &cell2)) {
-                       cell_defer_no_holder(tc, cell);
-                       break;
-               }
+       if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+               pool->process_prepared_discard(m);
+}
 
-               if (io_overlaps_block(pool, bio)) {
-                       /*
-                        * IO may still be going to the destination block.  We must
-                        * quiesce before we can do the removal.
-                        */
-                       m = get_next_mapping(pool);
-                       m->tc = tc;
-                       m->pass_discard = pool->pf.discard_passdown;
-                       m->definitely_not_shared = !lookup_result.shared;
-                       m->virt_block = block;
-                       m->data_block = lookup_result.block;
-                       m->cell = cell;
-                       m->cell2 = cell2;
-                       m->bio = bio;
-
-                       if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
-                               pool->process_prepared_discard(m);
+/*
+ * __bio_inc_remaining() is used to defer parent bios's end_io until
+ * we _know_ all chained sub range discard bios have completed.
+ */
+static inline void __bio_inc_remaining(struct bio *bio)
+{
+       bio->bi_flags |= (1 << BIO_CHAIN);
+       smp_mb__before_atomic();
+       atomic_inc(&bio->__bi_remaining);
+}
 
-               } else {
-                       inc_all_io_entry(pool, bio);
-                       cell_defer_no_holder(tc, cell);
-                       cell_defer_no_holder(tc, cell2);
+static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end,
+                                struct bio *bio)
+{
+       struct pool *pool = tc->pool;
+
+       int r;
+       bool maybe_shared;
+       struct dm_cell_key data_key;
+       struct dm_bio_prison_cell *data_cell;
+       struct dm_thin_new_mapping *m;
+       dm_block_t virt_begin, virt_end, data_begin;
+
+       while (begin != end) {
+               r = ensure_next_mapping(pool);
+               if (r)
+                       /* we did our best */
+                       return;
 
+               r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
+                                             &data_begin, &maybe_shared);
+               if (r)
                        /*
-                        * The DM core makes sure that the discard doesn't span
-                        * a block boundary.  So we submit the discard of a
-                        * partial block appropriately.
+                        * Silently fail, letting any mappings we've
+                        * created complete.
                         */
-                       if ((!lookup_result.shared) && pool->pf.discard_passdown)
-                               remap_and_issue(tc, bio, lookup_result.block);
-                       else
-                               bio_endio(bio, 0);
+                       break;
+
+               build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key);
+               if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
+                       /* contention, we'll give up with this range */
+                       begin = virt_end;
+                       continue;
                }
-               break;
 
-       case -ENODATA:
                /*
-                * It isn't provisioned, just forget it.
+                * IO may still be going to the destination block.  We must
+                * quiesce before we can do the removal.
                 */
-               cell_defer_no_holder(tc, cell);
-               bio_endio(bio, 0);
-               break;
+               m = get_next_mapping(pool);
+               m->tc = tc;
+               m->maybe_shared = maybe_shared;
+               m->virt_begin = virt_begin;
+               m->virt_end = virt_end;
+               m->data_block = data_begin;
+               m->cell = data_cell;
+               m->bio = bio;
 
-       default:
-               DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
-                           __func__, r);
-               cell_defer_no_holder(tc, cell);
-               bio_io_error(bio);
-               break;
+               /*
+                * The parent bio must not complete before sub discard bios are
+                * chained to it (see __blkdev_issue_discard_async's bio_chain)!
+                *
+                * This per-mapping bi_remaining increment is paired with
+                * the implicit decrement that occurs via bio_endio() in
+                * process_prepared_discard_{passdown,no_passdown}.
+                */
+               __bio_inc_remaining(bio);
+               if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+                       pool->process_prepared_discard(m);
+
+               begin = virt_end;
        }
 }
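As a sketch of the flow (mapping layout assumed): a passdown discard over virtual blocks [0, 100) where dm_thin_find_mapped_range() reports mapped runs [10, 20) and [40, 70) results in two loop iterations, each locking the corresponding physical range, queueing one dm_thin_new_mapping and incrementing the parent bio's __bi_remaining, so the bio_endio() issued later in process_discard_cell_passdown() cannot complete the bio before both sub-ranges have been handled.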
 
+static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_prison_cell *virt_cell)
+{
+       struct bio *bio = virt_cell->holder;
+       struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+
+       /*
+        * The virt_cell will only get freed once the origin bio completes.
+        * This means it will remain locked while all the individual
+        * passdown bios are in flight.
+        */
+       h->cell = virt_cell;
+       break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio);
+
+       /*
+        * We complete the bio now, knowing that the bi_remaining field
+        * will prevent completion until the sub range discards have
+        * completed.
+        */
+       bio_endio(bio);
+}
+
 static void process_discard_bio(struct thin_c *tc, struct bio *bio)
 {
-       struct dm_bio_prison_cell *cell;
-       struct dm_cell_key key;
-       dm_block_t block = get_bio_block(tc, bio);
+       dm_block_t begin, end;
+       struct dm_cell_key virt_key;
+       struct dm_bio_prison_cell *virt_cell;
 
-       build_virtual_key(tc->td, block, &key);
-       if (bio_detain(tc->pool, &key, bio, &cell))
+       get_bio_block_range(tc, bio, &begin, &end);
+       if (begin == end) {
+               /*
+                * The discard covers less than a block.
+                */
+               bio_endio(bio);
+               return;
+       }
+
+       build_key(tc->td, VIRTUAL, begin, end, &virt_key);
+       if (bio_detain(tc->pool, &virt_key, bio, &virt_cell))
+               /*
+                * Potential starvation issue: We're relying on the
+                * fs/application being well behaved, and not trying to
+                * send IO to a region at the same time as discarding it.
+                * If they do this persistently then it's possible this
+                * cell will never be granted.
+                */
                return;
 
-       process_discard_cell(tc, cell);
+       tc->pool->process_discard_cell(tc, virt_cell);
 }
 
 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
@@ -1517,7 +1735,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
        if (bio_data_dir(bio) == READ) {
                zero_fill_bio(bio);
                cell_defer_no_holder(tc, cell);
-               bio_endio(bio, 0);
+               bio_endio(bio);
                return;
        }
 
@@ -1582,7 +1800,7 @@ static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
 
                        } else {
                                zero_fill_bio(bio);
-                               bio_endio(bio, 0);
+                               bio_endio(bio);
                        }
                } else
                        provision_block(tc, bio, block, cell);
@@ -1653,7 +1871,7 @@ static void __process_bio_read_only(struct thin_c *tc, struct bio *bio,
                }
 
                zero_fill_bio(bio);
-               bio_endio(bio, 0);
+               bio_endio(bio);
                break;
 
        default:
@@ -1678,7 +1896,7 @@ static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell
 
 static void process_bio_success(struct thin_c *tc, struct bio *bio)
 {
-       bio_endio(bio, 0);
+       bio_endio(bio);
 }
 
 static void process_bio_fail(struct thin_c *tc, struct bio *bio)
@@ -2014,18 +2232,23 @@ static void do_waker(struct work_struct *ws)
        queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
 }
 
+static void notify_of_pool_mode_change_to_oods(struct pool *pool);
+
 /*
  * We're holding onto IO to allow userland time to react.  After the
  * timeout either the pool will have been resized (and thus back in
- * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
+ * PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE w/ error_if_no_space.
  */
 static void do_no_space_timeout(struct work_struct *ws)
 {
        struct pool *pool = container_of(to_delayed_work(ws), struct pool,
                                         no_space_timeout);
 
-       if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
-               set_pool_mode(pool, PM_READ_ONLY);
+       if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
+               pool->pf.error_if_no_space = true;
+               notify_of_pool_mode_change_to_oods(pool);
+               error_retry_list_with_code(pool, -ENOSPC);
+       }
 }
 
 /*----------------------------------------------------------------*/
@@ -2103,6 +2326,32 @@ static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
               dm_device_name(pool->pool_md), new_mode);
 }
 
+static void notify_of_pool_mode_change_to_oods(struct pool *pool)
+{
+       if (!pool->pf.error_if_no_space)
+               notify_of_pool_mode_change(pool, "out-of-data-space (queue IO)");
+       else
+               notify_of_pool_mode_change(pool, "out-of-data-space (error IO)");
+}
+
+static bool passdown_enabled(struct pool_c *pt)
+{
+       return pt->adjusted_pf.discard_passdown;
+}
+
+static void set_discard_callbacks(struct pool *pool)
+{
+       struct pool_c *pt = pool->ti->private;
+
+       if (passdown_enabled(pt)) {
+               pool->process_discard_cell = process_discard_cell_passdown;
+               pool->process_prepared_discard = process_prepared_discard_passdown;
+       } else {
+               pool->process_discard_cell = process_discard_cell_no_passdown;
+               pool->process_prepared_discard = process_prepared_discard_no_passdown;
+       }
+}
+
 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 {
        struct pool_c *pt = pool->ti->private;
@@ -2154,7 +2403,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
                pool->process_cell = process_cell_read_only;
                pool->process_discard_cell = process_cell_success;
                pool->process_prepared_mapping = process_prepared_mapping_fail;
-               pool->process_prepared_discard = process_prepared_discard_passdown;
+               pool->process_prepared_discard = process_prepared_discard_success;
 
                error_retry_list(pool);
                break;
@@ -2169,13 +2418,12 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
                 * frequently seeing this mode.
                 */
                if (old_mode != new_mode)
-                       notify_of_pool_mode_change(pool, "out-of-data-space");
+                       notify_of_pool_mode_change_to_oods(pool);
                pool->process_bio = process_bio_read_only;
                pool->process_discard = process_discard_bio;
                pool->process_cell = process_cell_read_only;
-               pool->process_discard_cell = process_discard_cell;
                pool->process_prepared_mapping = process_prepared_mapping;
-               pool->process_prepared_discard = process_prepared_discard;
+               set_discard_callbacks(pool);
 
                if (!pool->pf.error_if_no_space && no_space_timeout)
                        queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
@@ -2184,13 +2432,13 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
        case PM_WRITE:
                if (old_mode != new_mode)
                        notify_of_pool_mode_change(pool, "write");
+               pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space;
                dm_pool_metadata_read_write(pool->pmd);
                pool->process_bio = process_bio;
                pool->process_discard = process_discard_bio;
                pool->process_cell = process_cell;
-               pool->process_discard_cell = process_discard_cell;
                pool->process_prepared_mapping = process_prepared_mapping;
-               pool->process_prepared_discard = process_prepared_discard;
+               set_discard_callbacks(pool);
                break;
        }
 
@@ -2279,6 +2527,7 @@ static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
        h->shared_read_entry = NULL;
        h->all_io_entry = NULL;
        h->overwrite_mapping = NULL;
+       h->cell = NULL;
 }
 
 /*
@@ -2297,7 +2546,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
        thin_hook_bio(tc, bio);
 
        if (tc->requeue_mode) {
-               bio_endio(bio, DM_ENDIO_REQUEUE);
+               bio->bi_error = DM_ENDIO_REQUEUE;
+               bio_endio(bio);
                return DM_MAPIO_SUBMITTED;
        }
 
@@ -2426,7 +2676,6 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
        struct pool *pool = pt->pool;
        struct block_device *data_bdev = pt->data_dev->bdev;
        struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
-       sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
        const char *reason = NULL;
        char buf[BDEVNAME_SIZE];
 
@@ -2439,12 +2688,6 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
        else if (data_limits->max_discard_sectors < pool->sectors_per_block)
                reason = "max discard sectors smaller than a block";
 
-       else if (data_limits->discard_granularity > block_size)
-               reason = "discard granularity larger than a block";
-
-       else if (!is_factor(block_size, data_limits->discard_granularity))
-               reason = "discard granularity not a factor of block size";
-
        if (reason) {
                DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
                pt->adjusted_pf.discard_passdown = false;
@@ -2959,7 +3202,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
                                                metadata_low_callback,
                                                pool);
        if (r)
-               goto out_free_pt;
+               goto out_flags_changed;
 
        pt->callbacks.congested_fn = pool_is_congested;
        dm_table_add_target_callbacks(ti->table, &pt->callbacks);
@@ -3210,8 +3453,8 @@ static void pool_postsuspend(struct dm_target *ti)
        struct pool_c *pt = ti->private;
        struct pool *pool = pt->pool;
 
-       cancel_delayed_work(&pool->waker);
-       cancel_delayed_work(&pool->no_space_timeout);
+       cancel_delayed_work_sync(&pool->waker);
+       cancel_delayed_work_sync(&pool->no_space_timeout);
        flush_workqueue(pool->wq);
        (void) commit(pool);
 }
@@ -3389,7 +3632,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
        if (get_pool_mode(pool) >= PM_READ_ONLY) {
                DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
                      dm_device_name(pool->pool_md));
-               return -EINVAL;
+               return -EOPNOTSUPP;
        }
 
        if (!strcasecmp(argv[0], "create_thin"))
@@ -3447,6 +3690,7 @@ static void emit_flags(struct pool_features *pf, char *result,
  * Status line is:
  *    <transaction id> <used metadata sectors>/<total metadata sectors>
  *    <used data sectors>/<total data sectors> <held metadata root>
+ *    <pool mode> <discard config> <no space config> <needs_check>
  */
 static void pool_status(struct dm_target *ti, status_type_t type,
                        unsigned status_flags, char *result, unsigned maxlen)
@@ -3548,6 +3792,11 @@ static void pool_status(struct dm_target *ti, status_type_t type,
                else
                        DMEMIT("queue_if_no_space ");
 
+               if (dm_pool_metadata_needs_check(pool->pmd))
+                       DMEMIT("needs_check ");
+               else
+                       DMEMIT("- ");
+
                break;
 
        case STATUSTYPE_TABLE:
@@ -3573,38 +3822,6 @@ static int pool_iterate_devices(struct dm_target *ti,
        return fn(ti, pt->data_dev, 0, ti->len, data);
 }
 
-static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
-                     struct bio_vec *biovec, int max_size)
-{
-       struct pool_c *pt = ti->private;
-       struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
-
-       if (!q->merge_bvec_fn)
-               return max_size;
-
-       bvm->bi_bdev = pt->data_dev->bdev;
-
-       return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
-static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
-{
-       struct pool *pool = pt->pool;
-       struct queue_limits *data_limits;
-
-       limits->max_discard_sectors = pool->sectors_per_block;
-
-       /*
-        * discard_granularity is just a hint, and not enforced.
-        */
-       if (pt->adjusted_pf.discard_passdown) {
-               data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
-               limits->discard_granularity = max(data_limits->discard_granularity,
-                                                 pool->sectors_per_block << SECTOR_SHIFT);
-       } else
-               limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
-}
-
 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
        struct pool_c *pt = ti->private;
@@ -3659,14 +3876,17 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
        disable_passdown_if_not_supported(pt);
 
-       set_discard_limits(pt, limits);
+       /*
+        * The pool uses the same discard limits as the underlying data
+        * device.  DM core has already set this up.
+        */
 }
 
 static struct target_type pool_target = {
        .name = "thin-pool",
        .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
                    DM_TARGET_IMMUTABLE,
-       .version = {1, 14, 0},
+       .version = {1, 16, 0},
        .module = THIS_MODULE,
        .ctr = pool_ctr,
        .dtr = pool_dtr,
@@ -3678,7 +3898,6 @@ static struct target_type pool_target = {
        .resume = pool_resume,
        .message = pool_message,
        .status = pool_status,
-       .merge = pool_merge,
        .iterate_devices = pool_iterate_devices,
        .io_hints = pool_io_hints,
 };
@@ -3825,8 +4044,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
        if (tc->pool->pf.discard_enabled) {
                ti->discards_supported = true;
                ti->num_discard_bios = 1;
-               /* Discard bios must be split on a block boundary */
-               ti->split_discard_bios = true;
+               ti->split_discard_bios = false;
        }
 
        mutex_unlock(&dm_thin_pool_table.mutex);
@@ -3913,6 +4131,9 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
                }
        }
 
+       if (h->cell)
+               cell_defer_no_holder(h->tc, h->cell);
+
        return 0;
 }
 
@@ -4003,21 +4224,6 @@ err:
        DMEMIT("Error");
 }
 
-static int thin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
-                     struct bio_vec *biovec, int max_size)
-{
-       struct thin_c *tc = ti->private;
-       struct request_queue *q = bdev_get_queue(tc->pool_dev->bdev);
-
-       if (!q->merge_bvec_fn)
-               return max_size;
-
-       bvm->bi_bdev = tc->pool_dev->bdev;
-       bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);
-
-       return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
 static int thin_iterate_devices(struct dm_target *ti,
                                iterate_devices_callout_fn fn, void *data)
 {
@@ -4040,9 +4246,21 @@ static int thin_iterate_devices(struct dm_target *ti,
        return 0;
 }
 
+static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+       struct thin_c *tc = ti->private;
+       struct pool *pool = tc->pool;
+
+       if (!pool->pf.discard_enabled)
+               return;
+
+       limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+       limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
+}
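For reference on the numbers above: 2048 sectors of 512 bytes is 1 MiB, so 2048 * 1024 * 16 sectors is the 16 GiB noted in the comment, and sectors_per_block << SECTOR_SHIFT converts the block size from sectors to bytes for the granularity hint.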
+
 static struct target_type thin_target = {
        .name = "thin",
-       .version = {1, 14, 0},
+       .version = {1, 16, 0},
        .module = THIS_MODULE,
        .ctr = thin_ctr,
        .dtr = thin_dtr,
@@ -4052,8 +4270,8 @@ static struct target_type thin_target = {
        .presuspend = thin_presuspend,
        .postsuspend = thin_postsuspend,
        .status = thin_status,
-       .merge = thin_merge,
        .iterate_devices = thin_iterate_devices,
+       .io_hints = thin_io_hints,
 };
 
 /*----------------------------------------------------------------*/