X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=kernel%2Fblock%2Fblk-mq.c;h=2e21e4c049e2b4d5d067bc5bdcfbee34598c06fe;hb=52f993b8e89487ec9ee15a7fb4979e0f09a45b27;hp=c473bd192a4145f25fd148ed26376d02e3ce5701;hpb=fdb8b20906f3546ba6c2f9f0686d8a5189516ba3;p=kvmfornfv.git diff --git a/kernel/block/blk-mq.c b/kernel/block/blk-mq.c index c473bd192..2e21e4c04 100644 --- a/kernel/block/blk-mq.c +++ b/kernel/block/blk-mq.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -77,49 +78,13 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); } -static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp) -{ - while (true) { - int ret; - - if (percpu_ref_tryget_live(&q->mq_usage_counter)) - return 0; - - if (!(gfp & __GFP_WAIT)) - return -EBUSY; - - ret = swait_event_interruptible(q->mq_freeze_wq, - !q->mq_freeze_depth || blk_queue_dying(q)); - if (blk_queue_dying(q)) - return -ENODEV; - if (ret) - return ret; - } -} - -static void blk_mq_queue_exit(struct request_queue *q) -{ - percpu_ref_put(&q->mq_usage_counter); -} - -static void blk_mq_usage_counter_release(struct percpu_ref *ref) -{ - struct request_queue *q = - container_of(ref, struct request_queue, mq_usage_counter); - - swait_wake_all(&q->mq_freeze_wq); -} - void blk_mq_freeze_queue_start(struct request_queue *q) { - bool freeze; - - spin_lock_irq(q->queue_lock); - freeze = !q->mq_freeze_depth++; - spin_unlock_irq(q->queue_lock); + int freeze_depth; - if (freeze) { - percpu_ref_kill(&q->mq_usage_counter); + freeze_depth = atomic_inc_return(&q->mq_freeze_depth); + if (freeze_depth == 1) { + percpu_ref_kill(&q->q_usage_counter); blk_mq_run_hw_queues(q, false); } } @@ -127,31 +92,45 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); static void blk_mq_freeze_queue_wait(struct request_queue *q) { - swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); + swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); } /* * Guarantee no request is in use, so we can change any data structure of * the queue afterward. */ -void blk_mq_freeze_queue(struct request_queue *q) +void blk_freeze_queue(struct request_queue *q) { + /* + * In the !blk_mq case we are only calling this to kill the + * q_usage_counter, otherwise this increases the freeze depth + * and waits for it to return to zero. For this reason there is + * no blk_unfreeze_queue(), and blk_freeze_queue() is not + * exported to drivers as the only user for unfreeze is blk_mq. + */ blk_mq_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } + +void blk_mq_freeze_queue(struct request_queue *q) +{ + /* + * ...just an alias to keep freeze and unfreeze actions balanced + * in the blk_mq_* namespace + */ + blk_freeze_queue(q); +} EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); void blk_mq_unfreeze_queue(struct request_queue *q) { - bool wake; + int freeze_depth; - spin_lock_irq(q->queue_lock); - wake = !--q->mq_freeze_depth; - WARN_ON_ONCE(q->mq_freeze_depth < 0); - spin_unlock_irq(q->queue_lock); - if (wake) { - percpu_ref_reinit(&q->mq_usage_counter); - swait_wake_all(&q->mq_freeze_wq); + freeze_depth = atomic_dec_return(&q->mq_freeze_depth); + WARN_ON_ONCE(freeze_depth < 0); + if (!freeze_depth) { + percpu_ref_reinit(&q->q_usage_counter); + swake_up_all(&q->mq_freeze_wq); } } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); @@ -170,7 +149,7 @@ void blk_mq_wake_waiters(struct request_queue *q) * dying, we need to ensure that processes currently waiting on * the queue are notified as well. */ - swait_wake_all(&q->mq_freeze_wq); + swake_up_all(&q->mq_freeze_wq); } bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) @@ -262,17 +241,17 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, struct blk_mq_alloc_data alloc_data; int ret; - ret = blk_mq_queue_enter(q, gfp); + ret = blk_queue_enter(q, gfp); if (ret) return ERR_PTR(ret); ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT, + blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM, reserved, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); - if (!rq && (gfp & __GFP_WAIT)) { + if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) { __blk_mq_run_hw_queue(hctx); blk_mq_put_ctx(ctx); @@ -285,7 +264,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, } blk_mq_put_ctx(ctx); if (!rq) { - blk_mq_queue_exit(q); + blk_queue_exit(q); return ERR_PTR(-EWOULDBLOCK); } return rq; @@ -304,7 +283,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); blk_mq_put_tag(hctx, tag, &ctx->last_tag); - blk_mq_queue_exit(q); + blk_queue_exit(q); } void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq) @@ -399,7 +378,7 @@ static void blk_mq_ipi_complete_request(struct request *rq) put_cpu_light(); } -void __blk_mq_complete_request(struct request *rq) +static void __blk_mq_complete_request(struct request *rq) { struct request_queue *q = rq->q; @@ -417,14 +396,16 @@ void __blk_mq_complete_request(struct request *rq) * Ends all I/O on a request. It does not handle partial completions. * The actual completion happens out-of-order, through a IPI handler. **/ -void blk_mq_complete_request(struct request *rq) +void blk_mq_complete_request(struct request *rq, int error) { struct request_queue *q = rq->q; if (unlikely(blk_should_fake_timeout(q))) return; - if (!blk_mark_rq_complete(rq)) + if (!blk_mark_rq_complete(rq)) { + rq->errors = error; __blk_mq_complete_request(rq); + } } EXPORT_SYMBOL(blk_mq_complete_request); @@ -583,23 +564,9 @@ void blk_mq_abort_requeue_list(struct request_queue *q) } EXPORT_SYMBOL(blk_mq_abort_requeue_list); -static inline bool is_flush_request(struct request *rq, - struct blk_flush_queue *fq, unsigned int tag) -{ - return ((rq->cmd_flags & REQ_FLUSH_SEQ) && - fq->flush_rq->tag == tag); -} - struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { - struct request *rq = tags->rqs[tag]; - /* mq_ctx of flush rq is always cloned from the corresponding req */ - struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx); - - if (!is_flush_request(rq, fq, tag)) - return rq; - - return fq->flush_rq; + return tags->rqs[tag]; } EXPORT_SYMBOL(blk_mq_tag_to_rq); @@ -656,7 +623,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, */ if (unlikely(blk_queue_dying(rq->q))) { rq->errors = -EIO; - blk_mq_complete_request(rq); + blk_mq_end_request(rq, rq->errors); } return; } @@ -679,24 +646,16 @@ static void blk_mq_rq_timer(unsigned long priv) .next = 0, .next_set = 0, }; - struct blk_mq_hw_ctx *hctx; int i; - queue_for_each_hw_ctx(q, hctx, i) { - /* - * If not software queues are currently mapped to this - * hardware queue, there's nothing to check - */ - if (!blk_mq_hw_queue_mapped(hctx)) - continue; - - blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data); - } + blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); if (data.next_set) { data.next = blk_rq_timeout(round_jiffies_up(data.next)); mod_timer(&q->timeout, data.next); } else { + struct blk_mq_hw_ctx *hctx; + queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) @@ -841,7 +800,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) switch (ret) { case BLK_MQ_RQ_QUEUE_OK: queued++; - continue; + break; case BLK_MQ_RQ_QUEUE_BUSY: list_add(&rq->queuelist, &rq_list); __blk_mq_requeue_request(rq); @@ -903,7 +862,7 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { - int cpu = hctx->next_cpu, next_cpu; + int next_cpu; next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); if (next_cpu >= nr_cpu_ids) @@ -911,8 +870,6 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) hctx->next_cpu = next_cpu; hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; - - return cpu; } return hctx->next_cpu; @@ -1035,18 +992,25 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) } EXPORT_SYMBOL(blk_mq_delay_queue); -static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, - struct request *rq, bool at_head) +static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, + struct request *rq, + bool at_head) { - struct blk_mq_ctx *ctx = rq->mq_ctx; - trace_block_rq_insert(hctx->queue, rq); if (at_head) list_add(&rq->queuelist, &ctx->rq_list); else list_add_tail(&rq->queuelist, &ctx->rq_list); +} + +static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, + struct request *rq, bool at_head) +{ + struct blk_mq_ctx *ctx = rq->mq_ctx; + __blk_mq_insert_req_list(hctx, ctx, rq, at_head); blk_mq_hctx_mark_pending(hctx, ctx); } @@ -1102,8 +1066,9 @@ static void blk_mq_insert_requests(struct request_queue *q, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); rq->mq_ctx = ctx; - __blk_mq_insert_request(hctx, rq, false); + __blk_mq_insert_req_list(hctx, ctx, rq, false); } + blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); blk_mq_run_hw_queue(hctx, from_schedule); @@ -1185,7 +1150,7 @@ static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq, struct bio *bio) { - if (!hctx_allow_merges(hctx)) { + if (!hctx_allow_merges(hctx) || !bio_mergeable(bio)) { blk_mq_bio_to_request(rq, bio); spin_lock(&ctx->lock); insert_rq: @@ -1222,11 +1187,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, int rw = bio_data_dir(bio); struct blk_mq_alloc_data alloc_data; - if (unlikely(blk_mq_queue_enter(q, GFP_KERNEL))) { - bio_endio(bio, -EIO); - return NULL; - } - + blk_queue_enter_live(q); ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); @@ -1245,7 +1206,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); blk_mq_set_alloc_data(&alloc_data, q, - __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx); + __GFP_RECLAIM|__GFP_HIGH, false, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); ctx = alloc_data.ctx; hctx = alloc_data.hctx; @@ -1257,28 +1218,79 @@ static struct request *blk_mq_map_request(struct request_queue *q, return rq; } +static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) +{ + int ret; + struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, + rq->mq_ctx->cpu); + struct blk_mq_queue_data bd = { + .rq = rq, + .list = NULL, + .last = 1 + }; + blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num); + + /* + * For OK queue, we are done. For error, kill it. Any other + * error (busy), just add it to our list as we previously + * would have done + */ + ret = q->mq_ops->queue_rq(hctx, &bd); + if (ret == BLK_MQ_RQ_QUEUE_OK) { + *cookie = new_cookie; + return 0; + } + + __blk_mq_requeue_request(rq); + + if (ret == BLK_MQ_RQ_QUEUE_ERROR) { + *cookie = BLK_QC_T_NONE; + rq->errors = -EIO; + blk_mq_end_request(rq, rq->errors); + return 0; + } + + return -1; +} + /* * Multiple hardware queue variant. This will not use per-process plugs, * but will attempt to bypass the hctx queueing if we can go straight to * hardware for SYNC IO. */ -static void blk_mq_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = rw_is_sync(bio->bi_rw); const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); struct blk_map_ctx data; struct request *rq; + unsigned int request_count = 0; + struct blk_plug *plug; + struct request *same_queue_rq = NULL; + blk_qc_t cookie; blk_queue_bounce(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio_endio(bio, -EIO); - return; + bio_io_error(bio); + return BLK_QC_T_NONE; } + blk_queue_split(q, &bio, q->bio_split); + + if (!is_flush_fua && !blk_queue_nomerges(q)) { + if (blk_attempt_plug_merge(q, bio, &request_count, + &same_queue_rq)) + return BLK_QC_T_NONE; + } else + request_count = blk_plug_queued_count(q); + rq = blk_mq_map_request(q, bio, &data); if (unlikely(!rq)) - return; + return BLK_QC_T_NONE; + + cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); if (unlikely(is_flush_fua)) { blk_mq_bio_to_request(rq, bio); @@ -1286,38 +1298,43 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) goto run_queue; } + plug = current->plug; /* * If the driver supports defer issued based on 'last', then * queue it up like normal since we can potentially save some * CPU this way. */ - if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { - struct blk_mq_queue_data bd = { - .rq = rq, - .list = NULL, - .last = 1 - }; - int ret; + if (((plug && !blk_queue_nomerges(q)) || is_sync) && + !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { + struct request *old_rq = NULL; blk_mq_bio_to_request(rq, bio); /* - * For OK queue, we are done. For error, kill it. Any other - * error (busy), just add it to our list as we previously - * would have done + * We do limited pluging. If the bio can be merged, do that. + * Otherwise the existing request in the plug list will be + * issued. So the plug list will have one request at most */ - ret = q->mq_ops->queue_rq(data.hctx, &bd); - if (ret == BLK_MQ_RQ_QUEUE_OK) - goto done; - else { - __blk_mq_requeue_request(rq); - - if (ret == BLK_MQ_RQ_QUEUE_ERROR) { - rq->errors = -EIO; - blk_mq_end_request(rq, rq->errors); - goto done; + if (plug) { + /* + * The plug list might get flushed before this. If that + * happens, same_queue_rq is invalid and plug list is + * empty + */ + if (same_queue_rq && !list_empty(&plug->mq_list)) { + old_rq = same_queue_rq; + list_del_init(&old_rq->queuelist); } - } + list_add_tail(&rq->queuelist, &plug->mq_list); + } else /* is_sync */ + old_rq = rq; + blk_mq_put_ctx(data.ctx); + if (!old_rq) + goto done; + if (test_bit(BLK_MQ_S_STOPPED, &data.hctx->state) || + blk_mq_direct_issue_request(old_rq, &cookie) != 0) + blk_mq_insert_request(old_rq, false, true, true); + goto done; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1330,42 +1347,43 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) run_queue: blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); } -done: blk_mq_put_ctx(data.ctx); +done: + return cookie; } /* * Single hardware queue variant. This will attempt to use any per-process * plug for merging and IO deferral. */ -static void blk_sq_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = rw_is_sync(bio->bi_rw); const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); - unsigned int use_plug, request_count = 0; + struct blk_plug *plug; + unsigned int request_count = 0; struct blk_map_ctx data; struct request *rq; - - /* - * If we have multiple hardware queues, just go directly to - * one of those for sync IO. - */ - use_plug = !is_flush_fua && !is_sync; + blk_qc_t cookie; blk_queue_bounce(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio_endio(bio, -EIO); - return; + bio_io_error(bio); + return BLK_QC_T_NONE; } - if (use_plug && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count)) - return; + blk_queue_split(q, &bio, q->bio_split); + + if (!is_flush_fua && !blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count, NULL)) + return BLK_QC_T_NONE; rq = blk_mq_map_request(q, bio, &data); if (unlikely(!rq)) - return; + return BLK_QC_T_NONE; + + cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); if (unlikely(is_flush_fua)) { blk_mq_bio_to_request(rq, bio); @@ -1378,21 +1396,21 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) * utilize that to temporarily store requests until the task is * either done or scheduled away. */ - if (use_plug) { - struct blk_plug *plug = current->plug; + plug = current->plug; + if (plug) { + blk_mq_bio_to_request(rq, bio); + if (!request_count) + trace_block_plug(q); - if (plug) { - blk_mq_bio_to_request(rq, bio); - if (list_empty(&plug->mq_list)) - trace_block_plug(q); - else if (request_count >= BLK_MAX_REQUEST_COUNT) { - blk_flush_plug_list(plug, false); - trace_block_plug(q); - } - list_add_tail(&rq->queuelist, &plug->mq_list); - blk_mq_put_ctx(data.ctx); - return; + blk_mq_put_ctx(data.ctx); + + if (request_count >= BLK_MAX_REQUEST_COUNT) { + blk_flush_plug_list(plug, false); + trace_block_plug(q); } + + list_add_tail(&rq->queuelist, &plug->mq_list); + return cookie; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1407,6 +1425,7 @@ run_queue: } blk_mq_put_ctx(data.ctx); + return cookie; } /* @@ -1438,6 +1457,11 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru); list_del_init(&page->lru); + /* + * Remove kmemleak object previously allocated in + * blk_mq_init_rq_map(). + */ + kmemleak_free(page_address(page)); __free_pages(page, page->private); } @@ -1510,6 +1534,11 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, list_add_tail(&page->lru, &tags->page_list); p = page_address(page); + /* + * Allow kmemleak to scan these pages as they contain pointers + * to additional allocations like via ops->init_request(). + */ + kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL); entries_per_page = order_to_size(this_order) / rq_size; to_do = min(entries_per_page, set->queue_depth - i); left -= to_do * rq_size; @@ -1528,7 +1557,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, i++; } } - return tags; fail: @@ -1682,7 +1710,7 @@ static int blk_mq_init_hctx(struct request_queue *q, INIT_LIST_HEAD(&hctx->dispatch); hctx->queue = q; hctx->queue_num = hctx_idx; - hctx->flags = set->flags; + hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; blk_mq_init_cpu_notifier(&hctx->cpu_notifier, blk_mq_hctx_notify, hctx); @@ -1790,13 +1818,19 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, } } -static void blk_mq_map_swqueue(struct request_queue *q) +static void blk_mq_map_swqueue(struct request_queue *q, + const struct cpumask *online_mask) { unsigned int i; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; + /* + * Avoid others reading imcomplete hctx->cpumask through sysfs + */ + mutex_lock(&q->sysfs_lock); + queue_for_each_hw_ctx(q, hctx, i) { cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; @@ -1807,7 +1841,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) */ queue_for_each_ctx(q, ctx, i) { /* If the cpu isn't online, the cpu is mapped to first hctx */ - if (!cpu_online(i)) + if (!cpumask_test_cpu(i, online_mask)) continue; hctx = q->mq_ops->map_queue(q, i); @@ -1816,6 +1850,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) hctx->ctxs[hctx->nr_ctx++] = ctx; } + mutex_unlock(&q->sysfs_lock); + queue_for_each_hw_ctx(q, hctx, i) { struct blk_mq_ctxmap *map = &hctx->ctx_map; @@ -1851,29 +1887,36 @@ static void blk_mq_map_swqueue(struct request_queue *q) hctx->next_cpu = cpumask_first(hctx->cpumask); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } + + queue_for_each_ctx(q, ctx, i) { + if (!cpumask_test_cpu(i, online_mask)) + continue; + + hctx = q->mq_ops->map_queue(q, i); + cpumask_set_cpu(i, hctx->tags->cpumask); + } } -static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set) +static void queue_set_hctx_shared(struct request_queue *q, bool shared) { struct blk_mq_hw_ctx *hctx; - struct request_queue *q; - bool shared; int i; - if (set->tag_list.next == set->tag_list.prev) - shared = false; - else - shared = true; + queue_for_each_hw_ctx(q, hctx, i) { + if (shared) + hctx->flags |= BLK_MQ_F_TAG_SHARED; + else + hctx->flags &= ~BLK_MQ_F_TAG_SHARED; + } +} + +static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared) +{ + struct request_queue *q; list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_freeze_queue(q); - - queue_for_each_hw_ctx(q, hctx, i) { - if (shared) - hctx->flags |= BLK_MQ_F_TAG_SHARED; - else - hctx->flags &= ~BLK_MQ_F_TAG_SHARED; - } + queue_set_hctx_shared(q, shared); blk_mq_unfreeze_queue(q); } } @@ -1884,7 +1927,12 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) mutex_lock(&set->tag_list_lock); list_del_init(&q->tag_set_list); - blk_mq_update_tag_set_depth(set); + if (list_is_singular(&set->tag_list)) { + /* just transitioned to unshared */ + set->flags &= ~BLK_MQ_F_TAG_SHARED; + /* update existing queue */ + blk_mq_update_tag_set_depth(set, false); + } mutex_unlock(&set->tag_list_lock); } @@ -1894,8 +1942,17 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, q->tag_set = set; mutex_lock(&set->tag_list_lock); + + /* Check to see if we're transitioning to shared (from 1 to 2 queues). */ + if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) { + set->flags |= BLK_MQ_F_TAG_SHARED; + /* update existing queue */ + blk_mq_update_tag_set_depth(set, true); + } + if (set->flags & BLK_MQ_F_TAG_SHARED) + queue_set_hctx_shared(q, true); list_add_tail(&q->tag_set_list, &set->tag_list); - blk_mq_update_tag_set_depth(set); + mutex_unlock(&set->tag_list_lock); } @@ -1918,6 +1975,9 @@ void blk_mq_release(struct request_queue *q) kfree(hctx); } + kfree(q->mq_map); + q->mq_map = NULL; + kfree(q->queue_hw_ctx); /* ctx kobj stays in queue_ctx */ @@ -1979,14 +2039,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, hctxs[i]->queue_num = i; } - /* - * Init percpu_ref in atomic mode so that it's faster to shutdown. - * See blk_register_queue() for details. - */ - if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, - PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) - goto err_hctxs; - setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); @@ -2027,13 +2079,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, if (blk_mq_init_hw_queues(q, set)) goto err_hctxs; + get_online_cpus(); mutex_lock(&all_q_mutex); - list_add_tail(&q->all_q_node, &all_q_list); - mutex_unlock(&all_q_mutex); + list_add_tail(&q->all_q_node, &all_q_list); blk_mq_add_queue_tag_set(set, q); + blk_mq_map_swqueue(q, cpu_online_mask); - blk_mq_map_swqueue(q); + mutex_unlock(&all_q_mutex); + put_online_cpus(); return q; @@ -2057,30 +2111,25 @@ void blk_mq_free_queue(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; + mutex_lock(&all_q_mutex); + list_del_init(&q->all_q_node); + mutex_unlock(&all_q_mutex); + blk_mq_del_queue_tag_set(q); blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); blk_mq_free_hw_queues(q, set); - - percpu_ref_exit(&q->mq_usage_counter); - - kfree(q->mq_map); - - q->mq_map = NULL; - - mutex_lock(&all_q_mutex); - list_del_init(&q->all_q_node); - mutex_unlock(&all_q_mutex); } /* Basically redo blk_mq_init_queue with queue frozen */ -static void blk_mq_queue_reinit(struct request_queue *q) +static void blk_mq_queue_reinit(struct request_queue *q, + const struct cpumask *online_mask) { - WARN_ON_ONCE(!q->mq_freeze_depth); + WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth)); blk_mq_sysfs_unregister(q); - blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); + blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask); /* * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe @@ -2088,7 +2137,7 @@ static void blk_mq_queue_reinit(struct request_queue *q) * involves free and re-allocate memory, worthy doing?) */ - blk_mq_map_swqueue(q); + blk_mq_map_swqueue(q, online_mask); blk_mq_sysfs_register(q); } @@ -2097,16 +2146,43 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, unsigned long action, void *hcpu) { struct request_queue *q; + int cpu = (unsigned long)hcpu; + /* + * New online cpumask which is going to be set in this hotplug event. + * Declare this cpumasks as global as cpu-hotplug operation is invoked + * one-by-one and dynamically allocating this could result in a failure. + */ + static struct cpumask online_new; /* - * Before new mappings are established, hotadded cpu might already - * start handling requests. This doesn't break anything as we map - * offline CPUs to first hardware queue. We will re-init the queue - * below to get optimal settings. + * Before hotadded cpu starts handling requests, new mappings must + * be established. Otherwise, these requests in hw queue might + * never be dispatched. + * + * For example, there is a single hw queue (hctx) and two CPU queues + * (ctx0 for CPU0, and ctx1 for CPU1). + * + * Now CPU1 is just onlined and a request is inserted into + * ctx1->rq_list and set bit0 in pending bitmap as ctx1->index_hw is + * still zero. + * + * And then while running hw queue, flush_busy_ctxs() finds bit0 is + * set in pending bitmap and tries to retrieve requests in + * hctx->ctxs[0]->rq_list. But htx->ctxs[0] is a pointer to ctx0, + * so the request in ctx1->rq_list is ignored. */ - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && - action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DEAD: + case CPU_UP_CANCELED: + cpumask_copy(&online_new, cpu_online_mask); + break; + case CPU_UP_PREPARE: + cpumask_copy(&online_new, cpu_online_mask); + cpumask_set_cpu(cpu, &online_new); + break; + default: return NOTIFY_OK; + } mutex_lock(&all_q_mutex); @@ -2130,7 +2206,7 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, } list_for_each_entry(q, &all_q_list, all_q_node) - blk_mq_queue_reinit(q); + blk_mq_queue_reinit(q, &online_new); list_for_each_entry(q, &all_q_list, all_q_node) blk_mq_unfreeze_queue(q); @@ -2193,6 +2269,12 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) return 0; } +struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags) +{ + return tags->cpumask; +} +EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask); + /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the