X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?p=kvmfornfv.git;a=blobdiff_plain;f=kernel%2Fdrivers%2Fmd%2Fraid5.c;h=adf72a9b6901020b03e48c0c28f89f11df0a3df0;hp=a90d5876fdadb21b4288315c1cdef02fdb993e58;hb=e09b41010ba33a20a87472ee821fa407a5b8da36;hpb=f93b97fd65072de626c074dbe099a1fff05ce060

diff --git a/kernel/drivers/md/raid5.c b/kernel/drivers/md/raid5.c
index a90d5876f..adf72a9b6 100644
--- a/kernel/drivers/md/raid5.c
+++ b/kernel/drivers/md/raid5.c
@@ -223,18 +223,14 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
 	return slot;
 }
 
-static void return_io(struct bio *return_bi)
+static void return_io(struct bio_list *return_bi)
 {
-	struct bio *bi = return_bi;
-	while (bi) {
-
-		return_bi = bi->bi_next;
-		bi->bi_next = NULL;
+	struct bio *bi;
+	while ((bi = bio_list_pop(return_bi)) != NULL) {
 		bi->bi_iter.bi_size = 0;
 		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
 					 bi, 0);
-		bio_endio(bi, 0);
-		bi = return_bi;
+		bio_endio(bi);
 	}
 }
 
@@ -344,7 +340,8 @@ static void release_inactive_stripe_list(struct r5conf *conf,
 					 int hash)
 {
 	int size;
-	bool do_wakeup = false;
+	unsigned long do_wakeup = 0;
+	int i = 0;
 	unsigned long flags;
 
 	if (hash == NR_STRIPE_HASH_LOCKS) {
@@ -356,7 +353,7 @@ static void release_inactive_stripe_list(struct r5conf *conf,
 		struct list_head *list = &temp_inactive_list[size - 1];
 
 		/*
-		 * We don't hold any lock here yet, get_active_stripe() might
+		 * We don't hold any lock here yet, raid5_get_active_stripe() might
 		 * remove stripes from the list
 		 */
 		if (!list_empty_careful(list)) {
@@ -365,15 +362,21 @@ static void release_inactive_stripe_list(struct r5conf *conf,
 			    !list_empty(list))
 				atomic_dec(&conf->empty_inactive_list_nr);
 			list_splice_tail_init(list, conf->inactive_list + hash);
-			do_wakeup = true;
+			do_wakeup |= 1 << hash;
 			spin_unlock_irqrestore(conf->hash_locks + hash, flags);
 		}
 		size--;
 		hash--;
 	}
 
+	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
+		if (do_wakeup & (1 << i))
+			wake_up(&conf->wait_for_stripe[i]);
+	}
+
 	if (do_wakeup) {
-		wake_up(&conf->wait_for_stripe);
+		if (atomic_read(&conf->active_stripes) == 0)
+			wake_up(&conf->wait_for_quiescent);
 		if (conf->retry_read_aligned)
 			md_wakeup_thread(conf->mddev->thread);
 	}
@@ -410,7 +413,7 @@ static int release_stripe_list(struct r5conf *conf,
 	return count;
 }
 
-static void release_stripe(struct stripe_head *sh)
+void raid5_release_stripe(struct stripe_head *sh)
 {
 	struct r5conf *conf = sh->raid_conf;
 	unsigned long flags;
@@ -655,9 +658,9 @@ static int has_failed(struct r5conf *conf)
 	return 0;
 }
 
-static struct stripe_head *
-get_active_stripe(struct r5conf *conf, sector_t sector,
-		  int previous, int noblock, int noquiesce)
+struct stripe_head *
+raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
+			int previous, int noblock, int noquiesce)
 {
 	struct stripe_head *sh;
 	int hash = stripe_hash_locks_hash(sector);
@@ -667,15 +670,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 	spin_lock_irq(conf->hash_locks + hash);
 
 	do {
-		wait_event_lock_irq(conf->wait_for_stripe,
+		wait_event_lock_irq(conf->wait_for_quiescent,
 				    conf->quiesce == 0 || noquiesce,
 				    *(conf->hash_locks + hash));
 		sh = __find_stripe(conf, sector, conf->generation - previous);
 		if (!sh) {
 			if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
 				sh = get_free_stripe(conf, hash);
-				if (!sh && llist_empty(&conf->released_stripes) &&
-				    !test_bit(R5_DID_ALLOC, &conf->cache_state))
+				if (!sh && !test_bit(R5_DID_ALLOC,
+						     &conf->cache_state))
 					set_bit(R5_ALLOC_MORE,
 						&conf->cache_state);
 			}
@@ -684,14 +687,15 @@ get_active_stripe(struct r5conf *conf,
sector_t sector, if (!sh) { set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); - wait_event_lock_irq( - conf->wait_for_stripe, + wait_event_exclusive_cmd( + conf->wait_for_stripe[hash], !list_empty(conf->inactive_list + hash) && (atomic_read(&conf->active_stripes) < (conf->max_nr_stripes * 3 / 4) || !test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)), - *(conf->hash_locks + hash)); + spin_unlock_irq(conf->hash_locks + hash), + spin_lock_irq(conf->hash_locks + hash)); clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); } else { @@ -716,6 +720,9 @@ get_active_stripe(struct r5conf *conf, sector_t sector, } } while (sh == NULL); + if (!list_empty(conf->inactive_list + hash)) + wake_up(&conf->wait_for_stripe[hash]); + spin_unlock_irq(conf->hash_locks + hash); return sh; } @@ -748,6 +755,10 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) /* Only freshly new full stripe normal write stripe can be added to a batch list */ static bool stripe_can_batch(struct stripe_head *sh) { + struct r5conf *conf = sh->raid_conf; + + if (conf->log) + return false; return test_bit(STRIPE_BATCH_READY, &sh->state) && !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && is_full_stripe_write(sh); @@ -851,7 +862,7 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh unlock_out: unlock_two_stripes(head, sh); out: - release_stripe(head); + raid5_release_stripe(head); } /* Determine if 'data_offset' or 'new_data_offset' should be used @@ -876,9 +887,9 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) } static void -raid5_end_read_request(struct bio *bi, int error); +raid5_end_read_request(struct bio *bi); static void -raid5_end_write_request(struct bio *bi, int error); +raid5_end_write_request(struct bio *bi); static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) { @@ -888,6 +899,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) might_sleep(); + if (r5l_write_stripe(conf->log, sh) == 0) + return; for (i = disks; i--; ) { int rw; int replace_only = 0; @@ -1166,7 +1179,7 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, static void ops_complete_biofill(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - struct bio *return_bi = NULL; + struct bio_list return_bi = BIO_EMPTY_LIST; int i; pr_debug("%s: stripe %llu\n", __func__, @@ -1190,20 +1203,18 @@ static void ops_complete_biofill(void *stripe_head_ref) while (rbi && rbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); - if (!raid5_dec_bi_active_stripes(rbi)) { - rbi->bi_next = return_bi; - return_bi = rbi; - } + if (!raid5_dec_bi_active_stripes(rbi)) + bio_list_add(&return_bi, rbi); rbi = rbi2; } } } clear_bit(STRIPE_BIOFILL_RUN, &sh->state); - return_io(return_bi); + return_io(&return_bi); set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); } static void ops_run_biofill(struct stripe_head *sh) @@ -1266,7 +1277,7 @@ static void ops_complete_compute(void *stripe_head_ref) if (sh->check_state == check_state_compute_run) sh->check_state = check_state_compute_result; set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); } /* return a pointer to the address conversion region of the scribble buffer */ @@ -1692,7 +1703,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref) } set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); } static void @@ -1850,7 +1861,7 @@ 
static void ops_complete_check(void *stripe_head_ref) sh->check_state = check_state_check_result; set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); } static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) @@ -2014,7 +2025,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) /* we just created an active stripe so... */ atomic_inc(&conf->active_stripes); - release_stripe(sh); + raid5_release_stripe(sh); conf->max_nr_stripes++; return 1; } @@ -2183,7 +2194,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) cnt = 0; list_for_each_entry(nsh, &newstripes, lru) { lock_device_hash_lock(conf, hash); - wait_event_cmd(conf->wait_for_stripe, + wait_event_exclusive_cmd(conf->wait_for_stripe[hash], !list_empty(conf->inactive_list + hash), unlock_device_hash_lock(conf, hash), lock_device_hash_lock(conf, hash)); @@ -2233,7 +2244,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) if (!p) err = -ENOMEM; } - release_stripe(nsh); + raid5_release_stripe(nsh); } /* critical section pass, GFP_NOIO no longer needed */ @@ -2268,17 +2279,15 @@ static void shrink_stripes(struct r5conf *conf) drop_one_stripe(conf)) ; - if (conf->slab_cache) - kmem_cache_destroy(conf->slab_cache); + kmem_cache_destroy(conf->slab_cache); conf->slab_cache = NULL; } -static void raid5_end_read_request(struct bio * bi, int error) +static void raid5_end_read_request(struct bio * bi) { struct stripe_head *sh = bi->bi_private; struct r5conf *conf = sh->raid_conf; int disks = sh->disks, i; - int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); char b[BDEVNAME_SIZE]; struct md_rdev *rdev = NULL; sector_t s; @@ -2287,9 +2296,9 @@ static void raid5_end_read_request(struct bio * bi, int error) if (bi == &sh->dev[i].req) break; - pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", + pr_debug("end_read_request %llu/%d, count: %d, error %d.\n", (unsigned long long)sh->sector, i, atomic_read(&sh->count), - uptodate); + bi->bi_error); if (i == disks) { BUG(); return; @@ -2308,7 +2317,7 @@ static void raid5_end_read_request(struct bio * bi, int error) s = sh->sector + rdev->new_data_offset; else s = sh->sector + rdev->data_offset; - if (uptodate) { + if (!bi->bi_error) { set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { /* Note that this cannot happen on a @@ -2393,16 +2402,15 @@ static void raid5_end_read_request(struct bio * bi, int error) rdev_dec_pending(rdev, conf->mddev); clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); } -static void raid5_end_write_request(struct bio *bi, int error) +static void raid5_end_write_request(struct bio *bi) { struct stripe_head *sh = bi->bi_private; struct r5conf *conf = sh->raid_conf; int disks = sh->disks, i; struct md_rdev *uninitialized_var(rdev); - int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); sector_t first_bad; int bad_sectors; int replacement = 0; @@ -2425,23 +2433,23 @@ static void raid5_end_write_request(struct bio *bi, int error) break; } } - pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", + pr_debug("end_write_request %llu/%d, count %d, error: %d.\n", (unsigned long long)sh->sector, i, atomic_read(&sh->count), - uptodate); + bi->bi_error); if (i == disks) { BUG(); return; } if (replacement) { - if (!uptodate) + if (bi->bi_error) md_error(conf->mddev, rdev); else if (is_badblock(rdev, sh->sector, STRIPE_SECTORS, &first_bad, &bad_sectors)) 
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); } else { - if (!uptodate) { + if (bi->bi_error) { set_bit(STRIPE_DEGRADED, &sh->state); set_bit(WriteErrorSeen, &rdev->flags); set_bit(R5_WriteError, &sh->dev[i].flags); @@ -2462,20 +2470,18 @@ static void raid5_end_write_request(struct bio *bi, int error) } rdev_dec_pending(rdev, conf->mddev); - if (sh->batch_head && !uptodate && !replacement) + if (sh->batch_head && bi->bi_error && !replacement) set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); if (sh->batch_head && sh != sh->batch_head) - release_stripe(sh->batch_head); + raid5_release_stripe(sh->batch_head); } -static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); - static void raid5_build_block(struct stripe_head *sh, int i, int previous) { struct r5dev *dev = &sh->dev[i]; @@ -2491,7 +2497,7 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous) dev->rreq.bi_private = sh; dev->flags = 0; - dev->sector = compute_blocknr(sh, i, previous); + dev->sector = raid5_compute_blocknr(sh, i, previous); } static void error(struct mddev *mddev, struct md_rdev *rdev) @@ -2510,6 +2516,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); printk(KERN_ALERT "md/raid:%s: Disk failure on %s, disabling device.\n" "md/raid:%s: Operation continuing on %d devices.\n", @@ -2523,9 +2530,9 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) * Input: a 'big' sector number, * Output: index of the data and parity disk, and the sector # in them. 
*/ -static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, - int previous, int *dd_idx, - struct stripe_head *sh) +sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, + int previous, int *dd_idx, + struct stripe_head *sh) { sector_t stripe, stripe2; sector_t chunk_number; @@ -2725,7 +2732,7 @@ static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, return new_sector; } -static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) +sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) { struct r5conf *conf = sh->raid_conf; int raid_disks = sh->disks; @@ -3062,7 +3069,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, static void handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks, - struct bio **return_bi) + struct bio_list *return_bi) { int i; BUG_ON(sh->batch_head); @@ -3097,17 +3104,19 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (bi) bitmap_end = 1; + r5l_stripe_write_finished(sh); + if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - clear_bit(BIO_UPTODATE, &bi->bi_flags); + + bi->bi_error = -EIO; if (!raid5_dec_bi_active_stripes(bi)) { md_write_end(conf->mddev); - bi->bi_next = *return_bi; - *return_bi = bi; + bio_list_add(return_bi, bi); } bi = nextbi; } @@ -3127,11 +3136,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); - clear_bit(BIO_UPTODATE, &bi->bi_flags); + + bi->bi_error = -EIO; if (!raid5_dec_bi_active_stripes(bi)) { md_write_end(conf->mddev); - bi->bi_next = *return_bi; - *return_bi = bi; + bio_list_add(return_bi, bi); } bi = bi2; } @@ -3140,6 +3149,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, * the data has not reached the cache yet. 
*/ if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && + s->failed > conf->max_degraded && (!test_bit(R5_Insync, &sh->dev[i].flags) || test_bit(R5_ReadError, &sh->dev[i].flags))) { spin_lock_irq(&sh->stripe_lock); @@ -3148,15 +3158,16 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, spin_unlock_irq(&sh->stripe_lock); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); + if (bi) + s->to_read--; while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (!raid5_dec_bi_active_stripes(bi)) { - bi->bi_next = *return_bi; - *return_bi = bi; - } + + bi->bi_error = -EIO; + if (!raid5_dec_bi_active_stripes(bi)) + bio_list_add(return_bi, bi); bi = nextbi; } } @@ -3168,6 +3179,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, */ clear_bit(R5_LOCKED, &sh->dev[i].flags); } + s->to_write = 0; + s->written = 0; if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) if (atomic_dec_and_test(&conf->pending_full_writes)) @@ -3299,7 +3312,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, */ return 0; - for (i = 0; i < s->failed; i++) { + for (i = 0; i < s->failed && i < 2; i++) { if (fdev[i]->towrite && !test_bit(R5_UPTODATE, &fdev[i]->flags) && !test_bit(R5_OVERWRITE, &fdev[i]->flags)) @@ -3323,7 +3336,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, sh->sector < sh->raid_conf->mddev->recovery_cp) /* reconstruct-write isn't being forced */ return 0; - for (i = 0; i < s->failed; i++) { + for (i = 0; i < s->failed && i < 2; i++) { if (s->failed_num[i] != sh->pd_idx && s->failed_num[i] != sh->qd_idx && !test_bit(R5_UPTODATE, &fdev[i]->flags) && @@ -3435,7 +3448,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, * never LOCKED, so we don't need to test 'failed' directly. 
*/ static void handle_stripe_clean_event(struct r5conf *conf, - struct stripe_head *sh, int disks, struct bio **return_bi) + struct stripe_head *sh, int disks, struct bio_list *return_bi) { int i; struct r5dev *dev; @@ -3469,8 +3482,7 @@ returnbi: wbi2 = r5_next_bio(wbi, dev->sector); if (!raid5_dec_bi_active_stripes(wbi)) { md_write_end(conf->mddev); - wbi->bi_next = *return_bi; - *return_bi = wbi; + bio_list_add(return_bi, wbi); } wbi = wbi2; } @@ -3494,8 +3506,12 @@ returnbi: WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); WARN_ON(dev->page != dev->orig_page); } + + r5l_stripe_write_finished(sh); + if (!discard_pending && test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { + int hash; clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); if (sh->qd_idx >= 0) { @@ -3509,16 +3525,17 @@ returnbi: * no updated data, so remove it from hash list and the stripe * will be reinitialized */ - spin_lock_irq(&conf->device_lock); unhash: + hash = sh->hash_lock_index; + spin_lock_irq(conf->hash_locks + hash); remove_hash(sh); + spin_unlock_irq(conf->hash_locks + hash); if (head_sh->batch_head) { sh = list_first_entry(&sh->batch_list, struct stripe_head, batch_list); if (sh != head_sh) goto unhash; } - spin_unlock_irq(&conf->device_lock); sh = head_sh; if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) @@ -3934,10 +3951,10 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) struct stripe_head *sh2; struct async_submit_ctl submit; - sector_t bn = compute_blocknr(sh, i, 1); + sector_t bn = raid5_compute_blocknr(sh, i, 1); sector_t s = raid5_compute_sector(conf, bn, 0, &dd_idx, NULL); - sh2 = get_active_stripe(conf, s, 0, 1, 1); + sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1); if (sh2 == NULL) /* so far only the early blocks of this stripe * have been requested. 
When later blocks @@ -3947,7 +3964,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) if (!test_bit(STRIPE_EXPANDING, &sh2->state) || test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { /* must have already done this block */ - release_stripe(sh2); + raid5_release_stripe(sh2); continue; } @@ -3968,7 +3985,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) set_bit(STRIPE_EXPAND_READY, &sh2->state); set_bit(STRIPE_HANDLE, &sh2->state); } - release_stripe(sh2); + raid5_release_stripe(sh2); } /* done submitting copies, wait for them to complete */ @@ -4003,6 +4020,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; s->failed_num[0] = -1; s->failed_num[1] = -1; + s->log_failed = r5l_log_disk_error(conf); /* Now to look around and see what can be done */ rcu_read_lock(); @@ -4057,8 +4075,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) &first_bad, &bad_sectors)) set_bit(R5_ReadRepl, &dev->flags); else { - if (rdev) + if (rdev && !test_bit(Faulty, &rdev->flags)) set_bit(R5_NeedReplace, &dev->flags); + else + clear_bit(R5_NeedReplace, &dev->flags); rdev = rcu_dereference(conf->disks[i].rdev); clear_bit(R5_ReadRepl, &dev->flags); } @@ -4252,7 +4272,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, if (handle_flags == 0 || sh->state & handle_flags) set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); } spin_lock_irq(&head_sh->stripe_lock); head_sh->batch_head = NULL; @@ -4313,6 +4333,9 @@ static void handle_stripe(struct stripe_head *sh) analyse_stripe(sh, &s); + if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) + goto finish; + if (s.handle_bad_blocks) { set_bit(STRIPE_HANDLE, &sh->state); goto finish; @@ -4341,7 +4364,7 @@ static void handle_stripe(struct stripe_head *sh) /* check if the array has lost more than max_degraded devices and, * if so, some requests might need to be failed. */ - if (s.failed > conf->max_degraded) { + if (s.failed > conf->max_degraded || s.log_failed) { sh->check_state = 0; sh->reconstruct_state = 0; break_stripe_batch_list(sh, 0); @@ -4499,7 +4522,7 @@ static void handle_stripe(struct stripe_head *sh) /* Finish reconstruct operations initiated by the expansion process */ if (sh->reconstruct_state == reconstruct_state_result) { struct stripe_head *sh_src - = get_active_stripe(conf, sh->sector, 1, 1, 1); + = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1); if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { /* sh cannot be written until sh_src has been read. 
* so arrange for sh to be delayed a little @@ -4509,11 +4532,11 @@ static void handle_stripe(struct stripe_head *sh) if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh_src->state)) atomic_inc(&conf->preread_active_stripes); - release_stripe(sh_src); + raid5_release_stripe(sh_src); goto finish; } if (sh_src) - release_stripe(sh_src); + raid5_release_stripe(sh_src); sh->reconstruct_state = reconstruct_state_idle; clear_bit(STRIPE_EXPANDING, &sh->state); @@ -4601,7 +4624,15 @@ finish: md_wakeup_thread(conf->mddev->thread); } - return_io(s.return_bi); + if (!bio_list_empty(&s.return_bi)) { + if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) { + spin_lock_irq(&conf->device_lock); + bio_list_merge(&conf->return_bi, &s.return_bi); + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(conf->mddev->thread); + } else + return_io(&s.return_bi); + } clear_bit_unlock(STRIPE_ACTIVE, &sh->state); } @@ -4658,43 +4689,14 @@ static int raid5_congested(struct mddev *mddev, int bits) return 0; } -/* We want read requests to align with chunks where possible, - * but write requests don't need to. - */ -static int raid5_mergeable_bvec(struct mddev *mddev, - struct bvec_merge_data *bvm, - struct bio_vec *biovec) -{ - sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); - int max; - unsigned int chunk_sectors = mddev->chunk_sectors; - unsigned int bio_sectors = bvm->bi_size >> 9; - - /* - * always allow writes to be mergeable, read as well if array - * is degraded as we'll go through stripe cache anyway. - */ - if ((bvm->bi_rw & 1) == WRITE || mddev->degraded) - return biovec->bv_len; - - if (mddev->new_chunk_sectors < mddev->chunk_sectors) - chunk_sectors = mddev->new_chunk_sectors; - max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; - if (max < 0) max = 0; - if (max <= biovec->bv_len && bio_sectors == 0) - return biovec->bv_len; - else - return max; -} - static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) { + struct r5conf *conf = mddev->private; sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev); - unsigned int chunk_sectors = mddev->chunk_sectors; + unsigned int chunk_sectors; unsigned int bio_sectors = bio_sectors(bio); - if (mddev->new_chunk_sectors < mddev->chunk_sectors) - chunk_sectors = mddev->new_chunk_sectors; + chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); return chunk_sectors >= ((sector & (chunk_sectors - 1)) + bio_sectors); } @@ -4745,13 +4747,13 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf) * first). * If the read failed.. 
*/ -static void raid5_align_endio(struct bio *bi, int error) +static void raid5_align_endio(struct bio *bi) { struct bio* raid_bi = bi->bi_private; struct mddev *mddev; struct r5conf *conf; - int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); struct md_rdev *rdev; + int error = bi->bi_error; bio_put(bi); @@ -4762,12 +4764,12 @@ static void raid5_align_endio(struct bio *bi, int error) rdev_dec_pending(rdev, conf->mddev); - if (!error && uptodate) { + if (!error) { trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), raid_bi, 0); - bio_endio(raid_bi, 0); + bio_endio(raid_bi); if (atomic_dec_and_test(&conf->active_aligned_reads)) - wake_up(&conf->wait_for_stripe); + wake_up(&conf->wait_for_quiescent); return; } @@ -4776,26 +4778,7 @@ static void raid5_align_endio(struct bio *bi, int error) add_bio_to_retry(raid_bi, conf); } -static int bio_fits_rdev(struct bio *bi) -{ - struct request_queue *q = bdev_get_queue(bi->bi_bdev); - - if (bio_sectors(bi) > queue_max_sectors(q)) - return 0; - blk_recount_segments(q, bi); - if (bi->bi_phys_segments > queue_max_segments(q)) - return 0; - - if (q->merge_bvec_fn) - /* it's too hard to apply the merge_bvec_fn at this stage, - * just just give up - */ - return 0; - - return 1; -} - -static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) +static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) { struct r5conf *conf = mddev->private; int dd_idx; @@ -4804,7 +4787,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) sector_t end_sector; if (!in_chunk_boundary(mddev, raid_bio)) { - pr_debug("chunk_aligned_read : non aligned\n"); + pr_debug("%s: non aligned\n", __func__); return 0; } /* @@ -4846,13 +4829,11 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) rcu_read_unlock(); raid_bio->bi_next = (void*)rdev; align_bi->bi_bdev = rdev->bdev; - __clear_bit(BIO_SEG_VALID, &align_bi->bi_flags); + bio_clear_flag(align_bi, BIO_SEG_VALID); - if (!bio_fits_rdev(align_bi) || - is_badblock(rdev, align_bi->bi_iter.bi_sector, + if (is_badblock(rdev, align_bi->bi_iter.bi_sector, bio_sectors(align_bi), &first_bad, &bad_sectors)) { - /* too big in some way, or has a known bad block */ bio_put(align_bi); rdev_dec_pending(rdev, mddev); return 0; @@ -4862,7 +4843,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) align_bi->bi_iter.bi_sector += rdev->data_offset; spin_lock_irq(&conf->device_lock); - wait_event_lock_irq(conf->wait_for_stripe, + wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0, conf->device_lock); atomic_inc(&conf->active_aligned_reads); @@ -4881,6 +4862,31 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) } } +static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) +{ + struct bio *split; + + do { + sector_t sector = raid_bio->bi_iter.bi_sector; + unsigned chunk_sects = mddev->chunk_sectors; + unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); + + if (sectors < bio_sectors(raid_bio)) { + split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set); + bio_chain(split, raid_bio); + } else + split = raid_bio; + + if (!raid5_read_one_chunk(mddev, split)) { + if (split != raid_bio) + generic_make_request(raid_bio); + return split; + } + } while (split != raid_bio); + + return NULL; +} + /* __get_priority_stripe - get the next stripe to process * * Full stripe writes are allowed to pass preread active stripes up until @@ -5022,7 +5028,7 @@ static void 
release_stripe_plug(struct mddev *mddev, struct raid5_plug_cb *cb; if (!blk_cb) { - release_stripe(sh); + raid5_release_stripe(sh); return; } @@ -5038,7 +5044,7 @@ static void release_stripe_plug(struct mddev *mddev, if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) list_add_tail(&sh->lru, &cb->list); else - release_stripe(sh); + raid5_release_stripe(sh); } static void make_discard_request(struct mddev *mddev, struct bio *bi) @@ -5073,12 +5079,12 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) DEFINE_WAIT(w); int d; again: - sh = get_active_stripe(conf, logical_sector, 0, 0, 0); + sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); if (test_bit(STRIPE_SYNCING, &sh->state)) { - release_stripe(sh); + raid5_release_stripe(sh); schedule(); goto again; } @@ -5090,7 +5096,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) if (sh->dev[d].towrite || sh->dev[d].toread) { set_bit(R5_Overlap, &sh->dev[d].flags); spin_unlock_irq(&sh->stripe_lock); - release_stripe(sh); + raid5_release_stripe(sh); schedule(); goto again; } @@ -5129,7 +5135,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) remaining = raid5_dec_bi_active_stripes(bi); if (remaining == 0) { md_write_end(mddev); - bio_endio(bi, 0); + bio_endio(bi); } } @@ -5146,8 +5152,15 @@ static void make_request(struct mddev *mddev, struct bio * bi) bool do_prepare; if (unlikely(bi->bi_rw & REQ_FLUSH)) { - md_flush_request(mddev, bi); - return; + int ret = r5l_handle_flush_request(conf->log, bi); + + if (ret == 0) + return; + if (ret == -ENODEV) { + md_flush_request(mddev, bi); + return; + } + /* ret == -EAGAIN, fallback */ } md_write_start(mddev, bi); @@ -5158,9 +5171,11 @@ static void make_request(struct mddev *mddev, struct bio * bi) * data on failed drives. */ if (rw == READ && mddev->degraded == 0 && - mddev->reshape_position == MaxSector && - chunk_aligned_read(mddev,bi)) - return; + mddev->reshape_position == MaxSector) { + bi = chunk_aligned_read(mddev, bi); + if (!bi) + return; + } if (unlikely(bi->bi_rw & REQ_DISCARD)) { make_discard_request(mddev, bi); @@ -5218,7 +5233,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) (unsigned long long)new_sector, (unsigned long long)logical_sector); - sh = get_active_stripe(conf, new_sector, previous, + sh = raid5_get_active_stripe(conf, new_sector, previous, (bi->bi_rw&RWA_MASK), 0); if (sh) { if (unlikely(previous)) { @@ -5239,7 +5254,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) must_retry = 1; spin_unlock_irq(&conf->device_lock); if (must_retry) { - release_stripe(sh); + raid5_release_stripe(sh); schedule(); do_prepare = true; goto retry; @@ -5249,14 +5264,14 @@ static void make_request(struct mddev *mddev, struct bio * bi) /* Might have got the wrong stripe_head * by accident */ - release_stripe(sh); + raid5_release_stripe(sh); goto retry; } if (rw == WRITE && logical_sector >= mddev->suspend_lo && logical_sector < mddev->suspend_hi) { - release_stripe(sh); + raid5_release_stripe(sh); /* As the suspend_* range is controlled by * userspace, we want an interruptible * wait. 
@@ -5279,7 +5294,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) * and wait a while */ md_wakeup_thread(mddev->thread); - release_stripe(sh); + raid5_release_stripe(sh); schedule(); do_prepare = true; goto retry; @@ -5293,7 +5308,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) release_stripe_plug(mddev, sh); } else { /* cannot get stripe for read-ahead, just give-up */ - clear_bit(BIO_UPTODATE, &bi->bi_flags); + bi->bi_error = -EIO; break; } } @@ -5307,7 +5322,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), bi, 0); - bio_endio(bi, 0); + bio_endio(bi); } } @@ -5336,6 +5351,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk sector_t stripe_addr; int reshape_sectors; struct list_head stripes; + sector_t retn; if (sector_nr == 0) { /* If restarting in the middle, skip the initial sectors */ @@ -5343,6 +5359,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk conf->reshape_progress < raid5_size(mddev, 0, 0)) { sector_nr = raid5_size(mddev, 0, 0) - conf->reshape_progress; + } else if (mddev->reshape_backwards && + conf->reshape_progress == MaxSector) { + /* shouldn't happen, but just in case, finish up.*/ + sector_nr = MaxSector; } else if (!mddev->reshape_backwards && conf->reshape_progress > 0) sector_nr = conf->reshape_progress; @@ -5351,7 +5371,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk mddev->curr_resync_completed = sector_nr; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); *skipped = 1; - return sector_nr; + retn = sector_nr; + goto finish; } } @@ -5359,10 +5380,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk * If old and new chunk sizes differ, we need to process the * largest of these */ - if (mddev->new_chunk_sectors > mddev->chunk_sectors) - reshape_sectors = mddev->new_chunk_sectors; - else - reshape_sectors = mddev->chunk_sectors; + + reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors); /* We update the metadata at least every 10 seconds, or when * the data about to be copied would over-write the source of @@ -5377,11 +5396,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk safepos = conf->reshape_safe; sector_div(safepos, data_disks); if (mddev->reshape_backwards) { - writepos -= min_t(sector_t, reshape_sectors, writepos); + BUG_ON(writepos < reshape_sectors); + writepos -= reshape_sectors; readpos += reshape_sectors; safepos += reshape_sectors; } else { writepos += reshape_sectors; + /* readpos and safepos are worst-case calculations. + * A negative number is overly pessimistic, and causes + * obvious problems for unsigned storage. So clip to 0. 
+ */ readpos -= min_t(sector_t, reshape_sectors, readpos); safepos -= min_t(sector_t, reshape_sectors, safepos); } @@ -5457,7 +5481,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { int j; int skipped_disk = 0; - sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); + sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1); set_bit(STRIPE_EXPANDING, &sh->state); atomic_inc(&conf->reshape_stripes); /* If any of this stripe is beyond the end of the old @@ -5470,7 +5494,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk if (conf->level == 6 && j == sh->qd_idx) continue; - s = compute_blocknr(sh, j, 0); + s = raid5_compute_blocknr(sh, j, 0); if (s < raid5_size(mddev, 0, 0)) { skipped_disk = 1; continue; @@ -5506,10 +5530,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk if (last_sector >= mddev->dev_sectors) last_sector = mddev->dev_sectors - 1; while (first_sector <= last_sector) { - sh = get_active_stripe(conf, first_sector, 1, 0, 1); + sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1); set_bit(STRIPE_EXPAND_SOURCE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); first_sector += STRIPE_SECTORS; } /* Now that the sources are clearly marked, we can release @@ -5518,13 +5542,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk while (!list_empty(&stripes)) { sh = list_entry(stripes.next, struct stripe_head, lru); list_del_init(&sh->lru); - release_stripe(sh); + raid5_release_stripe(sh); } /* If this takes us to the resync_max point where we have to pause, * then we need to write out the superblock. */ sector_nr += reshape_sectors; - if ((sector_nr - mddev->curr_resync_completed) * 2 + retn = reshape_sectors; +finish: + if (mddev->curr_resync_completed > mddev->resync_max || + (sector_nr - mddev->curr_resync_completed) * 2 >= mddev->resync_max - mddev->curr_resync_completed) { /* Cannot proceed until we've updated the superblock... 
*/ wait_event(conf->wait_for_overlap, @@ -5549,7 +5576,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } ret: - return reshape_sectors; + return retn; } static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped) @@ -5611,11 +5638,11 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ } - bitmap_cond_end_sync(mddev->bitmap, sector_nr); + bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); - sh = get_active_stripe(conf, sector_nr, 0, 1, 0); + sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0); if (sh == NULL) { - sh = get_active_stripe(conf, sector_nr, 0, 0, 0); + sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0); /* make sure we don't swamp the stripe cache if someone else * is trying to get access */ @@ -5639,7 +5666,7 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int set_bit(STRIPE_SYNC_REQUESTED, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); return STRIPE_SECTORS; } @@ -5678,7 +5705,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) /* already done this stripe */ continue; - sh = get_active_stripe(conf, sector, 0, 1, 1); + sh = raid5_get_active_stripe(conf, sector, 0, 1, 1); if (!sh) { /* failed to get a stripe - must wait */ @@ -5688,7 +5715,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) } if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { - release_stripe(sh); + raid5_release_stripe(sh); raid5_set_bi_processed_stripes(raid_bio, scnt); conf->retry_read_aligned = raid_bio; return handled; @@ -5696,17 +5723,17 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); handle_stripe(sh); - release_stripe(sh); + raid5_release_stripe(sh); handled++; } remaining = raid5_dec_bi_active_stripes(raid_bio); if (remaining == 0) { trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), raid_bio, 0); - bio_endio(raid_bio, 0); + bio_endio(raid_bio); } if (atomic_dec_and_test(&conf->active_aligned_reads)) - wake_up(&conf->wait_for_stripe); + wake_up(&conf->wait_for_quiescent); return handled; } @@ -5726,8 +5753,12 @@ static int handle_active_stripes(struct r5conf *conf, int group, for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) if (!list_empty(temp_inactive_list + i)) break; - if (i == NR_STRIPE_HASH_LOCKS) + if (i == NR_STRIPE_HASH_LOCKS) { + spin_unlock_irq(&conf->device_lock); + r5l_flush_stripe_to_raid(conf->log); + spin_lock_irq(&conf->device_lock); return batch_size; + } release_inactive = true; } spin_unlock_irq(&conf->device_lock); @@ -5735,6 +5766,7 @@ static int handle_active_stripes(struct r5conf *conf, int group, release_inactive_stripe_list(conf, temp_inactive_list, NR_STRIPE_HASH_LOCKS); + r5l_flush_stripe_to_raid(conf->log); if (release_inactive) { spin_lock_irq(&conf->device_lock); return 0; @@ -5742,6 +5774,7 @@ static int handle_active_stripes(struct r5conf *conf, int group, for (i = 0; i < batch_size; i++) handle_stripe(batch[i]); + r5l_write_stripe_run(conf->log); cond_resched(); @@ -5805,6 +5838,18 @@ static void raid5d(struct md_thread *thread) md_check_recovery(mddev); + if (!bio_list_empty(&conf->return_bi) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + struct bio_list tmp = BIO_EMPTY_LIST; + 
spin_lock_irq(&conf->device_lock); + if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + bio_list_merge(&tmp, &conf->return_bi); + bio_list_init(&conf->return_bi); + } + spin_unlock_irq(&conf->device_lock); + return_io(&tmp); + } + blk_start_plug(&plug); handled = 0; spin_lock_irq(&conf->device_lock); @@ -5863,6 +5908,8 @@ static void raid5d(struct md_thread *thread) mutex_unlock(&conf->cache_size_mutex); } + r5l_flush_stripe_to_raid(conf->log); + async_tx_issue_pending_all(); blk_finish_plug(&plug); @@ -6245,8 +6292,8 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) /* size is defined by the smallest of previous and new size */ raid_disks = min(conf->raid_disks, conf->previous_raid_disks); - sectors &= ~((sector_t)mddev->chunk_sectors - 1); - sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); + sectors &= ~((sector_t)conf->chunk_sectors - 1); + sectors &= ~((sector_t)conf->prev_chunk_sectors - 1); return sectors * (raid_disks - conf->max_degraded); } @@ -6300,8 +6347,11 @@ static void raid5_free_percpu(struct r5conf *conf) static void free_conf(struct r5conf *conf) { + if (conf->log) + r5l_exit_log(conf->log); if (conf->shrinker.seeks) unregister_shrinker(&conf->shrinker); + free_thread_groups(conf); shrink_stripes(conf); raid5_free_percpu(conf); @@ -6456,12 +6506,16 @@ static struct r5conf *setup_conf(struct mddev *mddev) spin_lock_init(&conf->device_lock); seqcount_init(&conf->gen_lock); mutex_init(&conf->cache_size_mutex); - init_waitqueue_head(&conf->wait_for_stripe); + init_waitqueue_head(&conf->wait_for_quiescent); + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) { + init_waitqueue_head(&conf->wait_for_stripe[i]); + } init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); INIT_LIST_HEAD(&conf->hold_list); INIT_LIST_HEAD(&conf->delayed_list); INIT_LIST_HEAD(&conf->bitmap_list); + bio_list_init(&conf->return_bi); init_llist_head(&conf->released_stripes); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); @@ -6511,7 +6565,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) rdev_for_each(rdev, mddev) { raid_disk = rdev->raid_disk; if (raid_disk >= max_disks - || raid_disk < 0) + || raid_disk < 0 || test_bit(Journal, &rdev->flags)) continue; disk = conf->disks + raid_disk; @@ -6551,6 +6605,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (conf->reshape_progress != MaxSector) { conf->prev_chunk_sectors = mddev->chunk_sectors; conf->prev_algo = mddev->layout; + } else { + conf->prev_chunk_sectors = conf->chunk_sectors; + conf->prev_algo = conf->algorithm; } conf->min_nr_stripes = NR_STRIPES; @@ -6628,6 +6685,7 @@ static int run(struct mddev *mddev) int working_disks = 0; int dirty_parity_disks = 0; struct md_rdev *rdev; + struct md_rdev *journal_dev = NULL; sector_t reshape_offset = 0; int i; long long min_offset_diff = 0; @@ -6640,6 +6698,11 @@ static int run(struct mddev *mddev) rdev_for_each(rdev, mddev) { long long diff; + + if (test_bit(Journal, &rdev->flags)) { + journal_dev = rdev; + continue; + } if (rdev->raid_disk < 0) continue; diff = (rdev->new_data_offset - rdev->data_offset); @@ -6670,6 +6733,14 @@ static int run(struct mddev *mddev) sector_t here_new, here_old; int old_disks; int max_degraded = (mddev->level == 6 ? 
2 : 1); + int chunk_sectors; + int new_data_disks; + + if (journal_dev) { + printk(KERN_ERR "md/raid:%s: don't support reshape with journal - aborting.\n", + mdname(mddev)); + return -EINVAL; + } if (mddev->new_level != mddev->level) { printk(KERN_ERR "md/raid:%s: unsupported reshape " @@ -6681,28 +6752,25 @@ static int run(struct mddev *mddev) /* reshape_position must be on a new-stripe boundary, and one * further up in new geometry must map after here in old * geometry. + * If the chunk sizes are different, then as we perform reshape + * in units of the largest of the two, reshape_position needs + * be a multiple of the largest chunk size times new data disks. */ here_new = mddev->reshape_position; - if (sector_div(here_new, mddev->new_chunk_sectors * - (mddev->raid_disks - max_degraded))) { + chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); + new_data_disks = mddev->raid_disks - max_degraded; + if (sector_div(here_new, chunk_sectors * new_data_disks)) { printk(KERN_ERR "md/raid:%s: reshape_position not " "on a stripe boundary\n", mdname(mddev)); return -EINVAL; } - reshape_offset = here_new * mddev->new_chunk_sectors; + reshape_offset = here_new * chunk_sectors; /* here_new is the stripe we will write to */ here_old = mddev->reshape_position; - sector_div(here_old, mddev->chunk_sectors * - (old_disks-max_degraded)); + sector_div(here_old, chunk_sectors * (old_disks-max_degraded)); /* here_old is the first stripe that we might need to read * from */ if (mddev->delta_disks == 0) { - if ((here_new * mddev->new_chunk_sectors != - here_old * mddev->chunk_sectors)) { - printk(KERN_ERR "md/raid:%s: reshape position is" - " confused - aborting\n", mdname(mddev)); - return -EINVAL; - } /* We cannot be sure it is safe to start an in-place * reshape. It is only safe if user-space is monitoring * and taking constant backups. @@ -6721,10 +6789,10 @@ static int run(struct mddev *mddev) return -EINVAL; } } else if (mddev->reshape_backwards - ? (here_new * mddev->new_chunk_sectors + min_offset_diff <= - here_old * mddev->chunk_sectors) - : (here_new * mddev->new_chunk_sectors >= - here_old * mddev->chunk_sectors + (-min_offset_diff))) { + ? 
(here_new * chunk_sectors + min_offset_diff <= + here_old * chunk_sectors) + : (here_new * chunk_sectors >= + here_old * chunk_sectors + (-min_offset_diff))) { /* Reading from the same stripe as writing to - bad */ printk(KERN_ERR "md/raid:%s: reshape_position too early for " "auto-recovery - aborting.\n", @@ -6749,6 +6817,13 @@ static int run(struct mddev *mddev) if (IS_ERR(conf)) return PTR_ERR(conf); + if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !journal_dev) { + printk(KERN_ERR "md/raid:%s: journal disk is missing, force array readonly\n", + mdname(mddev)); + mddev->ro = 1; + set_disk_ro(mddev->gendisk, 1); + } + conf->min_offset_diff = min_offset_diff; mddev->thread = conf->thread; conf->thread = NULL; @@ -6952,6 +7027,14 @@ static int run(struct mddev *mddev) mddev->queue); } + if (journal_dev) { + char b[BDEVNAME_SIZE]; + + printk(KERN_INFO"md/raid:%s: using device %s as journal\n", + mdname(mddev), bdevname(journal_dev->bdev, b)); + r5l_init_log(conf, journal_dev); + } + return 0; abort: md_unregister_thread(&mddev->thread); @@ -6976,7 +7059,7 @@ static void status(struct seq_file *seq, struct mddev *mddev) int i; seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, - mddev->chunk_sectors / 2, mddev->layout); + conf->chunk_sectors / 2, mddev->layout); seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); for (i = 0; i < conf->raid_disks; i++) seq_printf (seq, "%s", @@ -7061,6 +7144,15 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct disk_info *p = conf->disks + number; print_raid5_conf(conf); + if (test_bit(Journal, &rdev->flags)) { + /* + * journal disk is not removable, but we need give a chance to + * update superblock of other disks. Otherwise journal disk + * will be considered as 'fresh' + */ + set_bit(MD_CHANGE_DEVS, &mddev->flags); + return -EINVAL; + } if (rdev == p->rdev) rdevp = &p->rdev; else if (rdev == p->replacement) @@ -7123,6 +7215,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) int first = 0; int last = conf->raid_disks - 1; + if (test_bit(Journal, &rdev->flags)) + return -EINVAL; if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; @@ -7182,7 +7276,11 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) * worth it. 
*/ sector_t newsize; - sectors &= ~((sector_t)mddev->chunk_sectors - 1); + struct r5conf *conf = mddev->private; + + if (conf->log) + return -EINVAL; + sectors &= ~((sector_t)conf->chunk_sectors - 1); newsize = raid5_size(mddev, sectors, mddev->raid_disks); if (mddev->external_size && mddev->array_sectors > newsize) @@ -7233,6 +7331,8 @@ static int check_reshape(struct mddev *mddev) { struct r5conf *conf = mddev->private; + if (conf->log) + return -EINVAL; if (mddev->delta_disks == 0 && mddev->new_layout == mddev->layout && mddev->new_chunk_sectors == mddev->chunk_sectors) @@ -7421,6 +7521,7 @@ static void end_reshape(struct r5conf *conf) rdev->data_offset = rdev->new_data_offset; smp_wmb(); conf->reshape_progress = MaxSector; + conf->mddev->reshape_position = MaxSector; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); @@ -7489,7 +7590,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) * active stripes can drain */ conf->quiesce = 2; - wait_event_cmd(conf->wait_for_stripe, + wait_event_cmd(conf->wait_for_quiescent, atomic_read(&conf->active_stripes) == 0 && atomic_read(&conf->active_aligned_reads) == 0, unlock_all_device_hash_locks_irq(conf), @@ -7503,11 +7604,12 @@ static void raid5_quiesce(struct mddev *mddev, int state) case 0: /* re-enable writes */ lock_all_device_hash_locks_irq(conf); conf->quiesce = 0; - wake_up(&conf->wait_for_stripe); + wake_up(&conf->wait_for_quiescent); wake_up(&conf->wait_for_overlap); unlock_all_device_hash_locks_irq(conf); break; } + r5l_quiesce(conf->log, state); } static void *raid45_takeover_raid0(struct mddev *mddev, int level) @@ -7766,7 +7868,6 @@ static struct md_personality raid6_personality = .quiesce = raid5_quiesce, .takeover = raid6_takeover, .congested = raid5_congested, - .mergeable_bvec = raid5_mergeable_bvec, }; static struct md_personality raid5_personality = { @@ -7790,7 +7891,6 @@ static struct md_personality raid5_personality = .quiesce = raid5_quiesce, .takeover = raid5_takeover, .congested = raid5_congested, - .mergeable_bvec = raid5_mergeable_bvec, }; static struct md_personality raid4_personality = @@ -7815,7 +7915,6 @@ static struct md_personality raid4_personality = .quiesce = raid5_quiesce, .takeover = raid4_takeover, .congested = raid5_congested, - .mergeable_bvec = raid5_mergeable_bvec, }; static int __init raid5_init(void)
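
The chunk_aligned_read()/raid5_read_one_chunk() path introduced in this diff splits a read bio at the chunk boundary itself instead of relying on the removed raid5_mergeable_bvec() and bio_fits_rdev() helpers. Below is a minimal user-space sketch of the boundary arithmetic that splitting depends on; it is not part of the patch, the sector and length values are hypothetical, and chunk_sects is assumed to be a power of two (which the `sector & (chunk_sects - 1)` masking requires).

#include <stdio.h>

typedef unsigned long long sector_t;

/* Sectors remaining from 'sector' up to the next chunk boundary;
 * this is the split size used when a request crosses a chunk. */
static unsigned int sectors_to_chunk_end(sector_t sector, unsigned int chunk_sects)
{
	return chunk_sects - (sector & (chunk_sects - 1));
}

/* Same test as in_chunk_boundary(): does the whole request sit in one chunk? */
static int fits_in_one_chunk(sector_t sector, unsigned int bio_sectors,
			     unsigned int chunk_sects)
{
	return chunk_sects >= ((sector & (chunk_sects - 1)) + bio_sectors);
}

int main(void)
{
	unsigned int chunk_sects = 128;		/* 64 KiB chunk, 512-byte sectors */
	sector_t sector = 1000;			/* hypothetical start sector */
	unsigned int bio_sectors = 64;		/* hypothetical request length */

	printf("sectors to chunk end: %u\n",
	       sectors_to_chunk_end(sector, chunk_sects));
	printf("fits in one chunk:    %s\n",
	       fits_in_one_chunk(sector, bio_sectors, chunk_sects) ? "yes" : "no");
	return 0;
}

With these sample values the request straddles a boundary (offset 104 within a 128-sector chunk, length 64), so the splitting loop in chunk_aligned_read() would carve off the first 24 sectors with bio_split() and resubmit the chained remainder.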