* call has finished, the bio has been linked into some internal structure
* and so is visible to ->quiesce(), so we don't need the refcount any more.
*/
-static void md_make_request(struct request_queue *q, struct bio *bio)
+static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
{
const int rw = bio_data_dir(bio);
struct mddev *mddev = q->queuedata;
unsigned int sectors;
int cpu;
+ blk_queue_split(q, &bio, q->bio_split);
+
if (mddev == NULL || mddev->pers == NULL
|| !mddev->ready) {
bio_io_error(bio);
- return;
+ return BLK_QC_T_NONE;
}
if (mddev->ro == 1 && unlikely(rw == WRITE)) {
- bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
- return;
+ if (bio_sectors(bio) != 0)
+ bio->bi_error = -EROFS;
+ bio_endio(bio);
+ return BLK_QC_T_NONE;
}
smp_rmb(); /* Ensure implications of 'active' are visible */
rcu_read_lock();
* go away inside make_request
*/
sectors = bio_sectors(bio);
+ /* bio could be mergeable after passing to underlayer */
+ bio->bi_rw &= ~REQ_NOMERGE;
mddev->pers->make_request(mddev, bio);
cpu = part_stat_lock();
if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
wake_up(&mddev->sb_wait);
+
+ return BLK_QC_T_NONE;
}
/* mddev_suspend makes sure no new requests are submitted
*/
void mddev_suspend(struct mddev *mddev)
{
- BUG_ON(mddev->suspended);
- mddev->suspended = 1;
+ if (mddev->suspended++)
+ return;
synchronize_rcu();
wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
mddev->pers->quiesce(mddev, 1);
void mddev_resume(struct mddev *mddev)
{
- mddev->suspended = 0;
+ if (--mddev->suspended)
+ return;
wake_up(&mddev->sb_wait);
mddev->pers->quiesce(mddev, 0);
return mddev_congested(mddev, bits);
}
-static int md_mergeable_bvec(struct request_queue *q,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec)
-{
- struct mddev *mddev = q->queuedata;
- int ret;
- rcu_read_lock();
- if (mddev->suspended) {
- /* Must always allow one vec */
- if (bvm->bi_size == 0)
- ret = biovec->bv_len;
- else
- ret = 0;
- } else {
- struct md_personality *pers = mddev->pers;
- if (pers && pers->mergeable_bvec)
- ret = pers->mergeable_bvec(mddev, bvm, biovec);
- else
- ret = biovec->bv_len;
- }
- rcu_read_unlock();
- return ret;
-}
/*
* Generic flush handling for md
*/
-static void md_end_flush(struct bio *bio, int err)
+static void md_end_flush(struct bio *bio)
{
struct md_rdev *rdev = bio->bi_private;
struct mddev *mddev = rdev->mddev;
if (bio->bi_iter.bi_size == 0)
/* an empty barrier - all done */
- bio_endio(bio, 0);
+ bio_endio(bio);
else {
bio->bi_rw &= ~REQ_FLUSH;
mddev->pers->make_request(mddev, bio);
bioset_free(bs);
}
+static void md_safemode_timeout(unsigned long data);
+
void mddev_init(struct mddev *mddev)
{
mutex_init(&mddev->open_mutex);
mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks);
INIT_LIST_HEAD(&mddev->all_mddevs);
- init_timer(&mddev->safemode_timer);
+ setup_timer(&mddev->safemode_timer, md_safemode_timeout,
+ (unsigned long) mddev);
atomic_set(&mddev->active, 1);
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->active_io, 0);
}
EXPORT_SYMBOL_GPL(md_rdev_clear);
-static void super_written(struct bio *bio, int error)
+static void super_written(struct bio *bio)
{
struct md_rdev *rdev = bio->bi_private;
struct mddev *mddev = rdev->mddev;
- if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
- printk("md: super_written gets error=%d, uptodate=%d\n",
- error, test_bit(BIO_UPTODATE, &bio->bi_flags));
- WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
+ if (bio->bi_error) {
+ printk("md: super_written gets error=%d\n", bio->bi_error);
md_error(mddev, rdev);
}
bio_add_page(bio, page, size, 0);
submit_bio_wait(rw, bio);
- ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ ret = !bio->bi_error;
bio_put(bio);
return ret;
}
++ev1;
if (rdev->desc_nr >= 0 &&
rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
- le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
+ (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
+ le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
if (ev1 < mddev->events)
return -EINVAL;
} else if (mddev->bitmap) {
int role;
if (rdev->desc_nr < 0 ||
rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
- role = 0xffff;
+ role = MD_DISK_ROLE_SPARE;
rdev->desc_nr = -1;
} else
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
switch(role) {
- case 0xffff: /* spare */
+ case MD_DISK_ROLE_SPARE: /* spare */
break;
- case 0xfffe: /* faulty */
+ case MD_DISK_ROLE_FAULTY: /* faulty */
set_bit(Faulty, &rdev->flags);
break;
+ case MD_DISK_ROLE_JOURNAL: /* journal device */
+ if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
+ /* journal device without journal feature */
+ printk(KERN_WARNING
+ "md: journal device provided without journal feature, ignoring the device\n");
+ return -EINVAL;
+ }
+ set_bit(Journal, &rdev->flags);
+ rdev->journal_tail = le64_to_cpu(sb->journal_tail);
+ if (mddev->recovery_cp == MaxSector)
+ set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
+ rdev->raid_disk = 0;
+ break;
default:
rdev->saved_raid_disk = role;
if ((le32_to_cpu(sb->feature_map) &
set_bit(WriteMostly, &rdev->flags);
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
set_bit(Replacement, &rdev->flags);
+ if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
+ set_bit(MD_HAS_JOURNAL, &mddev->flags);
} else /* MULTIPATH are always insync */
set_bit(In_sync, &rdev->flags);
sb->events = cpu_to_le64(mddev->events);
if (mddev->in_sync)
sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
+ else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
+ sb->resync_offset = cpu_to_le64(MaxSector);
else
sb->resync_offset = cpu_to_le64(0);
sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
}
- if (rdev->raid_disk >= 0 &&
+ if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags)) {
sb->feature_map |=
cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
sb->feature_map |=
cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
}
+ /* Note: recovery_offset and journal_tail share space */
+ if (test_bit(Journal, &rdev->flags))
+ sb->journal_tail = cpu_to_le64(rdev->journal_tail);
if (test_bit(Replacement, &rdev->flags))
sb->feature_map |=
cpu_to_le32(MD_FEATURE_REPLACEMENT);
}
}
+ if (mddev_is_clustered(mddev))
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
+
if (rdev->badblocks.count == 0)
/* Nothing to do for bad blocks*/ ;
else if (sb->bblog_offset == 0)
max_dev = le32_to_cpu(sb->max_dev);
for (i=0; i<max_dev;i++)
- sb->dev_roles[i] = cpu_to_le16(0xfffe);
+ sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
+
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
rdev_for_each(rdev2, mddev) {
i = rdev2->desc_nr;
if (test_bit(Faulty, &rdev2->flags))
- sb->dev_roles[i] = cpu_to_le16(0xfffe);
+ sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
else if (test_bit(In_sync, &rdev2->flags))
sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+ else if (test_bit(Journal, &rdev2->flags))
+ sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
else if (rdev2->raid_disk >= 0)
sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
else
- sb->dev_roles[i] = cpu_to_le16(0xffff);
+ sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
}
sb->sb_csum = calc_sb_1_csum(sb);
struct md_rdev *rdev, *rdev2;
rcu_read_lock();
- rdev_for_each_rcu(rdev, mddev1)
- rdev_for_each_rcu(rdev2, mddev2)
+ rdev_for_each_rcu(rdev, mddev1) {
+ if (test_bit(Faulty, &rdev->flags) ||
+ test_bit(Journal, &rdev->flags) ||
+ rdev->raid_disk == -1)
+ continue;
+ rdev_for_each_rcu(rdev2, mddev2) {
+ if (test_bit(Faulty, &rdev2->flags) ||
+ test_bit(Journal, &rdev2->flags) ||
+ rdev2->raid_disk == -1)
+ continue;
if (rdev->bdev->bd_contains ==
rdev2->bdev->bd_contains) {
rcu_read_unlock();
return 1;
}
+ }
+ }
rcu_read_unlock();
return 0;
}
* All component devices are integrity capable and have matching
* profiles, register the common profile for the md device.
*/
- if (blk_integrity_register(mddev->gendisk,
- bdev_get_integrity(reference->bdev)) != 0) {
- printk(KERN_ERR "md: failed to register integrity for %s\n",
- mdname(mddev));
- return -EINVAL;
- }
+ blk_integrity_register(mddev->gendisk,
+ bdev_get_integrity(reference->bdev));
+
printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
printk(KERN_ERR "md: failed to create integrity pool for %s\n",
}
EXPORT_SYMBOL(md_integrity_register);
-/* Disable data integrity if non-capable/non-matching disk is being added */
-void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
+/*
+ * Attempt to add an rdev, but only if it is consistent with the current
+ * integrity profile
+ */
+int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
struct blk_integrity *bi_rdev;
struct blk_integrity *bi_mddev;
+ char name[BDEVNAME_SIZE];
if (!mddev->gendisk)
- return;
+ return 0;
bi_rdev = bdev_get_integrity(rdev->bdev);
bi_mddev = blk_get_integrity(mddev->gendisk);
if (!bi_mddev) /* nothing to do */
- return;
- if (rdev->raid_disk < 0) /* skip spares */
- return;
- if (bi_rdev && blk_integrity_compare(mddev->gendisk,
- rdev->bdev->bd_disk) >= 0)
- return;
- printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
- blk_integrity_unregister(mddev->gendisk);
+ return 0;
+
+ if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
+ printk(KERN_NOTICE "%s: incompatible integrity profile for %s\n",
+ mdname(mddev), bdevname(rdev->bdev, name));
+ return -ENXIO;
+ }
+
+ return 0;
}
EXPORT_SYMBOL(md_integrity_add_rdev);
{
char b[BDEVNAME_SIZE];
struct kobject *ko;
- char *s;
int err;
/* prevent duplicates */
return -EBUSY;
}
bdevname(rdev->bdev,b);
- while ( (s=strchr(b, '/')) != NULL)
- *s = '!';
+ strreplace(b, '/', '!');
rdev->mddev = mddev;
printk(KERN_INFO "md: bind<%s>\n", b);
}
}
+static bool does_sb_need_changing(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+ struct mdp_superblock_1 *sb;
+ int role;
+
+ /* Find a good rdev */
+ rdev_for_each(rdev, mddev)
+ if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
+ break;
+
+ /* No good device found. */
+ if (!rdev)
+ return false;
+
+ sb = page_address(rdev->sb_page);
+ /* Check if a device has become faulty or a spare become active */
+ rdev_for_each(rdev, mddev) {
+ role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+ /* Device activated? */
+ if (role == 0xffff && rdev->raid_disk >=0 &&
+ !test_bit(Faulty, &rdev->flags))
+ return true;
+ /* Device turned faulty? */
+ if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
+ return true;
+ }
+
+ /* Check if any mddev parameters have changed */
+ if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
+ (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
+ (mddev->layout != le64_to_cpu(sb->layout)) ||
+ (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
+ (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
+ return true;
+
+ return false;
+}
+
void md_update_sb(struct mddev *mddev, int force_change)
{
struct md_rdev *rdev;
int sync_req;
int nospares = 0;
int any_badblocks_changed = 0;
+ int ret = -1;
if (mddev->ro) {
if (force_change)
set_bit(MD_CHANGE_DEVS, &mddev->flags);
return;
}
+
+ if (mddev_is_clustered(mddev)) {
+ if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
+ force_change = 1;
+ ret = md_cluster_ops->metadata_update_start(mddev);
+ /* Has someone else has updated the sb */
+ if (!does_sb_need_changing(mddev)) {
+ if (ret == 0)
+ md_cluster_ops->metadata_update_cancel(mddev);
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+ return;
+ }
+ }
repeat:
/* First make sure individual recovery_offsets are correct */
rdev_for_each(rdev, mddev) {
if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
mddev->curr_resync_completed > rdev->recovery_offset)
rdev->recovery_offset = mddev->curr_resync_completed;
clear_bit(BlockedBadBlocks, &rdev->flags);
wake_up(&rdev->blocked_wait);
}
+
+ if (mddev_is_clustered(mddev) && ret == 0)
+ md_cluster_ops->metadata_update_finish(mddev);
}
EXPORT_SYMBOL(md_update_sb);
len += sprintf(page+len, "%sin_sync",sep);
sep = ",";
}
+ if (test_bit(Journal, &flags)) {
+ len += sprintf(page+len, "%sjournal",sep);
+ sep = ",";
+ }
if (test_bit(WriteMostly, &flags)) {
len += sprintf(page+len, "%swrite_mostly",sep);
sep = ",";
sep = ",";
}
if (!test_bit(Faulty, &flags) &&
+ !test_bit(Journal, &flags) &&
!test_bit(In_sync, &flags)) {
len += sprintf(page+len, "%sspare", sep);
sep = ",";
err = -EBUSY;
else {
struct mddev *mddev = rdev->mddev;
- if (mddev_is_clustered(mddev))
- md_cluster_ops->remove_disk(mddev, rdev);
- md_kick_rdev_from_array(rdev);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
- if (mddev->pers)
- md_update_sb(mddev, 1);
- md_new_event(mddev);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
err = 0;
+ if (mddev_is_clustered(mddev))
+ err = md_cluster_ops->remove_disk(mddev, rdev);
+
+ if (err == 0) {
+ md_kick_rdev_from_array(rdev);
+ if (mddev->pers)
+ md_update_sb(mddev, 1);
+ md_new_event(mddev);
+ }
}
} else if (cmd_match(buf, "writemostly")) {
set_bit(WriteMostly, &rdev->flags);
} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
set_bit(In_sync, &rdev->flags);
err = 0;
- } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) {
+ } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
+ !test_bit(Journal, &rdev->flags)) {
if (rdev->mddev->pers == NULL) {
clear_bit(In_sync, &rdev->flags);
rdev->saved_raid_disk = rdev->raid_disk;
* check if recovery is needed.
*/
if (rdev->raid_disk >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
!test_bit(Replacement, &rdev->flags))
set_bit(WantReplacement, &rdev->flags);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
static ssize_t
errors_store(struct md_rdev *rdev, const char *buf, size_t len)
{
- char *e;
- unsigned long n = simple_strtoul(buf, &e, 10);
- if (*buf && (*e == 0 || *e == '\n')) {
- atomic_set(&rdev->corrected_errors, n);
- return len;
- }
- return -EINVAL;
+ unsigned int n;
+ int rv;
+
+ rv = kstrtouint(buf, 10, &n);
+ if (rv < 0)
+ return rv;
+ atomic_set(&rdev->corrected_errors, n);
+ return len;
}
static struct rdev_sysfs_entry rdev_errors =
__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
static ssize_t
slot_show(struct md_rdev *rdev, char *page)
{
- if (rdev->raid_disk < 0)
+ if (test_bit(Journal, &rdev->flags))
+ return sprintf(page, "journal\n");
+ else if (rdev->raid_disk < 0)
return sprintf(page, "none\n");
else
return sprintf(page, "%d\n", rdev->raid_disk);
static ssize_t
slot_store(struct md_rdev *rdev, const char *buf, size_t len)
{
- char *e;
+ int slot;
int err;
- int slot = simple_strtoul(buf, &e, 10);
+
+ if (test_bit(Journal, &rdev->flags))
+ return -EBUSY;
if (strncmp(buf, "none", 4)==0)
slot = -1;
- else if (e==buf || (*e && *e!= '\n'))
- return -EINVAL;
+ else {
+ err = kstrtouint(buf, 10, (unsigned int *)&slot);
+ if (err < 0)
+ return err;
+ }
if (rdev->mddev->pers && slot == -1) {
/* Setting 'slot' on an active array requires also
* updating the 'rd%d' link, and communicating
/* Activating a spare .. or possibly reactivating
* if we ever get bitmaps working here.
*/
+ int err;
if (rdev->raid_disk != -1)
return -EBUSY;
sector_t oldsectors = rdev->sectors;
sector_t sectors;
+ if (test_bit(Journal, &rdev->flags))
+ return -EBUSY;
if (strict_blocks_to_sectors(buf, §ors) < 0)
return -EINVAL;
if (rdev->data_offset != rdev->new_data_offset)
md_kick_rdev_from_array(rdev);
continue;
}
- /* No device should have a Candidate flag
- * when reading devices
- */
- if (test_bit(Candidate, &rdev->flags)) {
- pr_info("md: kicking Cluster Candidate %s from array!\n",
- bdevname(rdev->bdev, b));
- md_kick_rdev_from_array(rdev);
- }
}
if (mddev->level == LEVEL_MULTIPATH) {
rdev->desc_nr = i++;
rdev->raid_disk = rdev->desc_nr;
set_bit(In_sync, &rdev->flags);
- } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
+ } else if (rdev->raid_disk >=
+ (mddev->raid_disks - min(0, mddev->delta_disks)) &&
+ !test_bit(Journal, &rdev->flags)) {
rdev->raid_disk = -1;
clear_bit(In_sync, &rdev->flags);
}
return 0;
}
-static void md_safemode_timeout(unsigned long data);
-
static ssize_t
safe_delay_show(struct mddev *mddev, char *page)
{
{
unsigned long msec;
+ if (mddev_is_clustered(mddev)) {
+ pr_info("md: Safemode is disabled for clustered mode\n");
+ return -EINVAL;
+ }
+
if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
return -EINVAL;
if (msec == 0)
static ssize_t
layout_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
- unsigned long n = simple_strtoul(buf, &e, 10);
+ unsigned int n;
int err;
- if (!*buf || (*e && *e != '\n'))
- return -EINVAL;
+ err = kstrtouint(buf, 10, &n);
+ if (err < 0)
+ return err;
err = mddev_lock(mddev);
if (err)
return err;
static ssize_t
raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
+ unsigned int n;
int err;
- unsigned long n = simple_strtoul(buf, &e, 10);
- if (!*buf || (*e && *e != '\n'))
- return -EINVAL;
+ err = kstrtouint(buf, 10, &n);
+ if (err < 0)
+ return err;
err = mddev_lock(mddev);
if (err)
static ssize_t
chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
{
+ unsigned long n;
int err;
- char *e;
- unsigned long n = simple_strtoul(buf, &e, 10);
- if (!*buf || (*e && *e != '\n'))
- return -EINVAL;
+ err = kstrtoul(buf, 10, &n);
+ if (err < 0)
+ return err;
err = mddev_lock(mddev);
if (err)
static ssize_t
resync_start_store(struct mddev *mddev, const char *buf, size_t len)
{
+ unsigned long long n;
int err;
- char *e;
- unsigned long long n = simple_strtoull(buf, &e, 10);
+
+ if (cmd_match(buf, "none"))
+ n = MaxSector;
+ else {
+ err = kstrtoull(buf, 10, &n);
+ if (err < 0)
+ return err;
+ if (n != (sector_t)n)
+ return -EINVAL;
+ }
err = mddev_lock(mddev);
if (err)
return err;
if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
err = -EBUSY;
- else if (cmd_match(buf, "none"))
- n = MaxSector;
- else if (!*buf || (*e && *e != '\n'))
- err = -EINVAL;
if (!err) {
mddev->recovery_cp = n;
break;
case clean:
if (mddev->pers) {
- restart_array(mddev);
+ err = restart_array(mddev);
+ if (err)
+ break;
spin_lock(&mddev->lock);
if (atomic_read(&mddev->writes_pending) == 0) {
if (mddev->in_sync == 0) {
break;
case active:
if (mddev->pers) {
- restart_array(mddev);
+ err = restart_array(mddev);
+ if (err)
+ break;
clear_bit(MD_CHANGE_PENDING, &mddev->flags);
wake_up(&mddev->sb_wait);
err = 0;
static ssize_t
max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
- unsigned long n = simple_strtoul(buf, &e, 10);
+ unsigned int n;
+ int rv;
- if (*buf && (*e == 0 || *e == '\n')) {
- atomic_set(&mddev->max_corr_read_errors, n);
- return len;
- }
- return -EINVAL;
+ rv = kstrtouint(buf, 10, &n);
+ if (rv < 0)
+ return rv;
+ atomic_set(&mddev->max_corr_read_errors, n);
+ return len;
}
static struct md_sysfs_entry max_corr_read_errors =
if (err)
return err;
if (mddev->pers) {
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
err = update_size(mddev, sectors);
md_update_sb(mddev, 1);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
} else {
if (mddev->dev_sectors == 0 ||
mddev->dev_sectors > sectors)
type = "repair";
} else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
type = "recover";
+ else if (mddev->reshape_position != MaxSector)
+ type = "reshape";
}
return sprintf(page, "%s\n", type);
}
}
mddev_unlock(mddev);
}
- } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
- test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
+ } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
else if (cmd_match(page, "resync"))
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
return -EINVAL;
err = mddev_lock(mddev);
if (!err) {
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- err = mddev->pers->start_reshape(mddev);
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ err = -EBUSY;
+ else {
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ err = mddev->pers->start_reshape(mddev);
+ }
mddev_unlock(mddev);
}
if (err)
static ssize_t
sync_min_store(struct mddev *mddev, const char *buf, size_t len)
{
- int min;
- char *e;
+ unsigned int min;
+ int rv;
+
if (strncmp(buf, "system", 6)==0) {
- mddev->sync_speed_min = 0;
- return len;
+ min = 0;
+ } else {
+ rv = kstrtouint(buf, 10, &min);
+ if (rv < 0)
+ return rv;
+ if (min == 0)
+ return -EINVAL;
}
- min = simple_strtoul(buf, &e, 10);
- if (buf == e || (*e && *e != '\n') || min <= 0)
- return -EINVAL;
mddev->sync_speed_min = min;
return len;
}
static ssize_t
sync_max_store(struct mddev *mddev, const char *buf, size_t len)
{
- int max;
- char *e;
+ unsigned int max;
+ int rv;
+
if (strncmp(buf, "system", 6)==0) {
- mddev->sync_speed_max = 0;
- return len;
+ max = 0;
+ } else {
+ rv = kstrtouint(buf, 10, &max);
+ if (rv < 0)
+ return rv;
+ if (max == 0)
+ return -EINVAL;
}
- max = simple_strtoul(buf, &e, 10);
- if (buf == e || (*e && *e != '\n') || max <= 0)
- return -EINVAL;
mddev->sync_speed_max = max;
return len;
}
static ssize_t
suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
- unsigned long long new = simple_strtoull(buf, &e, 10);
- unsigned long long old;
+ unsigned long long old, new;
int err;
- if (buf == e || (*e && *e != '\n'))
+ err = kstrtoull(buf, 10, &new);
+ if (err < 0)
+ return err;
+ if (new != (sector_t)new)
return -EINVAL;
err = mddev_lock(mddev);
static ssize_t
suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
- unsigned long long new = simple_strtoull(buf, &e, 10);
- unsigned long long old;
+ unsigned long long old, new;
int err;
- if (buf == e || (*e && *e != '\n'))
+ err = kstrtoull(buf, 10, &new);
+ if (err < 0)
+ return err;
+ if (new != (sector_t)new)
return -EINVAL;
err = mddev_lock(mddev);
reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
{
struct md_rdev *rdev;
- char *e;
+ unsigned long long new;
int err;
- unsigned long long new = simple_strtoull(buf, &e, 10);
- if (buf == e || (*e && *e != '\n'))
+ err = kstrtoull(buf, 10, &new);
+ if (err < 0)
+ return err;
+ if (new != (sector_t)new)
return -EINVAL;
err = mddev_lock(mddev);
if (err)
if (mddev->queue) {
mddev->queue->backing_dev_info.congested_data = mddev;
mddev->queue->backing_dev_info.congested_fn = md_congested;
- blk_queue_merge_bvec(mddev->queue, md_mergeable_bvec);
}
if (pers->sync_request) {
if (mddev->kobj.sd &&
atomic_set(&mddev->max_corr_read_errors,
MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
mddev->safemode = 0;
- mddev->safemode_timer.function = md_safemode_timeout;
- mddev->safemode_timer.data = (unsigned long) mddev;
- mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
+ if (mddev_is_clustered(mddev))
+ mddev->safemode_delay = 0;
+ else
+ mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
mddev->in_sync = 1;
smp_wmb();
spin_lock(&mddev->lock);
if (sysfs_link_rdev(mddev, rdev))
/* failure here is OK */;
+ if (mddev->degraded && !mddev->ro)
+ /* This ensures that recovering status is reported immediately
+ * via sysfs - until a lack of spares is confirmed.
+ */
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
if (mddev->flags & MD_UPDATE_SB_FLAGS)
goto out;
}
+ if (mddev_is_clustered(mddev))
+ md_allow_write(mddev);
+
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
return -EINVAL;
if (!mddev->ro)
return -EBUSY;
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
+ struct md_rdev *rdev;
+ bool has_journal = false;
+
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
+ if (test_bit(Journal, &rdev->flags) &&
+ !test_bit(Faulty, &rdev->flags)) {
+ has_journal = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ /* Don't restart rw with journal missing/faulty */
+ if (!has_journal)
+ return -EINVAL;
+ }
+
mddev->safemode = 0;
mddev->ro = 0;
set_disk_ro(disk, 0);
mddev->degraded = 0;
mddev->safemode = 0;
mddev->private = NULL;
- mddev->merge_check_needed = 0;
mddev->bitmap_info.offset = 0;
mddev->bitmap_info.default_offset = 0;
mddev->bitmap_info.default_space = 0;
static void __md_stop_writes(struct mddev *mddev)
{
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
md_super_wait(mddev);
if (mddev->ro == 0 &&
- (!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) {
+ ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
+ (mddev->flags & MD_UPDATE_SB_FLAGS))) {
/* mark array as shutdown cleanly */
- mddev->in_sync = 1;
+ if (!mddev_is_clustered(mddev))
+ mddev->in_sync = 1;
md_update_sb(mddev, 1);
}
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
}
void md_stop_writes(struct mddev *mddev)
* which will now never happen */
wake_up_process(mddev->sync_thread->tsk);
+ if (mddev->external && test_bit(MD_CHANGE_PENDING, &mddev->flags))
+ return -EBUSY;
mddev_unlock(mddev);
wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
&mddev->recovery));
+ wait_event(mddev->sb_wait,
+ !test_bit(MD_CHANGE_PENDING, &mddev->flags));
mddev_lock_nointr(mddev);
mutex_lock(&mddev->open_mutex);
__md_stop_writes(mddev);
__md_stop(mddev);
- mddev->queue->merge_bvec_fn = NULL;
mddev->queue->backing_dev_info.congested_fn = NULL;
/* tell userspace to handle 'inactive' */
if (mddev->hold_active == UNTIL_STOP)
mddev->hold_active = 0;
}
- blk_integrity_unregister(disk);
md_new_event(mddev);
sysfs_notify_dirent_safe(mddev->sysfs_state);
return 0;
err = 0;
spin_lock(&mddev->lock);
- /* bitmap disabled, zero the first byte and copy out */
- if (!mddev->bitmap_info.file)
- file->pathname[0] = '\0';
- else if ((ptr = d_path(&mddev->bitmap_info.file->f_path,
- file->pathname, sizeof(file->pathname))),
- IS_ERR(ptr))
- err = PTR_ERR(ptr);
- else
- memmove(file->pathname, ptr,
- sizeof(file->pathname)-(ptr-file->pathname));
+ /* bitmap enabled */
+ if (mddev->bitmap_info.file) {
+ ptr = file_path(mddev->bitmap_info.file, file->pathname,
+ sizeof(file->pathname));
+ if (IS_ERR(ptr))
+ err = PTR_ERR(ptr);
+ else
+ memmove(file->pathname, ptr,
+ sizeof(file->pathname)-(ptr-file->pathname));
+ }
spin_unlock(&mddev->lock);
if (err == 0 &&
info.state |= (1<<MD_DISK_ACTIVE);
info.state |= (1<<MD_DISK_SYNC);
}
+ if (test_bit(Journal, &rdev->flags))
+ info.state |= (1<<MD_DISK_JOURNAL);
if (test_bit(WriteMostly, &rdev->flags))
info.state |= (1<<MD_DISK_WRITEMOSTLY);
} else {
else
clear_bit(WriteMostly, &rdev->flags);
+ if (info->state & (1<<MD_DISK_JOURNAL))
+ set_bit(Journal, &rdev->flags);
/*
* check whether the device shows up in other nodes
*/
if (mddev_is_clustered(mddev)) {
- if (info->state & (1 << MD_DISK_CANDIDATE)) {
- /* Through --cluster-confirm */
+ if (info->state & (1 << MD_DISK_CANDIDATE))
set_bit(Candidate, &rdev->flags);
- err = md_cluster_ops->new_disk_ack(mddev, true);
- if (err) {
- export_rdev(rdev);
- return err;
- }
- } else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
+ else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
/* --add initiated by this node */
- err = md_cluster_ops->add_new_disk_start(mddev, rdev);
+ err = md_cluster_ops->add_new_disk(mddev, rdev);
if (err) {
- md_cluster_ops->add_new_disk_finish(mddev);
export_rdev(rdev);
return err;
}
rdev->raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev);
+
if (err)
export_rdev(rdev);
- else
+
+ if (mddev_is_clustered(mddev)) {
+ if (info->state & (1 << MD_DISK_CANDIDATE))
+ md_cluster_ops->new_disk_ack(mddev, (err == 0));
+ else {
+ if (err)
+ md_cluster_ops->add_new_disk_cancel(mddev);
+ else
+ err = add_bound_rdev(rdev);
+ }
+
+ } else if (!err)
err = add_bound_rdev(rdev);
- if (mddev_is_clustered(mddev) &&
- (info->state & (1 << MD_DISK_CLUSTER_ADD)))
- md_cluster_ops->add_new_disk_finish(mddev);
+
return err;
}
{
char b[BDEVNAME_SIZE];
struct md_rdev *rdev;
+ int ret = -1;
rdev = find_rdev(mddev, dev);
if (!rdev)
return -ENXIO;
if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
+ ret = md_cluster_ops->metadata_update_start(mddev);
+
+ if (rdev->raid_disk < 0)
+ goto kick_rdev;
clear_bit(Blocked, &rdev->flags);
remove_and_add_spares(mddev, rdev);
if (rdev->raid_disk >= 0)
goto busy;
- if (mddev_is_clustered(mddev))
+kick_rdev:
+ if (mddev_is_clustered(mddev) && ret == 0)
md_cluster_ops->remove_disk(mddev, rdev);
md_kick_rdev_from_array(rdev);
md_update_sb(mddev, 1);
md_new_event(mddev);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
-
return 0;
busy:
- if (mddev_is_clustered(mddev))
+ if (mddev_is_clustered(mddev) && ret == 0)
md_cluster_ops->metadata_update_cancel(mddev);
+
printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
bdevname(rdev->bdev,b), mdname(mddev));
return -EBUSY;
goto abort_export;
}
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
clear_bit(In_sync, &rdev->flags);
rdev->desc_nr = -1;
rdev->saved_raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev);
if (err)
- goto abort_clustered;
+ goto abort_export;
/*
* The rest should better be atomic, we can have disk failures
rdev->raid_disk = -1;
md_update_sb(mddev, 1);
-
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
/*
* Kick recovery, maybe this spare has to be added to the
* array immediately.
md_new_event(mddev);
return 0;
-abort_clustered:
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_cancel(mddev);
abort_export:
export_rdev(rdev);
return err;
return rv;
}
}
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
rv = update_size(mddev, (sector_t)info->size * 2);
}
}
md_update_sb(mddev, 1);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
return rv;
err:
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_cancel(mddev);
return rv;
}
/* need to ensure recovery thread has run */
wait_event_interruptible_timeout(mddev->sb_wait,
!test_bit(MD_RECOVERY_NEEDED,
- &mddev->flags),
+ &mddev->recovery),
msecs_to_jiffies(5000));
if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
/* Need to flush page cache, and ensure no-one else opens
seq_printf(seq, "\n");
}
-static void status_resync(struct seq_file *seq, struct mddev *mddev)
+static int status_resync(struct seq_file *seq, struct mddev *mddev)
{
sector_t max_sectors, resync, res;
unsigned long dt, db;
int scale;
unsigned int per_milli;
- if (mddev->curr_resync <= 3)
- resync = 0;
- else
- resync = mddev->curr_resync
- - atomic_read(&mddev->recovery_active);
-
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sectors = mddev->resync_max_sectors;
else
max_sectors = mddev->dev_sectors;
+ resync = mddev->curr_resync;
+ if (resync <= 3) {
+ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+ /* Still cleaning up */
+ resync = max_sectors;
+ } else
+ resync -= atomic_read(&mddev->recovery_active);
+
+ if (resync == 0) {
+ if (mddev->recovery_cp < MaxSector) {
+ seq_printf(seq, "\tresync=PENDING");
+ return 1;
+ }
+ return 0;
+ }
+ if (resync < 3) {
+ seq_printf(seq, "\tresync=DELAYED");
+ return 1;
+ }
+
WARN_ON(max_sectors == 0);
/* Pick 'scale' such that (resync>>scale)*1000 will fit
* in a sector_t, and (max_sectors>>scale) will fit in a
((unsigned long)rt % 60)/6);
seq_printf(seq, " speed=%ldK/sec", db/2/dt);
+ return 1;
}
static void *md_seq_start(struct seq_file *seq, loff_t *pos)
bdevname(rdev->bdev,b), rdev->desc_nr);
if (test_bit(WriteMostly, &rdev->flags))
seq_printf(seq, "(W)");
+ if (test_bit(Journal, &rdev->flags))
+ seq_printf(seq, "(J)");
if (test_bit(Faulty, &rdev->flags)) {
seq_printf(seq, "(F)");
continue;
mddev->pers->status(seq, mddev);
seq_printf(seq, "\n ");
if (mddev->pers->sync_request) {
- if (mddev->curr_resync > 2) {
- status_resync(seq, mddev);
+ if (status_resync(seq, mddev))
seq_printf(seq, "\n ");
- } else if (mddev->curr_resync >= 1)
- seq_printf(seq, "\tresync=DELAYED\n ");
- else if (mddev->recovery_cp < MaxSector)
- seq_printf(seq, "\tresync=PENDING\n ");
}
} else
seq_printf(seq, "\n ");
}
EXPORT_SYMBOL(unregister_md_personality);
-int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module)
+int register_md_cluster_operations(struct md_cluster_operations *ops,
+ struct module *module)
{
- if (md_cluster_ops != NULL)
- return -EALREADY;
+ int ret = 0;
spin_lock(&pers_lock);
- md_cluster_ops = ops;
- md_cluster_mod = module;
+ if (md_cluster_ops != NULL)
+ ret = -EALREADY;
+ else {
+ md_cluster_ops = ops;
+ md_cluster_mod = module;
+ }
spin_unlock(&pers_lock);
- return 0;
+ return ret;
}
EXPORT_SYMBOL(register_md_cluster_operations);
int md_setup_cluster(struct mddev *mddev, int nodes)
{
- int err;
-
- err = request_module("md-cluster");
- if (err) {
- pr_err("md-cluster module not found.\n");
- return err;
- }
-
+ if (!md_cluster_ops)
+ request_module("md-cluster");
spin_lock(&pers_lock);
+ /* ensure module won't be unloaded */
if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
+ pr_err("can't find md-cluster module or get it's reference.\n");
spin_unlock(&pers_lock);
return -ENOENT;
}
mddev->safemode == 0)
mddev->safemode = 1;
spin_unlock(&mddev->lock);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
md_update_sb(mddev, 0);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
sysfs_notify_dirent_safe(mddev->sysfs_state);
} else
spin_unlock(&mddev->lock);
struct md_rdev *rdev;
char *desc, *action = NULL;
struct blk_plug plug;
+ bool cluster_resync_finished = false;
/* just incase thread restarts... */
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < j)
md_new_event(mddev);
update_time = jiffies;
- if (mddev_is_clustered(mddev))
- md_cluster_ops->resync_start(mddev, j, max_sectors);
-
blk_start_plug(&plug);
while (j < max_sectors) {
sector_t sectors;
> (max_sectors >> 4)) ||
time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
(j - mddev->curr_resync_completed)*2
- >= mddev->resync_max - mddev->curr_resync_completed
+ >= mddev->resync_max - mddev->curr_resync_completed ||
+ mddev->curr_resync_completed > mddev->resync_max
)) {
/* time to update curr_resync_completed */
wait_event(mddev->recovery_wait,
break;
j += sectors;
+ if (j > max_sectors)
+ /* when skipping, extra large numbers can be returned. */
+ j = max_sectors;
if (j > 2)
mddev->curr_resync = j;
- if (mddev_is_clustered(mddev))
- md_cluster_ops->resync_info_update(mddev, j, max_sectors);
mddev->curr_mark_cnt = io_sectors;
if (last_check == 0)
/* this is the earliest that rebuild will be
blk_finish_plug(&plug);
wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
- /* tell personality that we are finished */
- mddev->pers->sync_request(mddev, max_sectors, &skipped);
-
- if (mddev_is_clustered(mddev))
+ if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+ mddev->curr_resync > 2) {
+ mddev->curr_resync_completed = mddev->curr_resync;
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ }
+ /* tell personality and other nodes that we are finished */
+ if (mddev_is_clustered(mddev)) {
md_cluster_ops->resync_finish(mddev);
+ cluster_resync_finished = true;
+ }
+ mddev->pers->sync_request(mddev, max_sectors, &skipped);
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
mddev->curr_resync > 2) {
rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < mddev->curr_resync)
skip:
set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ if (mddev_is_clustered(mddev) &&
+ test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+ !cluster_resync_finished)
+ md_cluster_ops->resync_finish(mddev);
+
spin_lock(&mddev->lock);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* We completed so min/max setting can be forgotten if used. */
mddev->resync_max = MaxSector;
} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
mddev->resync_min = mddev->curr_resync_completed;
+ set_bit(MD_RECOVERY_DONE, &mddev->recovery);
mddev->curr_resync = 0;
spin_unlock(&mddev->lock);
wake_up(&resync_wait);
- set_bit(MD_RECOVERY_DONE, &mddev->recovery);
md_wakeup_thread(mddev->thread);
return;
}
rdev->raid_disk >= 0 &&
!test_bit(Blocked, &rdev->flags) &&
(test_bit(Faulty, &rdev->flags) ||
- ! test_bit(In_sync, &rdev->flags)) &&
+ (!test_bit(In_sync, &rdev->flags) &&
+ !test_bit(Journal, &rdev->flags))) &&
atomic_read(&rdev->nr_pending)==0) {
if (mddev->pers->hot_remove_disk(
mddev, rdev) == 0) {
if (removed && mddev->kobj.sd)
sysfs_notify(&mddev->kobj, NULL, "degraded");
- if (this)
+ if (this && removed)
goto no_add;
rdev_for_each(rdev, mddev) {
+ if (this && this != rdev)
+ continue;
+ if (test_bit(Candidate, &rdev->flags))
+ continue;
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
+ !test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
spares++;
if (rdev->raid_disk >= 0)
continue;
if (test_bit(Faulty, &rdev->flags))
continue;
+ if (test_bit(Journal, &rdev->flags))
+ continue;
if (mddev->ro &&
! (rdev->saved_raid_disk >= 0 &&
!test_bit(Bitmap_sync, &rdev->flags)))
continue;
- if (rdev->saved_raid_disk < 0)
- rdev->recovery_offset = 0;
+ rdev->recovery_offset = 0;
if (mddev->pers->
hot_add_disk(mddev, rdev) == 0) {
if (sysfs_link_rdev(mddev, rdev))
static void md_start_sync(struct work_struct *ws)
{
struct mddev *mddev = container_of(ws, struct mddev, del_work);
+ int ret = 0;
+
+ if (mddev_is_clustered(mddev)) {
+ ret = md_cluster_ops->resync_start(mddev);
+ if (ret) {
+ mddev->sync_thread = NULL;
+ goto out;
+ }
+ }
mddev->sync_thread = md_register_thread(md_do_sync,
mddev,
"resync");
+out:
if (!mddev->sync_thread) {
- printk(KERN_ERR "%s: could not start resync"
- " thread...\n",
- mdname(mddev));
+ if (!(mddev_is_clustered(mddev) && ret == -EAGAIN))
+ printk(KERN_ERR "%s: could not start resync"
+ " thread...\n",
+ mdname(mddev));
/* leave the spares where they are, it shouldn't hurt */
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
int spares = 0;
if (mddev->ro) {
+ struct md_rdev *rdev;
+ if (!mddev->external && mddev->in_sync)
+ /* 'Blocked' flag not needed as failed devices
+ * will be recorded if array switched to read/write.
+ * Leaving it set will prevent the device
+ * from being removed.
+ */
+ rdev_for_each(rdev, mddev)
+ clear_bit(Blocked, &rdev->flags);
/* On a read-only array we can:
* - remove failed devices
* - add already-in_sync devices if the array itself
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
+ clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
goto unlock;
}
sysfs_notify_dirent_safe(mddev->sysfs_state);
}
- if (mddev->flags & MD_UPDATE_SB_FLAGS) {
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
+ if (mddev->flags & MD_UPDATE_SB_FLAGS)
md_update_sb(mddev, 0);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
- }
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
set_bit(MD_CHANGE_DEVS, &mddev->flags);
}
}
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
mddev->pers->finish_reshape)
mddev->pers->finish_reshape(mddev);
rdev->saved_raid_disk = -1;
md_update_sb(mddev, 1);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
/* Make sure they get written out promptly */
sysfs_notify_dirent_safe(rdev->sysfs_state);
set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
md_wakeup_thread(rdev->mddev->thread);
}
return rv;
return ret;
}
-void md_reload_sb(struct mddev *mddev)
+static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
- struct md_rdev *rdev, *tmp;
+ struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
+ struct md_rdev *rdev2;
+ int role, ret;
+ char b[BDEVNAME_SIZE];
- rdev_for_each_safe(rdev, tmp, mddev) {
- rdev->sb_loaded = 0;
- ClearPageUptodate(rdev->sb_page);
+ /* Check for change of roles in the active devices */
+ rdev_for_each(rdev2, mddev) {
+ if (test_bit(Faulty, &rdev2->flags))
+ continue;
+
+ /* Check if the roles changed */
+ role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
+
+ if (test_bit(Candidate, &rdev2->flags)) {
+ if (role == 0xfffe) {
+ pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
+ md_kick_rdev_from_array(rdev2);
+ continue;
+ }
+ else
+ clear_bit(Candidate, &rdev2->flags);
+ }
+
+ if (role != rdev2->raid_disk) {
+ /* got activated */
+ if (rdev2->raid_disk == -1 && role != 0xffff) {
+ rdev2->saved_raid_disk = role;
+ ret = remove_and_add_spares(mddev, rdev2);
+ pr_info("Activated spare: %s\n",
+ bdevname(rdev2->bdev,b));
+ continue;
+ }
+ /* device faulty
+ * We just want to do the minimum to mark the disk
+ * as faulty. The recovery is performed by the
+ * one who initiated the error.
+ */
+ if ((role == 0xfffe) || (role == 0xfffd)) {
+ md_error(mddev, rdev2);
+ clear_bit(Blocked, &rdev2->flags);
+ }
+ }
}
- mddev->raid_disks = 0;
- analyze_sbs(mddev);
- rdev_for_each_safe(rdev, tmp, mddev) {
- struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
- /* since we don't write to faulty devices, we figure out if the
- * disk is faulty by comparing events
- */
- if (mddev->events > sb->events)
- set_bit(Faulty, &rdev->flags);
+
+ if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
+ update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
+
+ /* Finally set the event to be up to date */
+ mddev->events = le64_to_cpu(sb->events);
+}
+
+static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
+{
+ int err;
+ struct page *swapout = rdev->sb_page;
+ struct mdp_superblock_1 *sb;
+
+ /* Store the sb page of the rdev in the swapout temporary
+ * variable in case we err in the future
+ */
+ rdev->sb_page = NULL;
+ alloc_disk_sb(rdev);
+ ClearPageUptodate(rdev->sb_page);
+ rdev->sb_loaded = 0;
+ err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version);
+
+ if (err < 0) {
+ pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
+ __func__, __LINE__, rdev->desc_nr, err);
+ put_page(rdev->sb_page);
+ rdev->sb_page = swapout;
+ rdev->sb_loaded = 1;
+ return err;
+ }
+
+ sb = page_address(rdev->sb_page);
+ /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
+ * is not set
+ */
+
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
+ rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
+
+ /* The other node finished recovery, call spare_active to set
+ * device In_sync and mddev->degraded
+ */
+ if (rdev->recovery_offset == MaxSector &&
+ !test_bit(In_sync, &rdev->flags) &&
+ mddev->pers->spare_active(mddev))
+ sysfs_notify(&mddev->kobj, NULL, "degraded");
+
+ put_page(swapout);
+ return 0;
+}
+
+void md_reload_sb(struct mddev *mddev, int nr)
+{
+ struct md_rdev *rdev;
+ int err;
+
+ /* Find the rdev */
+ rdev_for_each_rcu(rdev, mddev) {
+ if (rdev->desc_nr == nr)
+ break;
}
+ if (!rdev || rdev->desc_nr != nr) {
+ pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
+ return;
+ }
+
+ err = read_rdev(mddev, rdev);
+ if (err < 0)
+ return;
+
+ check_sb_changes(mddev, rdev);
+
+ /* Read all rdev's to update recovery_offset */
+ rdev_for_each_rcu(rdev, mddev)
+ read_rdev(mddev, rdev);
}
EXPORT_SYMBOL(md_reload_sb);
}
static int set_ro(const char *val, struct kernel_param *kp)
{
- char *e;
- int num = simple_strtoul(val, &e, 10);
- if (*val && (*e == '\0' || *e == '\n')) {
- start_readonly = num;
- return 0;
- }
- return -EINVAL;
+ return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}
module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);