DM RAID: Add support for MD RAID10
Neil,
I've changed the tunables to the way we discussed. If it becomes necessary to have the freedom to have simultaneous near and far copies, then I will likely add 'raid10_near|far|offset_copies' to compliment the existing 'raid10_copies' arg. Like you, I doubt they will be necessary though. I have yet to add the code that allows new devices to replace old/failed devices (i.e. handles the 'rebuild' parameter). That will be a future patch. brassow dm raid: add md raid10 support Support the MD RAID10 personality through dm-raid.c Signed-off-by: Jonathan Brassow <jbrassow@redhat.com> Index: linux-upstream/drivers/md/dm-raid.c ================================================== ================= --- linux-upstream.orig/drivers/md/dm-raid.c +++ linux-upstream/drivers/md/dm-raid.c @@ -11,6 +11,7 @@ #include "md.h" #include "raid1.h" #include "raid5.h" +#include "raid10.h" #include "bitmap.h" #include <linux/device-mapper.h> @@ -52,7 +53,10 @@ struct raid_dev { #define DMPF_MAX_RECOVERY_RATE 0x20 #define DMPF_MAX_WRITE_BEHIND 0x40 #define DMPF_STRIPE_CACHE 0x80 -#define DMPF_REGION_SIZE 0X100 +#define DMPF_REGION_SIZE 0x100 +#define DMPF_RAID10_COPIES 0x200 +#define DMPF_RAID10_FORMAT 0x400 + struct raid_set { struct dm_target *ti; @@ -76,6 +80,7 @@ static struct raid_type { const unsigned algorithm; /* RAID algorithm. */ } raid_types[] = { {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, + {"raid10", "RAID10 (striped mirrors)", 0, 2, 10, -1 /* Varies */}, {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, @@ -86,6 +91,36 @@ static struct raid_type { {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} }; +static char *raid10_md_layout_to_format(int layout) +{ + if (layout & 0x10000) + return "offset"; + + if ((layout & 0xFF) > 1) + return "near"; + + return "far"; +} + +static unsigned raid10_md_layout_to_copies(int layout) +{ + if ((layout & 0xFF) > 1) + return layout & 0xFF; + return (layout >> 8) & 0xFF; +} + +static int raid10_format_to_md_layout(char *format, unsigned copies) +{ + unsigned n = 1, f = 1; + + if (!strcmp("near", format)) + n = copies; + else + f = copies; + + return (!strcmp("offset", format) << 16) | (f << 8) | n; +} + static struct raid_type *get_raid_type(char *name) { int i; @@ -339,10 +374,16 @@ static int validate_region_size(struct r * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) * [stripe_cache <sectors>] Stripe cache size for higher RAIDs * [region_size <sectors>] Defines granularity of bitmap + * + * RAID10-only options: + * [raid10_copies <# copies>] Number of copies. (Default: 2) + * [raid10_format <near|far|offset>] Layout algorithm. (Default: near) */ static int parse_raid_params(struct raid_set *rs, char **argv, unsigned num_raid_params) { + char *raid10_format = "near"; + unsigned raid10_copies = 2; unsigned i, rebuild_cnt = 0; unsigned long value, region_size = 0; sector_t sectors_per_dev = rs->ti->len; @@ -416,11 +457,30 @@ static int parse_raid_params(struct raid } key = argv[i++]; + + /* Parameters that take a string value are checked here. */ + if (!strcasecmp(key, "raid10_format")) { + if (rs->raid_type->level != 10) { + rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type"; + return -EINVAL; + } + if (strcmp("near", argv[i]) && + strcmp("far", argv[i]) && + strcmp("offset", argv[i])) { + rs->ti->error = "Invalid 'raid10_format' value given"; + return -EINVAL; + } + raid10_format = argv[i]; + rs->print_flags |= DMPF_RAID10_FORMAT; + continue; + } + if (strict_strtoul(argv[i], 10, &value) < 0) { rs->ti->error = "Bad numerical argument given in raid params"; return -EINVAL; } + /* Parameters that take a numeric value are checked here */ if (!strcasecmp(key, "rebuild")) { rebuild_cnt++; rs->ti->error = NULL; @@ -436,6 +496,7 @@ static int parse_raid_params(struct raid if (rebuild_cnt > rs->raid_type->parity_devs) rs->ti->error = "Too many rebuild devices specified for given RAID type"; break; + case 10: default: DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name); rs->ti->error = "Rebuild not supported for this RAID type"; @@ -493,7 +554,7 @@ static int parse_raid_params(struct raid */ value /= 2; - if (rs->raid_type->level < 5) { + if (rs->raid_type->level != 5) { rs->ti->error = "Inappropriate argument: stripe_cache"; return -EINVAL; } @@ -518,6 +579,14 @@ static int parse_raid_params(struct raid } else if (!strcasecmp(key, "region_size")) { rs->print_flags |= DMPF_REGION_SIZE; region_size = value; + } else if (!strcasecmp(key, "raid10_copies") && + (rs->raid_type->level == 10)) { + if ((value < 2) || (value > 0xFF)) { + rs->ti->error = "Bad value for 'raid10_copies'"; + return -EINVAL; + } + rs->print_flags |= DMPF_RAID10_COPIES; + raid10_copies = value; } else { DMERR("Unable to parse RAID parameter: %s", key); rs->ti->error = "Unable to parse RAID parameters"; @@ -536,9 +605,25 @@ static int parse_raid_params(struct raid if (dm_set_target_max_io_len(rs->ti, max_io_len)) return -EINVAL; - if ((rs->raid_type->level > 1) && - sector_div(sectors_per_dev, (rs->md.raid_disks - rs->raid_type->parity_devs))) { + if (rs->raid_type->level == 10) { + /* (Len * Stripes) / Mirrors */ + sectors_per_dev *= rs->md.raid_disks; + if (sector_div(sectors_per_dev, raid10_copies)) { + rs->ti->error = "Target length not divisible by number of data devices"; + return -EINVAL; + } + if (raid10_copies > rs->md.raid_disks) { + rs->ti->error = "Not enough devices to satisfy specification"; + return -EINVAL; + } + rs->md.layout = raid10_format_to_md_layout(raid10_format, + raid10_copies); + rs->md.new_layout = rs->md.layout; + } else if ((rs->raid_type->level > 1) && + sector_div(sectors_per_dev, + (rs->md.raid_disks - rs->raid_type->parity_devs))) { rs->ti->error = "Target length not divisible by number of data devices"; + return -EINVAL; } rs->md.dev_sectors = sectors_per_dev; @@ -564,6 +649,9 @@ static int raid_is_congested(struct dm_t if (rs->raid_type->level == 1) return md_raid1_congested(&rs->md, bits); + if (rs->raid_type->level == 10) + return md_raid10_congested(&rs->md, bits); + return md_raid5_congested(&rs->md, bits); } @@ -882,6 +970,9 @@ static int analyse_superblocks(struct dm case 6: redundancy = rs->raid_type->parity_devs; break; + case 10: + redundancy = raid10_md_layout_to_copies(mddev->layout) - 1; + break; default: ti->error = "Unknown RAID type"; return -EINVAL; @@ -1201,6 +1292,14 @@ static int raid_status(struct dm_target DMEMIT(" region_size %lu", rs->md.bitmap_info.chunksize >> 9); + if (rs->print_flags & DMPF_RAID10_COPIES) + DMEMIT(" raid10_copies %u", + raid10_md_layout_to_copies(rs->md.layout)); + + if (rs->print_flags & DMPF_RAID10_FORMAT) + DMEMIT(" raid10_format %s", + raid10_md_layout_to_format(rs->md.layout)); + DMEMIT(" %d", rs->md.raid_disks); for (i = 0; i < rs->md.raid_disks; i++) { if (rs->dev[i].meta_dev) @@ -1275,7 +1374,7 @@ static void raid_resume(struct dm_target static struct target_type raid_target = { .name = "raid", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, @@ -1302,6 +1401,8 @@ module_init(dm_raid_init); module_exit(dm_raid_exit); MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); +MODULE_ALIAS("dm-raid1"); +MODULE_ALIAS("dm-raid10"); MODULE_ALIAS("dm-raid4"); MODULE_ALIAS("dm-raid5"); MODULE_ALIAS("dm-raid6"); Index: linux-upstream/Documentation/device-mapper/dm-raid.txt ================================================== ================= --- linux-upstream.orig/Documentation/device-mapper/dm-raid.txt +++ linux-upstream/Documentation/device-mapper/dm-raid.txt @@ -27,6 +27,11 @@ The target is named "raid" and it accept - rotating parity N (right-to-left) with data restart raid6_nc RAID6 N continue - rotating parity N (right-to-left) with data continuation + raid10 Various RAID10 inspired algorithms chosen by additional params + - RAID10: Striped Mirrors (aka 'Striping on top of mirrors') + - RAID1E: Integrated Adjacent Stripe Mirroring + - RAID1E: Integrated Offset Stripe Mirroring + - and other similar RAID10 variants Reference: Chapter 4 of http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf @@ -59,6 +64,59 @@ The target is named "raid" and it accept logical size of the array. The bitmap records the device synchronisation state for each region. + [raid10_copies <# copies>] + [raid10_format <near|far|offset>] + These two options are used to alter the default layout of + a RAID10 configuration. The number of copies is can be + specified, but the default is 2. There are also three + variations to how the copies are laid down - the default + is "near". Near copies are what most people think of with + respect to mirroring. If these options are left unspecified, + or 'raid10_copies 2' and/or 'raid10_format near' are given, + then the layouts for 2, 3 and 4 devices are: + 2 drives 3 drives 4 drives + -------- ---------- -------------- + A1 A1 A1 A1 A2 A1 A1 A2 A2 + A2 A2 A2 A3 A3 A3 A3 A4 A4 + A3 A3 A4 A4 A5 A5 A5 A6 A6 + A4 A4 A5 A6 A6 A7 A7 A8 A8 + .. .. .. .. .. .. .. .. .. + The 2-device layout is equivalent 2-way RAID1. The 4-device + layout is what a traditional RAID10 would look like. The + 3-device layout is what might be called a 'RAID1E - Integrated + Adjacent Stripe Mirroring'. + + If 'raid10_copies 2' and 'raid10_format far', then the layouts + for 2, 3 and 4 devices are: + 2 drives 3 drives 4 drives + -------- -------------- -------------------- + A1 A2 A1 A2 A3 A1 A2 A3 A4 + A3 A4 A4 A5 A6 A5 A6 A7 A8 + A5 A6 A7 A8 A9 A9 A10 A11 A12 + .. .. .. .. .. .. .. .. .. + A2 A1 A3 A1 A2 A4 A1 A2 A3 + A4 A3 A6 A4 A5 A8 A5 A6 A7 + A6 A5 A9 A7 A8 A12 A9 A10 A11 + .. .. .. .. .. .. .. .. .. + + If 'raid10_copies 2' and 'raid10_format offset', then the + layouts for 2, 3 and 4 devices are: + 2 drives 3 drives 4 drives + -------- ------------ ----------------- + A1 A2 A1 A2 A3 A1 A2 A3 A4 + A2 A1 A3 A1 A2 A4 A1 A2 A3 + A3 A4 A4 A5 A6 A5 A6 A7 A8 + A4 A3 A6 A4 A5 A8 A5 A6 A7 + A5 A6 A7 A8 A9 A9 A10 A11 A12 + A6 A5 A9 A7 A8 A12 A9 A10 A11 + .. .. .. .. .. .. .. .. .. + Here we see layouts closely akin to 'RAID1E - Integrated + Offset Stripe Mirroring'. + + Thanks wikipedia 'Non-standard RAID levels' for the layout + figures: + http://en.wikipedia.org/wiki/Non-standard_RAID_levels + <#raid_devs>: The number of devices composing the array. Each device consists of two entries. The first is the device containing the metadata (if any); the second is the one containing the -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel |
DM RAID: Add support for MD RAID10
On Wed, 11 Jul 2012 20:36:41 -0500 Jonathan Brassow <jbrassow@redhat.com>
wrote: > Neil, > > I've changed the tunables to the way we discussed. If it becomes > necessary to have the freedom to have simultaneous near and far copies, > then I will likely add 'raid10_near|far|offset_copies' to compliment the > existing 'raid10_copies' arg. Like you, I doubt they will be necessary > though. > > I have yet to add the code that allows new devices to replace old/failed > devices (i.e. handles the 'rebuild' parameter). That will be a future > patch. > > brassow Looks good, though a couple of comments below. Alasdair: I guess we should make sure we are in agreement about how patches to dm-raid.c are funnelled through. So far both you and I have feed them to Linus, which doesn't seem to have caused any problems yet. Are you OK with us continuing like that, would you rather all dm-raid.c patched went through you? I'm happy either way. > @@ -76,6 +80,7 @@ static struct raid_type { > const unsigned algorithm; /* RAID algorithm. */ > } raid_types[] = { > {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, > + {"raid10", "RAID10 (striped mirrors)", 0, 2, 10, -1 /* Varies */}, > {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, > {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, > {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, Initialising the "unsigned" algorithm to "-1" looks like it is asking for trouble. > @@ -493,7 +554,7 @@ static int parse_raid_params(struct raid > */ > value /= 2; > > - if (rs->raid_type->level < 5) { > + if (rs->raid_type->level != 5) { > rs->ti->error = "Inappropriate argument: stripe_cache"; > return -EINVAL; > } This leaves RAID6 out in the cold. Maybe level < 5 || level > 6 or !=5 !=6 or a switch statement? > @@ -536,9 +605,25 @@ static int parse_raid_params(struct raid > if (dm_set_target_max_io_len(rs->ti, max_io_len)) > return -EINVAL; > > - if ((rs->raid_type->level > 1) && > - sector_div(sectors_per_dev, (rs->md.raid_disks - rs->raid_type->parity_devs))) { > + if (rs->raid_type->level == 10) { > + /* (Len * Stripes) / Mirrors */ > + sectors_per_dev *= rs->md.raid_disks; > + if (sector_div(sectors_per_dev, raid10_copies)) { > + rs->ti->error = "Target length not divisible by number of data devices"; > + return -EINVAL; > + } I'm not entirely sure what you are trying to do here, but I don't think it works. At the very least you would need to convert the "sectors_per_dev" number to a 'chunks_per_dev' before multiplying by raid_disks and dividing by copies. But even that isn't necessary. If you have a 3-device near=2 array with an odd number of chunks per device, that will still work. The last chunk won't be mirrored, so won't be used. Until a couple of weeks ago a recovery of the last device would trigger an error when we try to recover that last chunk, but that is fixed now. So if you want to impose this limitation (and there is some merit in making sure people don't confuse themselves), I suggest it get imposed in the user-space tools which create the RAID10. Otherwise it looks good. Thanks, NeilBrown -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel |
DM RAID: Add support for MD RAID10
On Thu, Jul 12, 2012 at 04:32:07PM +1000, Neil Brown wrote:
> Alasdair: I guess we should make sure we are in agreement about how patches > to dm-raid.c are funnelled through. So far both you and I have feed them to > Linus, which doesn't seem to have caused any problems yet. Just take it case-by-case as has been happening. Roughly speaking, when changes are driven from the md side, you take the coupled patches to the dm files too; when the changes are driven from the dm side, I'll take them. Alasdair -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel |
DM RAID: Add support for MD RAID10
On Thu, 12 Jul 2012 10:56:24 +0100 Alasdair G Kergon <agk@redhat.com> wrote:
> On Thu, Jul 12, 2012 at 04:32:07PM +1000, Neil Brown wrote: > > Alasdair: I guess we should make sure we are in agreement about how patches > > to dm-raid.c are funnelled through. So far both you and I have feed them to > > Linus, which doesn't seem to have caused any problems yet. > > Just take it case-by-case as has been happening. > Roughly speaking, when changes are driven from the md side, you take the > coupled patches to the dm files too; when the changes are driven from the dm > side, I'll take them. Sounds good - thanks. NeilBrown -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel |
DM RAID: Add support for MD RAID10
On Wed, Jul 11, 2012 at 08:36:41PM -0500, Jonathan Brassow wrote:
> + [raid10_copies <# copies>] > + [raid10_format <near|far|offset>] > + These two options are used to alter the default layout of > + a RAID10 configuration. The number of copies is can be > + specified, but the default is 2. There are also three > + variations to how the copies are laid down - the default > + is "near". Near copies are what most people think of with > + respect to mirroring. If these options are left unspecified, > + or 'raid10_copies 2' and/or 'raid10_format near' are given, > + then the layouts for 2, 3 and 4 devices are: > + 2 drives 3 drives 4 drives > + -------- ---------- -------------- > + A1 A1 A1 A1 A2 A1 A1 A2 A2 > + A2 A2 A2 A3 A3 A3 A3 A4 A4 > + A3 A3 A4 A4 A5 A5 A5 A6 A6 > + A4 A4 A5 A6 A6 A7 A7 A8 A8 > + .. .. .. .. .. .. .. .. .. > + The 2-device layout is equivalent 2-way RAID1. The 4-device > + layout is what a traditional RAID10 would look like. The > + 3-device layout is what might be called a 'RAID1E - Integrated > + Adjacent Stripe Mirroring'. > + > + If 'raid10_copies 2' and 'raid10_format far', then the layouts > + for 2, 3 and 4 devices are: > + 2 drives 3 drives 4 drives > + -------- -------------- -------------------- > + A1 A2 A1 A2 A3 A1 A2 A3 A4 > + A3 A4 A4 A5 A6 A5 A6 A7 A8 > + A5 A6 A7 A8 A9 A9 A10 A11 A12 > + .. .. .. .. .. .. .. .. .. > + A2 A1 A3 A1 A2 A4 A1 A2 A3 > + A4 A3 A6 A4 A5 A8 A5 A6 A7 > + A6 A5 A9 A7 A8 A12 A9 A10 A11 The trick here for 4 drives is to keep the array running even if some 2 drives fail. Your layout does not so so. Only one drive may fail at any time. I think a better layout is (for 4 drives) A1 A2 A3 A4 A5 A6 A7 A8 ................. A2 A1 A4 A3 (Swich in pairs for N=2) A6 A5 A8 A7 Here all of the drive combinations 1+3, 1+4, 2+3, 2+4 may fail, and the array should still be running.. 1+2 and 3+4 could not fail without destroying the array. This would give a 66,7 % chance of the array surviving 2 disk crashes. That is better than the 0 % that the documented scheme has. the same scheme could go for all even numbers of N in a raid10,far layout. consider the drives in pairs, and switch the blocks within a pair. I think this could be generalized to N-copies: treat every group N drives, as N copies of the same set of selection of blocks. Then any N-1 of the disks in the group could fail and the arry still be running. Works then for arrays with straight multipla of N disks . I am not sure that ordinary raid10 does so, but Neil has indicated so. I would be grateful if you could check this, and also test what happens with your code if you have any combination of 2 drives fail for the 4 drive case. > + > + If 'raid10_copies 2' and 'raid10_format offset', then the > + layouts for 2, 3 and 4 devices are: > + 2 drives 3 drives 4 drives > + -------- ------------ ----------------- > + A1 A2 A1 A2 A3 A1 A2 A3 A4 > + A2 A1 A3 A1 A2 A4 A1 A2 A3 > + A3 A4 A4 A5 A6 A5 A6 A7 A8 > + A4 A3 A6 A4 A5 A8 A5 A6 A7 > + A5 A6 A7 A8 A9 A9 A10 A11 A12 > + A6 A5 A9 A7 A8 A12 A9 A10 A11 The same problem here with 2 failing drives (for the 4 drive case). However I dont see an easy solution to this problem. > + Here we see layouts closely akin to 'RAID1E - Integrated > + Offset Stripe Mirroring'. > + > + Thanks wikipedia 'Non-standard RAID levels' for the layout > + figures: > + http://en.wikipedia.org/wiki/Non-standard_RAID_levels Wikipedia may be in error wrt. the block orders. besT regards Keld -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel |
DM RAID: Add support for MD RAID10
Thanks for the suggestion. The documentation is correct, as far as I can tell. What you take issue with is that a higher level of redundancy can be achieved by laying down the copies differently. Neil touched on that in this message:
http://marc.info/?l=linux-raid&m=134136516029779&w=2 When it is available to MD, I'll make it available to dm-raid also. brassow On Jul 12, 2012, at 11:22 AM, keld@keldix.com wrote: > On Wed, Jul 11, 2012 at 08:36:41PM -0500, Jonathan Brassow wrote: >> + [raid10_copies <# copies>] >> + [raid10_format <near|far|offset>] >> + These two options are used to alter the default layout of >> + a RAID10 configuration. The number of copies is can be >> + specified, but the default is 2. There are also three >> + variations to how the copies are laid down - the default >> + is "near". Near copies are what most people think of with >> + respect to mirroring. If these options are left unspecified, >> + or 'raid10_copies 2' and/or 'raid10_format near' are given, >> + then the layouts for 2, 3 and 4 devices are: >> + 2 drives 3 drives 4 drives >> + -------- ---------- -------------- >> + A1 A1 A1 A1 A2 A1 A1 A2 A2 >> + A2 A2 A2 A3 A3 A3 A3 A4 A4 >> + A3 A3 A4 A4 A5 A5 A5 A6 A6 >> + A4 A4 A5 A6 A6 A7 A7 A8 A8 >> + .. .. .. .. .. .. .. .. .. >> + The 2-device layout is equivalent 2-way RAID1. The 4-device >> + layout is what a traditional RAID10 would look like. The >> + 3-device layout is what might be called a 'RAID1E - Integrated >> + Adjacent Stripe Mirroring'. >> + >> + If 'raid10_copies 2' and 'raid10_format far', then the layouts >> + for 2, 3 and 4 devices are: >> + 2 drives 3 drives 4 drives >> + -------- -------------- -------------------- >> + A1 A2 A1 A2 A3 A1 A2 A3 A4 >> + A3 A4 A4 A5 A6 A5 A6 A7 A8 >> + A5 A6 A7 A8 A9 A9 A10 A11 A12 >> + .. .. .. .. .. .. .. .. .. >> + A2 A1 A3 A1 A2 A4 A1 A2 A3 >> + A4 A3 A6 A4 A5 A8 A5 A6 A7 >> + A6 A5 A9 A7 A8 A12 A9 A10 A11 > > The trick here for 4 drives is to keep the array running even if some 2 drives fail. > Your layout does not so so. Only one drive may fail at any time. > > I think a better layout is (for 4 drives) > > A1 A2 A3 A4 > A5 A6 A7 A8 > > ................. > > A2 A1 A4 A3 (Swich in pairs for N=2) > A6 A5 A8 A7 > > Here all of the drive combinations 1+3, 1+4, 2+3, 2+4 may fail, and the array should > still be running.. 1+2 and 3+4 could not fail without destroying the array. > This would give a 66,7 % chance of the array surviving 2 disk crashes. > That is better than the 0 % that the documented scheme has. > > the same scheme could go for all even numbers of N in a raid10,far layout. > consider the drives in pairs, and switch the blocks within a pair. > > I think this could be generalized to N-copies: treat every group N drives, > as N copies of the same set of selection of blocks. > Then any N-1 of the disks in the group could fail and the arry still > be running. Works then for arrays with straight multipla of N disks . > > I am not sure that ordinary raid10 does so, but Neil has indicated so. > I would be grateful if you could check this, and > also test what happens with your code if you have any combination of 2 drives > fail for the 4 drive case. > >> + >> + If 'raid10_copies 2' and 'raid10_format offset', then the >> + layouts for 2, 3 and 4 devices are: >> + 2 drives 3 drives 4 drives >> + -------- ------------ ----------------- >> + A1 A2 A1 A2 A3 A1 A2 A3 A4 >> + A2 A1 A3 A1 A2 A4 A1 A2 A3 >> + A3 A4 A4 A5 A6 A5 A6 A7 A8 >> + A4 A3 A6 A4 A5 A8 A5 A6 A7 >> + A5 A6 A7 A8 A9 A9 A10 A11 A12 >> + A6 A5 A9 A7 A8 A12 A9 A10 A11 > > The same problem here with 2 failing drives (for the 4 drive case). > However I dont see an easy solution to this problem. > >> + Here we see layouts closely akin to 'RAID1E - Integrated >> + Offset Stripe Mirroring'. >> + >> + Thanks wikipedia 'Non-standard RAID levels' for the layout >> + figures: >> + http://en.wikipedia.org/wiki/Non-standard_RAID_levels > > Wikipedia may be in error wrt. the block orders. > > besT regards > Keld -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel |
DM RAID: Add support for MD RAID10
On Thu, Jul 12, 2012 at 02:00:35PM -0500, Brassow Jonathan wrote:
> Thanks for the suggestion. The documentation is correct, as far as I can tell. What you take issue with is that a higher level of redundancy can be achieved by laying down the copies differently. Neil touched on that in this message: > http://marc.info/?l=linux-raid&m=134136516029779&w=2 Thanks for the info. Well, I corrected the wikipedia description to the one that I suggested, as this was more in line with what I understood was the current implementation. I have missed the email from Neil that you quoted above. I believe it was me writing up the Wikipedia text anyway, at least I did all of the initial writeup of the Wikipedia text on raid10. And then I saw that you were implementing a description on raid10,far that was less than optimal. That description should not be around as it is a flawed design. (I did the design of raid10,far). There should only be one layout for "far". I think when we discussed the "far" layout initially, we were not aware of the consequences of the layout then wrt how many disk failures the layout can survive.. I think the layout you described should not be promoted at all, and only kept for backward compatibility. As there is no backward compatibility in your case I think it is an error to implement it. I understand that you do not reuse any of the MD code here? I hestitate now changing the wikipedia description of MD raid10,far back. I fear that some implementers would code it as to that spec! Well, there should probably be something about it, I will write up something. The flaw is worse than Neil described, as far as I understand. With n=2 you can in the current implementation only have 1 disk failing, for any numbers of drives in the array. With the suggested layout then for 4 drives you have the probability of surviving 66 % of 2 drives failing. This get even better for 6, 8 .. disks in the array. And you may even survive 3 or more disk failures, dependent on the number of drives employed. The probability is the same as for raid-1+0 > When it is available to MD, I'll make it available to dm-raid also. Please dont implement it in the flawed way. It will just create a number of problems for when to switch over and convert between the two formats, and then which should be the default (I fear some would say the old flawed should be the default), and we need to explain the two formats and implement two sets of repairs and so on. Best regards Keld > brassow > > > On Jul 12, 2012, at 11:22 AM, keld@keldix.com wrote: > > > On Wed, Jul 11, 2012 at 08:36:41PM -0500, Jonathan Brassow wrote: > >> + [raid10_copies <# copies>] > >> + [raid10_format <near|far|offset>] > >> + These two options are used to alter the default layout of > >> + a RAID10 configuration. The number of copies is can be > >> + specified, but the default is 2. There are also three > >> + variations to how the copies are laid down - the default > >> + is "near". Near copies are what most people think of with > >> + respect to mirroring. If these options are left unspecified, > >> + or 'raid10_copies 2' and/or 'raid10_format near' are given, > >> + then the layouts for 2, 3 and 4 devices are: > >> + 2 drives 3 drives 4 drives > >> + -------- ---------- -------------- > >> + A1 A1 A1 A1 A2 A1 A1 A2 A2 > >> + A2 A2 A2 A3 A3 A3 A3 A4 A4 > >> + A3 A3 A4 A4 A5 A5 A5 A6 A6 > >> + A4 A4 A5 A6 A6 A7 A7 A8 A8 > >> + .. .. .. .. .. .. .. .. .. > >> + The 2-device layout is equivalent 2-way RAID1. The 4-device > >> + layout is what a traditional RAID10 would look like. The > >> + 3-device layout is what might be called a 'RAID1E - Integrated > >> + Adjacent Stripe Mirroring'. > >> + > >> + If 'raid10_copies 2' and 'raid10_format far', then the layouts > >> + for 2, 3 and 4 devices are: > >> + 2 drives 3 drives 4 drives > >> + -------- -------------- -------------------- > >> + A1 A2 A1 A2 A3 A1 A2 A3 A4 > >> + A3 A4 A4 A5 A6 A5 A6 A7 A8 > >> + A5 A6 A7 A8 A9 A9 A10 A11 A12 > >> + .. .. .. .. .. .. .. .. .. > >> + A2 A1 A3 A1 A2 A4 A1 A2 A3 > >> + A4 A3 A6 A4 A5 A8 A5 A6 A7 > >> + A6 A5 A9 A7 A8 A12 A9 A10 A11 > > > > The trick here for 4 drives is to keep the array running even if some 2 drives fail. > > Your layout does not so so. Only one drive may fail at any time. > > > > I think a better layout is (for 4 drives) > > > > A1 A2 A3 A4 > > A5 A6 A7 A8 > > > > ................. > > > > A2 A1 A4 A3 (Swich in pairs for N=2) > > A6 A5 A8 A7 > > > > Here all of the drive combinations 1+3, 1+4, 2+3, 2+4 may fail, and the array should > > still be running.. 1+2 and 3+4 could not fail without destroying the array. > > This would give a 66,7 % chance of the array surviving 2 disk crashes. > > That is better than the 0 % that the documented scheme has. > > > > the same scheme could go for all even numbers of N in a raid10,far layout. > > consider the drives in pairs, and switch the blocks within a pair. > > > > I think this could be generalized to N-copies: treat every group N drives, > > as N copies of the same set of selection of blocks. > > Then any N-1 of the disks in the group could fail and the arry still > > be running. Works then for arrays with straight multipla of N disks . > > > > I am not sure that ordinary raid10 does so, but Neil has indicated so. > > I would be grateful if you could check this, and > > also test what happens with your code if you have any combination of 2 drives > > fail for the 4 drive case. > > > >> + > >> + If 'raid10_copies 2' and 'raid10_format offset', then the > >> + layouts for 2, 3 and 4 devices are: > >> + 2 drives 3 drives 4 drives > >> + -------- ------------ ----------------- > >> + A1 A2 A1 A2 A3 A1 A2 A3 A4 > >> + A2 A1 A3 A1 A2 A4 A1 A2 A3 > >> + A3 A4 A4 A5 A6 A5 A6 A7 A8 > >> + A4 A3 A6 A4 A5 A8 A5 A6 A7 > >> + A5 A6 A7 A8 A9 A9 A10 A11 A12 > >> + A6 A5 A9 A7 A8 A12 A9 A10 A11 > > > > The same problem here with 2 failing drives (for the 4 drive case). > > However I dont see an easy solution to this problem. > > > >> + Here we see layouts closely akin to 'RAID1E - Integrated > >> + Offset Stripe Mirroring'. > >> + > >> + Thanks wikipedia 'Non-standard RAID levels' for the layout > >> + figures: > >> + http://en.wikipedia.org/wiki/Non-standard_RAID_levels > > > > Wikipedia may be in error wrt. the block orders. > > > > besT regards > > Keld > > -- > To unsubscribe from this list: send the line "unsubscribe linux-raid" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel |
DM RAID: Add support for MD RAID10
On Fri, 13 Jul 2012 03:15:05 +0200 keld@keldix.com wrote:
> I think the layout you described should not be promoted at all, > and only kept for backward compatibility. As there is no backward > compatibility in your case I think it is an error to implement it. > I understand that you do not reuse any of the MD code here? Not correct. The whole point of this exercise is to reuse md code. > The flaw is worse than Neil described, as far as I understand. > With n=2 you can in the current implementation only have 1 disk failing, > for any numbers of drives in the array. With the suggested layout > then for 4 drives you have the probability of surviving 66 % > of 2 drives failing. This get even better for 6, 8 .. disks in the array. > And you may even survive 3 or more disk failures, dependent on the number > of drives employed. The probability is the same as for raid-1+0 Also not correct. You can certainly have more than one failed device providing you don't have 'n' adjacent devices all failed. So e.g. if you have 2 drives in a far-2 layout then you can survive the failure of three devices if they are 0,2,4 or 1,3,5. > > > When it is available to MD, I'll make it available to dm-raid also. > > Please dont implement it in the flawed way. It will just create a number of problems > for when to switch over and convert between the two formats, and then which should > be the default (I fear some would say the old flawed should be the default), and we need > to explain the two formats and implement two sets of repairs and so on. This "flawed" arrangement is the only one that makes sense for an odd number of devices (assuming 2 copies). NeilBrown -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel |
DM RAID: Add support for MD RAID10
On Fri, Jul 13, 2012 at 11:27:17AM +1000, NeilBrown wrote:
> On Fri, 13 Jul 2012 03:15:05 +0200 keld@keldix.com wrote: > > > I think the layout you described should not be promoted at all, > > and only kept for backward compatibility. As there is no backward > > compatibility in your case I think it is an error to implement it. > > I understand that you do not reuse any of the MD code here? > > Not correct. The whole point of this exercise is to reuse md code. OK, I also think it is only sensible to reuse the code already done. I misunderstood then your mail on not to repeat mistakes - which I took to mean that Barrow should not implement things with mistakes. Maybe that means to not make hooks to MD code that is a mistake? So Barrow will implement the improved far layout once there is MD code for it, and then he can make the neceessary hooks in DM code? > > The flaw is worse than Neil described, as far as I understand. > > With n=2 you can in the current implementation only have 1 disk failing, > > for any numbers of drives in the array. With the suggested layout > > then for 4 drives you have the probability of surviving 66 % > > of 2 drives failing. This get even better for 6, 8 .. disks in the array. > > And you may even survive 3 or more disk failures, dependent on the number > > of drives employed. The probability is the same as for raid-1+0 > > Also not correct. You can certainly have more than one failed device > providing you don't have 'n' adjacent devices all failed. > So e.g. if you have 2 drives in a far-2 layout then you can survive the > failure of three devices if they are 0,2,4 or 1,3,5. On further investigations I agree that you can survive more than one drive failing with the current layout. > > > When it is available to MD, I'll make it available to dm-raid also. > > > > Please dont implement it in the flawed way. It will just create a number of problems > > for when to switch over and convert between the two formats, and then which should > > be the default (I fear some would say the old flawed should be the default), and we need > > to explain the two formats and implement two sets of repairs and so on. > > This "flawed" arrangement is the only one that makes sense for an odd number > of devices (assuming 2 copies). Well, I have an idea for the odd number of devices: Have the disks arranged in groups (for N=2 in pairs) and then the last group extended with the leftover disks in the way it is done now. For 2 copies, this would be a number of pairs, and then a rest group of 3 disks. For 3 copies, this would be a number of triplets, and then 4 or 5 disks in the last group. Can I assume, Neil, that you agree with the rest I wrote? :-) Especially that we should only advice the new layout, and there is no reason for the current implementation except for backwards compatibility? best regards keld -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel |
DM RAID: Add support for MD RAID10
On Fri, 13 Jul 2012 10:29:23 +0200 keld@keldix.com wrote:
> On Fri, Jul 13, 2012 at 11:27:17AM +1000, NeilBrown wrote: > > On Fri, 13 Jul 2012 03:15:05 +0200 keld@keldix.com wrote: > > > > > I think the layout you described should not be promoted at all, > > > and only kept for backward compatibility. As there is no backward > > > compatibility in your case I think it is an error to implement it. > > > I understand that you do not reuse any of the MD code here? > > > > Not correct. The whole point of this exercise is to reuse md code. > > OK, I also think it is only sensible to reuse the code already done. > I misunderstood then your mail on not to repeat mistakes - which I took to mean that > Barrow should not implement things with mistakes. Maybe that means to not make hooks > to MD code that is a mistake? > > So Barrow will implement the improved far layout once there is MD code for it, and > then he can make the neceessary hooks in DM code? > > > > The flaw is worse than Neil described, as far as I understand. > > > With n=2 you can in the current implementation only have 1 disk failing, > > > for any numbers of drives in the array. With the suggested layout > > > then for 4 drives you have the probability of surviving 66 % > > > of 2 drives failing. This get even better for 6, 8 .. disks in the array. > > > And you may even survive 3 or more disk failures, dependent on the number > > > of drives employed. The probability is the same as for raid-1+0 > > > > Also not correct. You can certainly have more than one failed device > > providing you don't have 'n' adjacent devices all failed. > > So e.g. if you have 2 drives in a far-2 layout then you can survive the > > failure of three devices if they are 0,2,4 or 1,3,5. > > On further investigations I agree that you can survive more than one drive failing with > the current layout. > > > > > When it is available to MD, I'll make it available to dm-raid also. > > > > > > Please dont implement it in the flawed way. It will just create a number of problems > > > for when to switch over and convert between the two formats, and then which should > > > be the default (I fear some would say the old flawed should be the default), and we need > > > to explain the two formats and implement two sets of repairs and so on. > > > > This "flawed" arrangement is the only one that makes sense for an odd number > > of devices (assuming 2 copies). > > Well, I have an idea for the odd number of devices: > Have the disks arranged in groups (for N=2 in pairs) and then the last group extended with > the leftover disks in the way it is done now. > > For 2 copies, this would be a number of pairs, and then a rest group of 3 disks. > For 3 copies, this would be a number of triplets, and then 4 or 5 disks in the last group. Certainly possible, but it feels clumsy. I'm not convinced it is a good idea. > > Can I assume, Neil, that you agree with the rest I wrote? :-) You can agree that I don't strongly disagree... > Especially that we should only advice the new layout, and there is no reason for the > current implementation except for backwards compatibility? The main reason for the current implementation is that is currently implemented. Until an alternate implementation exists, it seems pointless to recommend that people use it. Maybe you are suggesting that dmraid should not support raid10-far or raid10-offset until the "new" approach is implemented. Maybe that is sensible, but only if someone steps forwards and actually implements the "new" approach. NeilBrown -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel |
| All times are GMT. The time now is 06:55 AM. |
VBulletin, Copyright ©2000 - 2013, Jelsoft Enterprises Ltd.
Content Relevant URLs by vBSEO ©2007, Crawlability, Inc.