FAQ Search Today's Posts Mark Forums Read
» Video Reviews

» Linux Archive

Linux-archive is a website aiming to archive linux email lists and to make them easily accessible for linux users/developers.


» Sponsor

» Partners

» Sponsor

Go Back   Linux Archive > Redhat > Device-mapper Development

 
 
LinkBack Thread Tools
 
Old 07-12-2012, 01:36 AM
Jonathan Brassow
 
Default DM RAID: Add support for MD RAID10

Neil,

I've changed the tunables to the way we discussed. If it becomes
necessary to have the freedom to have simultaneous near and far copies,
then I will likely add 'raid10_near|far|offset_copies' to complement the
existing 'raid10_copies' arg. Like you, I doubt they will be necessary
though.

I have yet to add the code that allows new devices to replace old/failed
devices (i.e. handles the 'rebuild' parameter). That will be a future
patch.

brassow

dm raid: add md raid10 support

Support the MD RAID10 personality through dm-raid.c

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>

Index: linux-upstream/drivers/md/dm-raid.c
===================================================================
--- linux-upstream.orig/drivers/md/dm-raid.c
+++ linux-upstream/drivers/md/dm-raid.c
@@ -11,6 +11,7 @@
#include "md.h"
#include "raid1.h"
#include "raid5.h"
+#include "raid10.h"
#include "bitmap.h"

#include <linux/device-mapper.h>
@@ -52,7 +53,10 @@ struct raid_dev {
#define DMPF_MAX_RECOVERY_RATE 0x20
#define DMPF_MAX_WRITE_BEHIND 0x40
#define DMPF_STRIPE_CACHE 0x80
-#define DMPF_REGION_SIZE 0X100
+#define DMPF_REGION_SIZE 0x100
+#define DMPF_RAID10_COPIES 0x200
+#define DMPF_RAID10_FORMAT 0x400
+
struct raid_set {
struct dm_target *ti;

@@ -76,6 +80,7 @@ static struct raid_type {
const unsigned algorithm; /* RAID algorithm. */
} raid_types[] = {
{"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */},
+ {"raid10", "RAID10 (striped mirrors)", 0, 2, 10, -1 /* Varies */},
{"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
{"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
{"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
@@ -86,6 +91,36 @@ static struct raid_type {
{"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
};

+static char *raid10_md_layout_to_format(int layout)
+{
+ if (layout & 0x10000)
+ return "offset";
+
+ if ((layout & 0xFF) > 1)
+ return "near";
+
+ return "far";
+}
+
+static unsigned raid10_md_layout_to_copies(int layout)
+{
+ if ((layout & 0xFF) > 1)
+ return layout & 0xFF;
+ return (layout >> 8) & 0xFF;
+}
+
+static int raid10_format_to_md_layout(char *format, unsigned copies)
+{
+ unsigned n = 1, f = 1;
+
+ if (!strcmp("near", format))
+ n = copies;
+ else
+ f = copies;
+
+ return (!strcmp("offset", format) << 16) | (f << 8) | n;
+}
+
static struct raid_type *get_raid_type(char *name)
{
int i;
@@ -339,10 +374,16 @@ static int validate_region_size(struct r
* [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
* [stripe_cache <sectors>] Stripe cache size for higher RAIDs
* [region_size <sectors>] Defines granularity of bitmap
+ *
+ * RAID10-only options:
+ * [raid10_copies <# copies>] Number of copies. (Default: 2)
+ * [raid10_format <near|far|offset>] Layout algorithm. (Default: near)
*/
static int parse_raid_params(struct raid_set *rs, char **argv,
unsigned num_raid_params)
{
+ char *raid10_format = "near";
+ unsigned raid10_copies = 2;
unsigned i, rebuild_cnt = 0;
unsigned long value, region_size = 0;
sector_t sectors_per_dev = rs->ti->len;
@@ -416,11 +457,30 @@ static int parse_raid_params(struct raid
}

key = argv[i++];
+
+ /* Parameters that take a string value are checked here. */
+ if (!strcasecmp(key, "raid10_format")) {
+ if (rs->raid_type->level != 10) {
+ rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
+ return -EINVAL;
+ }
+ if (strcmp("near", argv[i]) &&
+ strcmp("far", argv[i]) &&
+ strcmp("offset", argv[i])) {
+ rs->ti->error = "Invalid 'raid10_format' value given";
+ return -EINVAL;
+ }
+ raid10_format = argv[i];
+ rs->print_flags |= DMPF_RAID10_FORMAT;
+ continue;
+ }
+
if (strict_strtoul(argv[i], 10, &value) < 0) {
rs->ti->error = "Bad numerical argument given in raid params";
return -EINVAL;
}

+ /* Parameters that take a numeric value are checked here */
if (!strcasecmp(key, "rebuild")) {
rebuild_cnt++;
rs->ti->error = NULL;
@@ -436,6 +496,7 @@ static int parse_raid_params(struct raid
if (rebuild_cnt > rs->raid_type->parity_devs)
rs->ti->error = "Too many rebuild devices specified for given RAID type";
break;
+ case 10:
default:
DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name);
rs->ti->error = "Rebuild not supported for this RAID type";
@@ -493,7 +554,7 @@ static int parse_raid_params(struct raid
*/
value /= 2;

- if (rs->raid_type->level < 5) {
+ if (rs->raid_type->level != 5) {
rs->ti->error = "Inappropriate argument: stripe_cache";
return -EINVAL;
}
@@ -518,6 +579,14 @@ static int parse_raid_params(struct raid
} else if (!strcasecmp(key, "region_size")) {
rs->print_flags |= DMPF_REGION_SIZE;
region_size = value;
+ } else if (!strcasecmp(key, "raid10_copies") &&
+ (rs->raid_type->level == 10)) {
+ if ((value < 2) || (value > 0xFF)) {
+ rs->ti->error = "Bad value for 'raid10_copies'";
+ return -EINVAL;
+ }
+ rs->print_flags |= DMPF_RAID10_COPIES;
+ raid10_copies = value;
} else {
DMERR("Unable to parse RAID parameter: %s", key);
rs->ti->error = "Unable to parse RAID parameters";
@@ -536,9 +605,25 @@ static int parse_raid_params(struct raid
if (dm_set_target_max_io_len(rs->ti, max_io_len))
return -EINVAL;

- if ((rs->raid_type->level > 1) &&
- sector_div(sectors_per_dev, (rs->md.raid_disks - rs->raid_type->parity_devs))) {
+ if (rs->raid_type->level == 10) {
+ /* (Len * Stripes) / Mirrors */
+ sectors_per_dev *= rs->md.raid_disks;
+ if (sector_div(sectors_per_dev, raid10_copies)) {
+ rs->ti->error = "Target length not divisible by number of data devices";
+ return -EINVAL;
+ }
+ if (raid10_copies > rs->md.raid_disks) {
+ rs->ti->error = "Not enough devices to satisfy specification";
+ return -EINVAL;
+ }
+ rs->md.layout = raid10_format_to_md_layout(raid10_format,
+ raid10_copies);
+ rs->md.new_layout = rs->md.layout;
+ } else if ((rs->raid_type->level > 1) &&
+ sector_div(sectors_per_dev,
+ (rs->md.raid_disks - rs->raid_type->parity_devs))) {
rs->ti->error = "Target length not divisible by number of data devices";
+
return -EINVAL;
}
rs->md.dev_sectors = sectors_per_dev;
@@ -564,6 +649,9 @@ static int raid_is_congested(struct dm_t
if (rs->raid_type->level == 1)
return md_raid1_congested(&rs->md, bits);

+ if (rs->raid_type->level == 10)
+ return md_raid10_congested(&rs->md, bits);
+
return md_raid5_congested(&rs->md, bits);
}

@@ -882,6 +970,9 @@ static int analyse_superblocks(struct dm
case 6:
redundancy = rs->raid_type->parity_devs;
break;
+ case 10:
+ redundancy = raid10_md_layout_to_copies(mddev->layout) - 1;
+ break;
default:
ti->error = "Unknown RAID type";
return -EINVAL;
@@ -1201,6 +1292,14 @@ static int raid_status(struct dm_target
DMEMIT(" region_size %lu",
rs->md.bitmap_info.chunksize >> 9);

+ if (rs->print_flags & DMPF_RAID10_COPIES)
+ DMEMIT(" raid10_copies %u",
+ raid10_md_layout_to_copies(rs->md.layout));
+
+ if (rs->print_flags & DMPF_RAID10_FORMAT)
+ DMEMIT(" raid10_format %s",
+ raid10_md_layout_to_format(rs->md.layout));
+
DMEMIT(" %d", rs->md.raid_disks);
for (i = 0; i < rs->md.raid_disks; i++) {
if (rs->dev[i].meta_dev)
@@ -1275,7 +1374,7 @@ static void raid_resume(struct dm_target

static struct target_type raid_target = {
.name = "raid",
- .version = {1, 2, 0},
+ .version = {1, 3, 0},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
@@ -1302,6 +1401,8 @@ module_init(dm_raid_init);
module_exit(dm_raid_exit);

MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
+MODULE_ALIAS("dm-raid1");
+MODULE_ALIAS("dm-raid10");
MODULE_ALIAS("dm-raid4");
MODULE_ALIAS("dm-raid5");
MODULE_ALIAS("dm-raid6");
Index: linux-upstream/Documentation/device-mapper/dm-raid.txt
===================================================================
--- linux-upstream.orig/Documentation/device-mapper/dm-raid.txt
+++ linux-upstream/Documentation/device-mapper/dm-raid.txt
@@ -27,6 +27,11 @@ The target is named "raid" and it accept
- rotating parity N (right-to-left) with data restart
raid6_nc RAID6 N continue
- rotating parity N (right-to-left) with data continuation
+ raid10 Various RAID10 inspired algorithms chosen by additional params
+ - RAID10: Striped Mirrors (aka 'Striping on top of mirrors')
+ - RAID1E: Integrated Adjacent Stripe Mirroring
+ - RAID1E: Integrated Offset Stripe Mirroring
+ - and other similar RAID10 variants

Reference: Chapter 4 of
http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf
@@ -59,6 +64,59 @@ The target is named "raid" and it accept
logical size of the array. The bitmap records the device
synchronisation state for each region.

+ [raid10_copies <# copies>]
+ [raid10_format <near|far|offset>]
+ These two options are used to alter the default layout of
+ a RAID10 configuration. The number of copies can be
+ specified, but the default is 2. There are also three
+ variations to how the copies are laid down - the default
+ is "near". Near copies are what most people think of with
+ respect to mirroring. If these options are left unspecified,
+ or 'raid10_copies 2' and/or 'raid10_format near' are given,
+ then the layouts for 2, 3 and 4 devices are:
+ 2 drives 3 drives 4 drives
+ -------- ---------- --------------
+ A1 A1 A1 A1 A2 A1 A1 A2 A2
+ A2 A2 A2 A3 A3 A3 A3 A4 A4
+ A3 A3 A4 A4 A5 A5 A5 A6 A6
+ A4 A4 A5 A6 A6 A7 A7 A8 A8
+ .. .. .. .. .. .. .. .. ..
+ The 2-device layout is equivalent to 2-way RAID1. The 4-device
+ layout is what a traditional RAID10 would look like. The
+ 3-device layout is what might be called a 'RAID1E - Integrated
+ Adjacent Stripe Mirroring'.
+
+ If 'raid10_copies 2' and 'raid10_format far', then the layouts
+ for 2, 3 and 4 devices are:
+ 2 drives 3 drives 4 drives
+ -------- -------------- --------------------
+ A1 A2 A1 A2 A3 A1 A2 A3 A4
+ A3 A4 A4 A5 A6 A5 A6 A7 A8
+ A5 A6 A7 A8 A9 A9 A10 A11 A12
+ .. .. .. .. .. .. .. .. ..
+ A2 A1 A3 A1 A2 A4 A1 A2 A3
+ A4 A3 A6 A4 A5 A8 A5 A6 A7
+ A6 A5 A9 A7 A8 A12 A9 A10 A11
+ .. .. .. .. .. .. .. .. ..
+
+ If 'raid10_copies 2' and 'raid10_format offset', then the
+ layouts for 2, 3 and 4 devices are:
+ 2 drives 3 drives 4 drives
+ -------- ------------ -----------------
+ A1 A2 A1 A2 A3 A1 A2 A3 A4
+ A2 A1 A3 A1 A2 A4 A1 A2 A3
+ A3 A4 A4 A5 A6 A5 A6 A7 A8
+ A4 A3 A6 A4 A5 A8 A5 A6 A7
+ A5 A6 A7 A8 A9 A9 A10 A11 A12
+ A6 A5 A9 A7 A8 A12 A9 A10 A11
+ .. .. .. .. .. .. .. .. ..
+ Here we see layouts closely akin to 'RAID1E - Integrated
+ Offset Stripe Mirroring'.
+
+ Thanks wikipedia 'Non-standard RAID levels' for the layout
+ figures:
+ http://en.wikipedia.org/wiki/Non-standard_RAID_levels
+
<#raid_devs>: The number of devices composing the array.
Each device consists of two entries. The first is the device
containing the metadata (if any); the second is the one containing the


--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
 
Old 07-12-2012, 06:32 AM
NeilBrown
 
Default DM RAID: Add support for MD RAID10

On Wed, 11 Jul 2012 20:36:41 -0500 Jonathan Brassow <jbrassow@redhat.com>
wrote:

> Neil,
>
> I've changed the tunables to the way we discussed. If it becomes
> necessary to have the freedom to have simultaneous near and far copies,
> then I will likely add 'raid10_near|far|offset_copies' to compliment the
> existing 'raid10_copies' arg. Like you, I doubt they will be necessary
> though.
>
> I have yet to add the code that allows new devices to replace old/failed
> devices (i.e. handles the 'rebuild' parameter). That will be a future
> patch.
>
> brassow

Looks good, though a couple of comments below.

Alasdair: I guess we should make sure we are in agreement about how patches
to dm-raid.c are funnelled through. So far both you and I have fed them to
Linus, which doesn't seem to have caused any problems yet. Are you OK with
us continuing like that, or would you rather all dm-raid.c patches went through
you?
I'm happy either way.

> @@ -76,6 +80,7 @@ static struct raid_type {
> const unsigned algorithm; /* RAID algorithm. */
> } raid_types[] = {
> {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */},
> + {"raid10", "RAID10 (striped mirrors)", 0, 2, 10, -1 /* Varies */},
> {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
> {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
> {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},

Initialising the "unsigned" algorithm to "-1" looks like it is asking for
trouble.

> @@ -493,7 +554,7 @@ static int parse_raid_params(struct raid
> */
> value /= 2;
>
> - if (rs->raid_type->level < 5) {
> + if (rs->raid_type->level != 5) {
> rs->ti->error = "Inappropriate argument: stripe_cache";
> return -EINVAL;
> }

This leaves RAID6 out in the cold. Maybe
level < 5 || level > 6
or !=5 !=6
or a switch statement?


> @@ -536,9 +605,25 @@ static int parse_raid_params(struct raid
> if (dm_set_target_max_io_len(rs->ti, max_io_len))
> return -EINVAL;
>
> - if ((rs->raid_type->level > 1) &&
> - sector_div(sectors_per_dev, (rs->md.raid_disks - rs->raid_type->parity_devs))) {
> + if (rs->raid_type->level == 10) {
> + /* (Len * Stripes) / Mirrors */
> + sectors_per_dev *= rs->md.raid_disks;
> + if (sector_div(sectors_per_dev, raid10_copies)) {
> + rs->ti->error = "Target length not divisible by number of data devices";
> + return -EINVAL;
> + }

I'm not entirely sure what you are trying to do here, but I don't think it
works.

At the very least you would need to convert the "sectors_per_dev" number to a
'chunks_per_dev' before multiplying by raid_disks and dividing by copies.

But even that isn't necessary. If you have a 3-device near=2 array with an
odd number of chunks per device, that will still work. The last chunk won't
be mirrored, so won't be used.
Until a couple of weeks ago a recovery of the last device would trigger an
error when we try to recover that last chunk, but that is fixed now.

So if you want to impose this limitation (and there is some merit in making
sure people don't confuse themselves), I suggest it get imposed in the
user-space tools which create the RAID10.


Otherwise it looks good.

Thanks,
NeilBrown
--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
 
Old 07-12-2012, 09:56 AM
Alasdair G Kergon
 
Default DM RAID: Add support for MD RAID10

On Thu, Jul 12, 2012 at 04:32:07PM +1000, Neil Brown wrote:
> Alasdair: I guess we should make sure we are in agreement about how patches
> to dm-raid.c are funnelled through. So far both you and I have feed them to
> Linus, which doesn't seem to have caused any problems yet.

Just take it case-by-case as has been happening.
Roughly speaking, when changes are driven from the md side, you take the
coupled patches to the dm files too; when the changes are driven from the dm
side, I'll take them.

Alasdair

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
 
Old 07-12-2012, 11:43 AM
NeilBrown
 
Default DM RAID: Add support for MD RAID10

On Thu, 12 Jul 2012 10:56:24 +0100 Alasdair G Kergon <agk@redhat.com> wrote:

> On Thu, Jul 12, 2012 at 04:32:07PM +1000, Neil Brown wrote:
> > Alasdair: I guess we should make sure we are in agreement about how patches
> > to dm-raid.c are funnelled through. So far both you and I have feed them to
> > Linus, which doesn't seem to have caused any problems yet.
>
> Just take it case-by-case as has been happening.
> Roughly speaking, when changes are driven from the md side, you take the
> coupled patches to the dm files too; when the changes are driven from the dm
> side, I'll take them.

Sounds good - thanks.

NeilBrown
--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
 
Old 07-12-2012, 04:22 PM
 
Default DM RAID: Add support for MD RAID10

On Wed, Jul 11, 2012 at 08:36:41PM -0500, Jonathan Brassow wrote:
> + [raid10_copies <# copies>]
> + [raid10_format <near|far|offset>]
> + These two options are used to alter the default layout of
> + a RAID10 configuration. The number of copies is can be
> + specified, but the default is 2. There are also three
> + variations to how the copies are laid down - the default
> + is "near". Near copies are what most people think of with
> + respect to mirroring. If these options are left unspecified,
> + or 'raid10_copies 2' and/or 'raid10_format near' are given,
> + then the layouts for 2, 3 and 4 devices are:
> + 2 drives 3 drives 4 drives
> + -------- ---------- --------------
> + A1 A1 A1 A1 A2 A1 A1 A2 A2
> + A2 A2 A2 A3 A3 A3 A3 A4 A4
> + A3 A3 A4 A4 A5 A5 A5 A6 A6
> + A4 A4 A5 A6 A6 A7 A7 A8 A8
> + .. .. .. .. .. .. .. .. ..
> + The 2-device layout is equivalent 2-way RAID1. The 4-device
> + layout is what a traditional RAID10 would look like. The
> + 3-device layout is what might be called a 'RAID1E - Integrated
> + Adjacent Stripe Mirroring'.
> +
> + If 'raid10_copies 2' and 'raid10_format far', then the layouts
> + for 2, 3 and 4 devices are:
> + 2 drives 3 drives 4 drives
> + -------- -------------- --------------------
> + A1 A2 A1 A2 A3 A1 A2 A3 A4
> + A3 A4 A4 A5 A6 A5 A6 A7 A8
> + A5 A6 A7 A8 A9 A9 A10 A11 A12
> + .. .. .. .. .. .. .. .. ..
> + A2 A1 A3 A1 A2 A4 A1 A2 A3
> + A4 A3 A6 A4 A5 A8 A5 A6 A7
> + A6 A5 A9 A7 A8 A12 A9 A10 A11

The trick here for 4 drives is to keep the array running even if some 2 drives fail.
Your layout does not do so. Only one drive may fail at any time.

I think a better layout is (for 4 drives)

A1 A2 A3 A4
A5 A6 A7 A8

.................

A2 A1 A4 A3 (Switch in pairs for N=2)
A6 A5 A8 A7

Here all of the drive combinations 1+3, 1+4, 2+3, 2+4 may fail, and the array should
still be running.. 1+2 and 3+4 could not fail without destroying the array.
This would give a 66,7 % chance of the array surviving 2 disk crashes.
That is better than the 0 % that the documented scheme has.

the same scheme could go for all even numbers of N in a raid10,far layout.
consider the drives in pairs, and switch the blocks within a pair.

I think this could be generalized to N-copies: treat every group N drives,
as N copies of the same set of selection of blocks.
Then any N-1 of the disks in the group could fail and the array still
be running. Works then for arrays with straight multiples of N disks.

I am not sure that ordinary raid10 does so, but Neil has indicated so.
I would be grateful if you could check this, and
also test what happens with your code if you have any combination of 2 drives
fail for the 4 drive case.

> +
> + If 'raid10_copies 2' and 'raid10_format offset', then the
> + layouts for 2, 3 and 4 devices are:
> + 2 drives 3 drives 4 drives
> + -------- ------------ -----------------
> + A1 A2 A1 A2 A3 A1 A2 A3 A4
> + A2 A1 A3 A1 A2 A4 A1 A2 A3
> + A3 A4 A4 A5 A6 A5 A6 A7 A8
> + A4 A3 A6 A4 A5 A8 A5 A6 A7
> + A5 A6 A7 A8 A9 A9 A10 A11 A12
> + A6 A5 A9 A7 A8 A12 A9 A10 A11

The same problem here with 2 failing drives (for the 4 drive case).
However I dont see an easy solution to this problem.

> + Here we see layouts closely akin to 'RAID1E - Integrated
> + Offset Stripe Mirroring'.
> +
> + Thanks wikipedia 'Non-standard RAID levels' for the layout
> + figures:
> + http://en.wikipedia.org/wiki/Non-standard_RAID_levels

Wikipedia may be in error wrt. the block orders.

besT regards
Keld

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
 
Old 07-12-2012, 07:00 PM
Brassow Jonathan
 
Default DM RAID: Add support for MD RAID10

Thanks for the suggestion. The documentation is correct, as far as I can tell. What you take issue with is that a higher level of redundancy can be achieved by laying down the copies differently. Neil touched on that in this message:
http://marc.info/?l=linux-raid&m=134136516029779&w=2

When it is available to MD, I'll make it available to dm-raid also.

brassow


On Jul 12, 2012, at 11:22 AM, keld@keldix.com wrote:

> On Wed, Jul 11, 2012 at 08:36:41PM -0500, Jonathan Brassow wrote:
>> + [raid10_copies <# copies>]
>> + [raid10_format <near|far|offset>]
>> + These two options are used to alter the default layout of
>> + a RAID10 configuration. The number of copies is can be
>> + specified, but the default is 2. There are also three
>> + variations to how the copies are laid down - the default
>> + is "near". Near copies are what most people think of with
>> + respect to mirroring. If these options are left unspecified,
>> + or 'raid10_copies 2' and/or 'raid10_format near' are given,
>> + then the layouts for 2, 3 and 4 devices are:
>> + 2 drives 3 drives 4 drives
>> + -------- ---------- --------------
>> + A1 A1 A1 A1 A2 A1 A1 A2 A2
>> + A2 A2 A2 A3 A3 A3 A3 A4 A4
>> + A3 A3 A4 A4 A5 A5 A5 A6 A6
>> + A4 A4 A5 A6 A6 A7 A7 A8 A8
>> + .. .. .. .. .. .. .. .. ..
>> + The 2-device layout is equivalent 2-way RAID1. The 4-device
>> + layout is what a traditional RAID10 would look like. The
>> + 3-device layout is what might be called a 'RAID1E - Integrated
>> + Adjacent Stripe Mirroring'.
>> +
>> + If 'raid10_copies 2' and 'raid10_format far', then the layouts
>> + for 2, 3 and 4 devices are:
>> + 2 drives 3 drives 4 drives
>> + -------- -------------- --------------------
>> + A1 A2 A1 A2 A3 A1 A2 A3 A4
>> + A3 A4 A4 A5 A6 A5 A6 A7 A8
>> + A5 A6 A7 A8 A9 A9 A10 A11 A12
>> + .. .. .. .. .. .. .. .. ..
>> + A2 A1 A3 A1 A2 A4 A1 A2 A3
>> + A4 A3 A6 A4 A5 A8 A5 A6 A7
>> + A6 A5 A9 A7 A8 A12 A9 A10 A11
>
> The trick here for 4 drives is to keep the array running even if some 2 drives fail.
> Your layout does not so so. Only one drive may fail at any time.
>
> I think a better layout is (for 4 drives)
>
> A1 A2 A3 A4
> A5 A6 A7 A8
>
> .................
>
> A2 A1 A4 A3 (Swich in pairs for N=2)
> A6 A5 A8 A7
>
> Here all of the drive combinations 1+3, 1+4, 2+3, 2+4 may fail, and the array should
> still be running.. 1+2 and 3+4 could not fail without destroying the array.
> This would give a 66,7 % chance of the array surviving 2 disk crashes.
> That is better than the 0 % that the documented scheme has.
>
> the same scheme could go for all even numbers of N in a raid10,far layout.
> consider the drives in pairs, and switch the blocks within a pair.
>
> I think this could be generalized to N-copies: treat every group N drives,
> as N copies of the same set of selection of blocks.
> Then any N-1 of the disks in the group could fail and the arry still
> be running. Works then for arrays with straight multipla of N disks .
>
> I am not sure that ordinary raid10 does so, but Neil has indicated so.
> I would be grateful if you could check this, and
> also test what happens with your code if you have any combination of 2 drives
> fail for the 4 drive case.
>
>> +
>> + If 'raid10_copies 2' and 'raid10_format offset', then the
>> + layouts for 2, 3 and 4 devices are:
>> + 2 drives 3 drives 4 drives
>> + -------- ------------ -----------------
>> + A1 A2 A1 A2 A3 A1 A2 A3 A4
>> + A2 A1 A3 A1 A2 A4 A1 A2 A3
>> + A3 A4 A4 A5 A6 A5 A6 A7 A8
>> + A4 A3 A6 A4 A5 A8 A5 A6 A7
>> + A5 A6 A7 A8 A9 A9 A10 A11 A12
>> + A6 A5 A9 A7 A8 A12 A9 A10 A11
>
> The same problem here with 2 failing drives (for the 4 drive case).
> However I dont see an easy solution to this problem.
>
>> + Here we see layouts closely akin to 'RAID1E - Integrated
>> + Offset Stripe Mirroring'.
>> +
>> + Thanks wikipedia 'Non-standard RAID levels' for the layout
>> + figures:
>> + http://en.wikipedia.org/wiki/Non-standard_RAID_levels
>
> Wikipedia may be in error wrt. the block orders.
>
> besT regards
> Keld


--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
 
Old 07-13-2012, 01:15 AM
 
Default DM RAID: Add support for MD RAID10

On Thu, Jul 12, 2012 at 02:00:35PM -0500, Brassow Jonathan wrote:
> Thanks for the suggestion. The documentation is correct, as far as I can tell. What you take issue with is that a higher level of redundancy can be achieved by laying down the copies differently. Neil touched on that in this message:
> http://marc.info/?l=linux-raid&m=134136516029779&w=2

Thanks for the info. Well, I corrected the wikipedia description to the one that I
suggested, as this was more in line with what I understood was the current implementation.
I have missed the email from Neil that you quoted above.
I believe it was me writing up the Wikipedia text anyway, at least I did all of
the initial writeup of the Wikipedia text on raid10.

And then I saw that you were implementing a description on raid10,far that
was less than optimal. That description should not be around as it is a flawed design.
(I did the design of raid10,far).
There should only be one layout for "far". I think when we discussed
the "far" layout initially, we were not aware of the consequences of the
layout then wrt how many disk failures the layout can survive..

I think the layout you described should not be promoted at all,
and only kept for backward compatibility. As there is no backward
compatibility in your case I think it is an error to implement it.
I understand that you do not reuse any of the MD code here?

I hesitate now changing the wikipedia description of MD raid10,far back.
I fear that some implementers would code it as to that spec!
Well, there should probably be something about it, I will write
up something.

The flaw is worse than Neil described, as far as I understand.
With n=2 you can in the current implementation only have 1 disk failing,
for any numbers of drives in the array. With the suggested layout
then for 4 drives you have the probability of surviving 66 %
of 2 drives failing. This gets even better for 6, 8 .. disks in the array.
And you may even survive 3 or more disk failures, dependent on the number
of drives employed. The probability is the same as for raid-1+0

> When it is available to MD, I'll make it available to dm-raid also.

Please dont implement it in the flawed way. It will just create a number of problems
for when to switch over and convert between the two formats, and then which should
be the default (I fear some would say the old flawed should be the default), and we need
to explain the two formats and implement two sets of repairs and so on.

Best regards
Keld

> brassow
>
>
> On Jul 12, 2012, at 11:22 AM, keld@keldix.com wrote:
>
> > On Wed, Jul 11, 2012 at 08:36:41PM -0500, Jonathan Brassow wrote:
> >> + [raid10_copies <# copies>]
> >> + [raid10_format <near|far|offset>]
> >> + These two options are used to alter the default layout of
> >> + a RAID10 configuration. The number of copies is can be
> >> + specified, but the default is 2. There are also three
> >> + variations to how the copies are laid down - the default
> >> + is "near". Near copies are what most people think of with
> >> + respect to mirroring. If these options are left unspecified,
> >> + or 'raid10_copies 2' and/or 'raid10_format near' are given,
> >> + then the layouts for 2, 3 and 4 devices are:
> >> + 2 drives 3 drives 4 drives
> >> + -------- ---------- --------------
> >> + A1 A1 A1 A1 A2 A1 A1 A2 A2
> >> + A2 A2 A2 A3 A3 A3 A3 A4 A4
> >> + A3 A3 A4 A4 A5 A5 A5 A6 A6
> >> + A4 A4 A5 A6 A6 A7 A7 A8 A8
> >> + .. .. .. .. .. .. .. .. ..
> >> + The 2-device layout is equivalent 2-way RAID1. The 4-device
> >> + layout is what a traditional RAID10 would look like. The
> >> + 3-device layout is what might be called a 'RAID1E - Integrated
> >> + Adjacent Stripe Mirroring'.
> >> +
> >> + If 'raid10_copies 2' and 'raid10_format far', then the layouts
> >> + for 2, 3 and 4 devices are:
> >> + 2 drives 3 drives 4 drives
> >> + -------- -------------- --------------------
> >> + A1 A2 A1 A2 A3 A1 A2 A3 A4
> >> + A3 A4 A4 A5 A6 A5 A6 A7 A8
> >> + A5 A6 A7 A8 A9 A9 A10 A11 A12
> >> + .. .. .. .. .. .. .. .. ..
> >> + A2 A1 A3 A1 A2 A4 A1 A2 A3
> >> + A4 A3 A6 A4 A5 A8 A5 A6 A7
> >> + A6 A5 A9 A7 A8 A12 A9 A10 A11
> >
> > The trick here for 4 drives is to keep the array running even if some 2 drives fail.
> > Your layout does not so so. Only one drive may fail at any time.
> >
> > I think a better layout is (for 4 drives)
> >
> > A1 A2 A3 A4
> > A5 A6 A7 A8
> >
> > .................
> >
> > A2 A1 A4 A3 (Swich in pairs for N=2)
> > A6 A5 A8 A7
> >
> > Here all of the drive combinations 1+3, 1+4, 2+3, 2+4 may fail, and the array should
> > still be running.. 1+2 and 3+4 could not fail without destroying the array.
> > This would give a 66,7 % chance of the array surviving 2 disk crashes.
> > That is better than the 0 % that the documented scheme has.
> >
> > the same scheme could go for all even numbers of N in a raid10,far layout.
> > consider the drives in pairs, and switch the blocks within a pair.
> >
> > I think this could be generalized to N-copies: treat every group N drives,
> > as N copies of the same set of selection of blocks.
> > Then any N-1 of the disks in the group could fail and the arry still
> > be running. Works then for arrays with straight multipla of N disks .
> >
> > I am not sure that ordinary raid10 does so, but Neil has indicated so.
> > I would be grateful if you could check this, and
> > also test what happens with your code if you have any combination of 2 drives
> > fail for the 4 drive case.
> >
> >> +
> >> + If 'raid10_copies 2' and 'raid10_format offset', then the
> >> + layouts for 2, 3 and 4 devices are:
> >> + 2 drives 3 drives 4 drives
> >> + -------- ------------ -----------------
> >> + A1 A2 A1 A2 A3 A1 A2 A3 A4
> >> + A2 A1 A3 A1 A2 A4 A1 A2 A3
> >> + A3 A4 A4 A5 A6 A5 A6 A7 A8
> >> + A4 A3 A6 A4 A5 A8 A5 A6 A7
> >> + A5 A6 A7 A8 A9 A9 A10 A11 A12
> >> + A6 A5 A9 A7 A8 A12 A9 A10 A11
> >
> > The same problem here with 2 failing drives (for the 4 drive case).
> > However I dont see an easy solution to this problem.
> >
> >> + Here we see layouts closely akin to 'RAID1E - Integrated
> >> + Offset Stripe Mirroring'.
> >> +
> >> + Thanks wikipedia 'Non-standard RAID levels' for the layout
> >> + figures:
> >> + http://en.wikipedia.org/wiki/Non-standard_RAID_levels
> >
> > Wikipedia may be in error wrt. the block orders.
> >
> > besT regards
> > Keld
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
 
Old 07-13-2012, 01:27 AM
NeilBrown
 
Default DM RAID: Add support for MD RAID10

On Fri, 13 Jul 2012 03:15:05 +0200 keld@keldix.com wrote:

> I think the layout you described should not be promoted at all,
> and only kept for backward compatibility. As there is no backward
> compatibility in your case I think it is an error to implement it.
> I understand that you do not reuse any of the MD code here?

Not correct. The whole point of this exercise is to reuse md code.


> The flaw is worse than Neil described, as far as I understand.
> With n=2 you can in the current implementation only have 1 disk failing,
> for any numbers of drives in the array. With the suggested layout
> then for 4 drives you have the probability of surviving 66 %
> of 2 drives failing. This get even better for 6, 8 .. disks in the array.
> And you may even survive 3 or more disk failures, dependent on the number
> of drives employed. The probability is the same as for raid-1+0

Also not correct. You can certainly have more than one failed device
providing you don't have 'n' adjacent devices all failed.
So e.g. if you have 6 drives in a far-2 layout then you can survive the
failure of three devices if they are 0,2,4 or 1,3,5.


>
> > When it is available to MD, I'll make it available to dm-raid also.
>
> Please dont implement it in the flawed way. It will just create a number of problems
> for when to switch over and convert between the two formats, and then which should
> be the default (I fear some would say the old flawed should be the default), and we need
> to explain the two formats and implement two sets of repairs and so on.

This "flawed" arrangement is the only one that makes sense for an odd number
of devices (assuming 2 copies).

NeilBrown
--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
 
Old 07-13-2012, 08:29 AM
 
Default DM RAID: Add support for MD RAID10

On Fri, Jul 13, 2012 at 11:27:17AM +1000, NeilBrown wrote:
> On Fri, 13 Jul 2012 03:15:05 +0200 keld@keldix.com wrote:
>
> > I think the layout you described should not be promoted at all,
> > and only kept for backward compatibility. As there is no backward
> > compatibility in your case I think it is an error to implement it.
> > I understand that you do not reuse any of the MD code here?
>
> Not correct. The whole point of this exercise is to reuse md code.

OK, I also think it is only sensible to reuse the code already done.
I then misunderstood your mail about not repeating mistakes - which I took to mean that
Brassow should not implement things with mistakes. Maybe it means not to make hooks
to MD code that is a mistake?

So Brassow will implement the improved far layout once there is MD code for it, and
then he can make the necessary hooks in DM code?

> > The flaw is worse than Neil described, as far as I understand.
> > With n=2 you can in the current implementation only have 1 disk failing,
> > for any numbers of drives in the array. With the suggested layout
> > then for 4 drives you have the probability of surviving 66 %
> > of 2 drives failing. This get even better for 6, 8 .. disks in the array.
> > And you may even survive 3 or more disk failures, dependent on the number
> > of drives employed. The probability is the same as for raid-1+0
>
> Also not correct. You can certainly have more than one failed device
> providing you don't have 'n' adjacent devices all failed.
> So e.g. if you have 2 drives in a far-2 layout then you can survive the
> failure of three devices if they are 0,2,4 or 1,3,5.

On further investigations I agree that you can survive more than one drive failing with
the current layout.

> > > When it is available to MD, I'll make it available to dm-raid also.
> >
> > Please dont implement it in the flawed way. It will just create a number of problems
> > for when to switch over and convert between the two formats, and then which should
> > be the default (I fear some would say the old flawed should be the default), and we need
> > to explain the two formats and implement two sets of repairs and so on.
>
> This "flawed" arrangement is the only one that makes sense for an odd number
> of devices (assuming 2 copies).

Well, I have an idea for the odd number of devices:
Have the disks arranged in groups (for N=2 in pairs) and then the last group extended with
the leftover disks in the way it is done now.

For 2 copies, this would be a number of pairs, and then a rest group of 3 disks.
For 3 copies, this would be a number of triplets, and then 4 or 5 disks in the last group.

Can I assume, Neil, that you agree with the rest I wrote? :-)
Especially that we should only advise the new layout, and there is no reason for the
current implementation except for backwards compatibility?

best regards
keld

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
 
Old 07-16-2012, 06:14 AM
NeilBrown
 
Default DM RAID: Add support for MD RAID10

On Fri, 13 Jul 2012 10:29:23 +0200 keld@keldix.com wrote:

> On Fri, Jul 13, 2012 at 11:27:17AM +1000, NeilBrown wrote:
> > On Fri, 13 Jul 2012 03:15:05 +0200 keld@keldix.com wrote:
> >
> > > I think the layout you described should not be promoted at all,
> > > and only kept for backward compatibility. As there is no backward
> > > compatibility in your case I think it is an error to implement it.
> > > I understand that you do not reuse any of the MD code here?
> >
> > Not correct. The whole point of this exercise is to reuse md code.
>
> OK, I also think it is only sensible to reuse the code already done.
> I misunderstood then your mail on not to repeat mistakes - which I took to mean that
> Barrow should not implement things with mistakes. Maybe that means to not make hooks
> to MD code that is a mistake?
>
> So Barrow will implement the improved far layout once there is MD code for it, and
> then he can make the neceessary hooks in DM code?
>
> > > The flaw is worse than Neil described, as far as I understand.
> > > With n=2 you can in the current implementation only have 1 disk failing,
> > > for any numbers of drives in the array. With the suggested layout
> > > then for 4 drives you have the probability of surviving 66 %
> > > of 2 drives failing. This get even better for 6, 8 .. disks in the array.
> > > And you may even survive 3 or more disk failures, dependent on the number
> > > of drives employed. The probability is the same as for raid-1+0
> >
> > Also not correct. You can certainly have more than one failed device
> > providing you don't have 'n' adjacent devices all failed.
> > So e.g. if you have 2 drives in a far-2 layout then you can survive the
> > failure of three devices if they are 0,2,4 or 1,3,5.
>
> On further investigations I agree that you can survive more than one drive failing with
> the current layout.
>
> > > > When it is available to MD, I'll make it available to dm-raid also.
> > >
> > > Please dont implement it in the flawed way. It will just create a number of problems
> > > for when to switch over and convert between the two formats, and then which should
> > > be the default (I fear some would say the old flawed should be the default), and we need
> > > to explain the two formats and implement two sets of repairs and so on.
> >
> > This "flawed" arrangement is the only one that makes sense for an odd number
> > of devices (assuming 2 copies).
>
> Well, I have an idea for the odd number of devices:
> Have the disks arranged in groups (for N=2 in pairs) and then the last group extended with
> the leftover disks in the way it is done now.
>
> For 2 copies, this would be a number of pairs, and then a rest group of 3 disks.
> For 3 copies, this would be a number of triplets, and then 4 or 5 disks in the last group.

Certainly possible, but it feels clumsy. I'm not convinced it is a good idea.

>
> Can I assume, Neil, that you agree with the rest I wrote? :-)

You can assume that I don't strongly disagree...

> Especially that we should only advice the new layout, and there is no reason for the
> current implementation except for backwards compatibility?

The main reason for the current implementation is that it is currently
implemented.
Until an alternate implementation exists, it seems pointless to recommend
that people use it.
Maybe you are suggesting that dmraid should not support raid10-far or
raid10-offset until the "new" approach is implemented.
Maybe that is sensible, but only if someone steps forward and actually
implements the "new" approach.

NeilBrown
--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
 

Thread Tools




All times are GMT. The time now is 10:38 AM.

VBulletin, Copyright ©2000 - 2014, Jelsoft Enterprises Ltd.
Content Relevant URLs by vBSEO ©2007, Crawlability, Inc.
Copyright 2007 - 2008, www.linux-archive.org