Linux Archive

Linux Archive (http://www.linux-archive.org/)
-   Device-mapper Development (http://www.linux-archive.org/device-mapper-development/)
-   -   reworked dm-switch target (http://www.linux-archive.org/device-mapper-development/694428-reworked-dm-switch-target.html)

Mikulas Patocka 08-15-2012 10:36 PM

reworked dm-switch target
 
This is simplified dm-switch target, originally written by Jim Ramsay.

Changes from the original:

Removed netlink interface and added dm message interface to change
mapping table because the message interface is noticeably simpler.
The table is changed by sending dm message:
"dmsetup message <device-name> 0 set-table <commands...>"
The message can have multiple commands, each command has format
"<page>:<device index>" or "<start page>-<end page>:<device index>"
The page or pages in the specified range are remapped to the device with
the given index.
For example "dmsetup message switch 0 set-table 0-15:0 16-31:1 32-33:2"
sets pages 0-15 to device 0, 16-31 to device 1, 32-33 to device 2.

The dm-switch.h file was removed (if the netlink was removed, there is
no need for this file).

Page table is allocated using vmalloc instead of kmalloc. kmalloc
allocates physically contiguous memory and it can fail if memory is
fragmented. vmalloc allocates discontiguous memory and maps it to a
contiguous virtual address range using MMU.

RCU and page table reallocation was removed. The page table is allocated
in the constructor and stays the same for the lifetime of the device.
The page table can be read and modified at the same time, so there is no
need to use RCU.

The page table is initialized with a repetitive pattern that uses all
the devices.

One page table entry has 64-bit size on 64-bit processors and 32-bit
size on 32-bit processors (in the original it was always 32-bit). Making
it 64-bit makes it consume slightly less space in some cases.

Removed dm status:
- ios_remapped/ios_unmapped counting was removed because all the IOs are
mapped when statically allocated page table is used.
- Userspace-supplied numbers that are reported in the status were
removed because it is not clear what they were used for.
- The device list with 'A' statuses was removed (it could be added back
if we implement device error tracking); there was just mock code that
returned 'A' for all devices.

Device limit check was simplified to use i_size_read and fixed to take
account of 'start' value as well.

do_div was replaced with sector_div - if we have 32-bit sectors, we
don't need to do slow 64-bit math.

The divisions were optimized if the divisor is a power of two.

Set dm_set_target_max_io_len. The original code didn't set it, so it
could issue IOs that span page boundaries.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
drivers/md/Kconfig | 11 +
drivers/md/Makefile | 1
drivers/md/dm-switch.c | 419 +++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 431 insertions(+)

Index: linux-3.5.1-fast/drivers/md/Kconfig
================================================== =================
--- linux-3.5.1-fast.orig/drivers/md/Kconfig 2012-08-16 00:29:55.000000000 +0200
+++ linux-3.5.1-fast/drivers/md/Kconfig 2012-08-16 00:30:14.000000000 +0200
@@ -417,4 +417,15 @@ config DM_VERITY2

source "drivers/md/enhanceio/Kconfig"

config DM_SWITCH
	tristate "Switch target support (EXPERIMENTAL)"
	depends on BLK_DEV_DM && EXPERIMENTAL
	---help---
	  This device-mapper target creates a device that supports an
	  arbitrary mapping of fixed-size regions of I/O across a fixed
	  set of paths.  The path used for any specific region can be
	  switched dynamically by sending the target a message.

	  To compile this code as a module, choose M here: the module will
	  be called dm-switch.

	  If unsure, say N.
+
endif # MD
Index: linux-3.5.1-fast/drivers/md/Makefile
================================================== =================
--- linux-3.5.1-fast.orig/drivers/md/Makefile 2012-08-16 00:29:55.000000000 +0200
+++ linux-3.5.1-fast/drivers/md/Makefile 2012-08-16 00:30:14.000000000 +0200
@@ -48,6 +48,7 @@ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm
obj-$(CONFIG_DM_VERITY) += dm-verity.o
obj-$(CONFIG_DM_ZEROED) += dm-zeroed.o
obj-$(CONFIG_DM_ENHANCEIO) += enhanceio/
+obj-$(CONFIG_DM_SWITCH) += dm-switch.o

ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
Index: linux-3.5.1-fast/drivers/md/dm-switch.c
================================================== =================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-3.5.1-fast/drivers/md/dm-switch.c 2012-08-16 00:35:03.000000000 +0200
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2010-2011 by Dell, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ *
+ * Description:
+ *
+ * file: dm-switch.c
+ * authors: Kevin_OKelley@dell.com
+ * Jim_Ramsay@dell.com
+ * Narendran_Ganapathy@dell.com
+ * mpatocka@redhat.com
+ *
+ * This file implements a "switch" target which efficiently implements a
+ * mapping of IOs to underlying block devices in scenarios where there are:
+ * (1) a large number of address regions
+ * (2) a fixed size equal across all address regions
+ * (3) no pattern that allows for a compact description with something like
+ * the dm-stripe target.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "switch"
+
/*
 * Switch device context block: A new one is created for each dm device.
 * Contains an array of devices from which we have taken references.
 */
struct switch_dev {
	struct dm_dev *dmdev;	/* underlying device; reference held via dm_get_device() */
	sector_t start;		/* start offset within dmdev, in 512B sectors */
};

/*
 * One word of the packed page table.  Entries of pte_size bits each are
 * packed into pt_entry words, so entries are 64-bit wide on 64-bit kernels
 * and 32-bit wide on 32-bit kernels.
 */
typedef unsigned long pt_entry;

/* Switch context header */
struct switch_ctx {
	unsigned dev_count;		/* Number of devices */
	unsigned page_size;		/* Page size in 512B sectors */
	unsigned long n_pages;		/* Number of pages */
	signed char page_size_bits;	/* log2 of page_size or -1 */

	unsigned char pte_size;		/* Page table entry size in bits */
	unsigned char pte_fields;	/* Number of entries per pt_entry */
	signed char pte_fields_bits;	/* log2 of pte_fields or -1 */
	pt_entry *page_table;		/* Page table */

	/* Array of dm devices to switch between; flexible trailing array */
	struct switch_dev dev_list[0];
};
+
+static inline void switch_get_position(struct switch_ctx *pctx,
+ unsigned long page,
+ unsigned long *index,
+ unsigned *bit)
+
+{
+ if (pctx->pte_fields_bits >= 0) {
+ *index = page >> pctx->pte_fields_bits;
+ *bit = page & (pctx->pte_fields - 1);
+ } else {
+ *index = page / pctx->pte_fields;
+ *bit = page % pctx->pte_fields;
+ }
+ *bit *= pctx->pte_size;
+
+}
+
+static void switch_page_table_write(struct switch_ctx *pctx, unsigned long page,
+ unsigned value)
+{
+ unsigned long index;
+ unsigned bit;
+ pt_entry pte;
+
+ switch_get_position(pctx, page, &index, &bit);
+
+ pte = pctx->page_table[index];
+ pte &= ~((((pt_entry)1 << pctx->pte_size) - 1) << bit);
+ pte |= (pt_entry)value << bit;
+ pctx->page_table[index] = pte;
+}
+
+/*
+ * Constructor: Called each time a dmsetup command creates a dm device. The
+ * target parameter will already have the table, type, begin and len fields
+ * filled in. Arguments are in pairs: <dev_path> <offset>. Therefore, we get
+ * multiple constructor calls, but we will need to build a list of switch_ctx
+ * blocks so that the page table information gets matched to the correct
+ * device.
+ */
+static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ unsigned a;
+ int n;
+ int r;
+ unsigned dev_count;
+ struct switch_ctx *pctx;
+ sector_t dev_size;
+ unsigned long e;
+
+ if (argc < 4) {
+ ti->error = "Insufficient arguments";
+ r = -EINVAL;
+ goto error;
+ }
+ if (kstrtouint(argv[0], 10, &dev_count) ||
+ !dev_count ||
+ dev_count > (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_dev)) {
+ ti->error = "Invalid device count";
+ r = -EINVAL;
+ goto error;
+ }
+ if (dev_count != (argc - 2) / 2) {
+ ti->error = "Invalid argument count";
+ r = -EINVAL;
+ goto error;
+ }
+ pctx = kmalloc(sizeof(struct switch_ctx) + (dev_count * sizeof(struct switch_dev)),
+ GFP_KERNEL);
+ if (!pctx) {
+ ti->error = "Cannot allocate redirect context";
+ r = -ENOMEM;
+ goto error;
+ }
+ pctx->dev_count = dev_count;
+ if (kstrtouint(argv[1], 10, &pctx->page_size) ||
+ !pctx->page_size) {
+ ti->error = "Invalid page size";
+ r = -EINVAL;
+ goto error_kfree;
+ }
+
+ if (!(pctx->page_size & (pctx->page_size - 1)))
+ pctx->page_size_bits = __ffs(pctx->page_size);
+ else
+ pctx->page_size_bits = -1;
+
+ pctx->pte_size = 1;
+ while (pctx->pte_size < sizeof(pt_entry) * 8 &&
+ (pt_entry)1 << pctx->pte_size < pctx->dev_count)
+ pctx->pte_size++;
+
+ pctx->pte_fields = (sizeof(pt_entry) * 8) / pctx->pte_size;
+ if (!(pctx->pte_fields & (pctx->pte_fields - 1)))
+ pctx->pte_fields_bits = __ffs(pctx->pte_fields);
+ else
+ pctx->pte_fields_bits = -1;
+
+ dev_size = ti->len;
+ if (sector_div(dev_size, pctx->page_size))
+ dev_size++;
+
+ pctx->n_pages = dev_size;
+ if (pctx->n_pages != dev_size || pctx->n_pages >= ULONG_MAX) {
+ ti->error = "Too long page table";
+ r = -EINVAL;
+ goto error_kfree;
+ }
+
+ if (sector_div(dev_size, pctx->pte_fields))
+ dev_size++;
+
+ if (dev_size > ULONG_MAX / sizeof(pt_entry)) {
+ ti->error = "Too long page table";
+ r = -EINVAL;
+ goto error_kfree;
+ }
+
+ r = dm_set_target_max_io_len(ti, pctx->page_size);
+ if (r)
+ goto error_kfree;
+
+ pctx->page_table = vmalloc(dev_size * sizeof(pt_entry));
+ if (!pctx->page_table) {
+ ti->error = "Cannot allocate page table";
+ r = -ENOMEM;
+ goto error_kfree;
+ }
+
+ a = 0;
+ for (e = 0; e < pctx->n_pages; e++) {
+ switch_page_table_write(pctx, e, a);
+ a++;
+ if (a >= pctx->dev_count)
+ a = 0;
+ }
+
+ /*
+ * Check each device beneath the target to ensure that the limits are
+ * consistent.
+ */
+ for (n = 0, a = 2; n < pctx->dev_count; n++, a += 2) {
+ struct dm_dev *dm;
+ sector_t dev_size;
+ unsigned long long start;
+
+ if (kstrtoull(argv[a + 1], 10, &start) ||
+ start != (sector_t)start) {
+ ti->error = "Invalid device starting offset";
+ r = -EINVAL;
+ n--;
+ goto error_release_n;
+ }
+ r = dm_get_device
+ (ti, argv[a], dm_table_get_mode(ti->table), &dm);
+ if (r) {
+ ti->error = "Device lookup failed";
+ n--;
+ goto error_release_n;
+ }
+ pctx->dev_list[n].dmdev = dm;
+ pctx->dev_list[n].start = start;
+
+ dev_size = i_size_read(dm->bdev->bd_inode) >> SECTOR_SHIFT;
+
+ if (ti->len > start + dev_size) {
+ ti->error = "Device is too small";
+ r = -EINVAL;
+ goto error_release_n;
+ }
+ }
+
+ ti->private = pctx;
+
+ return 0;
+
+error_release_n: /* De-reference all devices */
+ for (; n >= 0; n--)
+ dm_put_device(ti, pctx->dev_list[n].dmdev);
+
+ vfree(pctx->page_table);
+error_kfree:
+ kfree(pctx);
+
+error:
+ return r;
+}
+
+/*
+ * Destructor: Don't free the dm_target, just the ti->private data (if any).
+ */
+static void switch_dtr(struct dm_target *ti)
+{
+ int n;
+ struct switch_ctx *pctx = ti->private;
+
+ for (n = 0; n < pctx->dev_count; n++)
+ dm_put_device(ti, pctx->dev_list[n].dmdev);
+
+ vfree(pctx->page_table);
+ kfree(pctx);
+}
+
+static int switch_map(struct dm_target *ti, struct bio *bio,
+ union map_info *map_context)
+{
+ struct switch_ctx *pctx = ti->private;
+
+ sector_t offset = bio->bi_sector - ti->begin;
+ sector_t p;
+ unsigned long index;
+ unsigned bit, idev;
+
+ p = offset;
+ if (pctx->page_size_bits >= 0)
+ p >>= pctx->page_size_bits;
+ else
+ sector_div(p, pctx->page_size);
+
+ switch_get_position(pctx, p, &index, &bit);
+
+ idev = (ACCESS_ONCE(pctx->page_table[index]) >> bit) & ((1 << pctx->pte_size) - 1);
+ /* This can only happen if the processor uses non-atomic stores. */
+ if (unlikely(idev >= pctx->dev_count))
+ idev = 0;
+
+ bio->bi_bdev = pctx->dev_list[idev].dmdev->bdev;
+ bio->bi_sector = pctx->dev_list[idev].start + offset;
+
+ return DM_MAPIO_REMAPPED;
+}
+
/*
 * Message handler.  The only supported message is:
 *	set-table <page>:<device> | <from>-<to>:<device> ...
 * which remaps the given page or page range to the given device index.
 * Writers are serialized by message_mutex; switch_map reads concurrently
 * and sees either the old or the new entry.
 */
static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
{
	static DEFINE_MUTEX(message_mutex);

	struct switch_ctx *pctx = ti->private;
	int r;

	mutex_lock(&message_mutex);

	if (!argc) {
		goto invalid_message;
	} else if (!strcasecmp(argv[0], "set-table")) {
		unsigned i;
		for (i = 1; i < argc; i++) {
			unsigned long long from, to;
			unsigned device;
			char dummy;
			/* The trailing %c must NOT match, rejecting junk after the spec. */
			if (sscanf(argv[i], "%llu-%llu:%u%c", &from, &to, &device, &dummy) == 3)
				goto do_set_table;
			if (sscanf(argv[i], "%llu:%u%c", &from, &device, &dummy) == 2) {
				to = from;	/* single page: degenerate range */
				goto do_set_table;
			}
			DMWARN("invalid set-table argument");
			r = -EINVAL;
			goto ret;
do_set_table:
			/* Validate range and device index before touching the table. */
			if (from > to || to >= pctx->n_pages) {
				DMWARN("invalid set-table page");
				r = -EINVAL;
				goto ret;
			}
			if (device >= pctx->dev_count) {
				DMWARN("invalid set-table device");
				r = -EINVAL;
				goto ret;
			}
			for (; from <= to; from++)
				switch_page_table_write(pctx, from, device);
		}
		r = 0;
	} else {
invalid_message:
		DMWARN("unrecognised message received.");
		r = -EINVAL;
	}
ret:
	mutex_unlock(&message_mutex);
	return r;
}
+
+static int switch_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ struct switch_ctx *pctx = ti->private;
+ unsigned sz = 0;
+ int n;
+
+ result[0] = '';
+ switch (type) {
+ case STATUSTYPE_INFO:
+ result[0] = 0;
+ break;
+
+ case STATUSTYPE_TABLE:
+ DMEMIT("%u %u", pctx->dev_count, pctx->page_size);
+ for (n = 0; n < pctx->dev_count; n++) {
+ DMEMIT(" %s %llu", pctx->dev_list[n].dmdev->name,
+ (unsigned long long)pctx->dev_list[n].start);
+ }
+ break;
+
+ default:
+ return 0;
+ }
+ return 0;
+}
+
+/*
+ * Switch ioctl:
+ *
+ * Passthrough all ioctls to the first path.
+ */
+static int switch_ioctl(struct dm_target *ti, unsigned cmd,
+ unsigned long arg)
+{
+ struct switch_ctx *pctx = ti->private;
+ struct block_device *bdev;
+ fmode_t mode;
+
+ bdev = pctx->dev_list[0].dmdev->bdev;
+ mode = pctx->dev_list[0].dmdev->mode;
+
+ return __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+}
+
/* Target registration record: wires the switch callbacks into dm core. */
static struct target_type switch_target = {
	.name = "switch",
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.ctr = switch_ctr,
	.dtr = switch_dtr,
	.map = switch_map,		/* per-bio page table lookup */
	.message = switch_message,	/* "set-table" page table updates */
	.status = switch_status,
	.ioctl = switch_ioctl,		/* passthrough to first device */
};
+
+int __init dm_switch_init(void)
+{
+ int r;
+
+ r = dm_register_target(&switch_target);
+ if (r) {
+ DMERR("dm_register_target() failed %d", r);
+ return r;
+ }
+
+ return 0;
+}
+
+void dm_switch_exit(void)
+{
+ dm_unregister_target(&switch_target);
+}
+
+module_init(dm_switch_init);
+module_exit(dm_switch_exit);
+
+MODULE_DESCRIPTION(DM_NAME " fixed-size address-region-mapping throughput-oriented path selector");
+MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
+MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
+MODULE_LICENSE("GPL");

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

Alasdair G Kergon 08-16-2012 07:09 AM

reworked dm-switch target
 
On Wed, Aug 15, 2012 at 06:36:55PM -0400, Mikulas Patocka wrote:
> For example "dmsetup message switch 0 set-table 0-15:0 16-31:1 32-33:2"

What I think we need to know now is how large this makes a typical message to
set up the table. Set up can be split between several messages.
Or is typical data sufficiently random and large that a more-compact
representation might be a better idea?

Alasdair

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

08-17-2012 02:18 PM

reworked dm-switch target
 
Hi Alasdair and Mikulas. It's great to hear this feedback, thanks!

We had a few comments about the latest changes:

1) Uploading large page tables

As Alasdair mentioned, a more compact method of sending the page table will be necessary. Consider a volume with a page table that consists of 1572864 entries in total. On our storage solution, the pages are spread out among different group members (i.e. underlying DM devices) and it is uncommon to see long stretches mapping to the same device. The reason for this is similar to the principle of striping, attempting to maximize the chance of simultaneous accesses on multiple group members. This means that there is little gain in having a message format that allows a contiguous range of pages to be set to the same value.

Assuming a fairly well-distributed layout of 1572864 pages where 50% of the pages are different every other page, 20% are different every 2 pages, 10% every 5 pages, 10% every 10 pages, and 10% every 20 pages, this would leave us with a dmsetup message with argc=998768

dmsetup message switch 0 set-table 0-0:1 1-1:0 2-2:2 3-3:1 4-4:0 5-5:2 6-6:0 7-8:1 9-15:2 16-16:1 ... (plus almost 1000000 more arguments...)

We agree that using 'dmsetup message' is a lot cleaner than using a netlink socket in a number of ways, but the fact that it's argc/argv space-delimited shell-parsed data makes it more difficult to send large amounts of binary data like the bit-compressed page table. We would be fine leaving in the proposed syntax for setting specific pages, as it may be useful to others and in small device testing scenarios, but an additional mechanism to upload larger chunks of binary data all at once would be important for our use of the device.

Perhaps we can work with you on designing alternate non-netlink mechanism to achieve the same goal... A sysfs file per DM device for userland processes to do direct I/O with? Base64-encoding larger chunks of the binary page tables and passing those values through 'dmsetup message'?

2) vmalloc and TLB performance

Having a (virtually) contiguous memory range certainly simplifies the allocation and lookup algorithms, but what about the concerns about vmalloc() that are summarized nicely in Documentation/flexible-arrays.txt:

"Large contiguous memory allocations can be unreliable in the Linux kernel.
Kernel programmers will sometimes respond to this problem by allocating
pages with vmalloc(). This solution not ideal, though. On 32-bit systems,
memory from vmalloc() must be mapped into a relatively small address space;
it's easy to run out. On SMP systems, the page table changes required by
vmalloc() allocations can require expensive cross-processor interrupts on
all CPUs. And, on all systems, use of space in the vmalloc() range
increases pressure on the translation lookaside buffer (TLB), reducing the
performance of the system."

The page table lookup is in the I/O path, so performance is an important consideration. Do you have any performance comparisons between our existing 2-level lookup of kmalloc'd memory versus a single vmalloc'd memory lookup? Multiple devices of similarly large table size may be in use simultaneously, so this needs consideration as well.

Also, in the example above with 1572864 page table entries, assuming 2 bits per entry requires a table of 384KB. Would this be a problem for the vmalloc system, especially on 32-bit systems, if there are multiple devices of similarly large size in use at the same time?

It can also be desirable to allow sparsely-populated page tables, when it is known that large chunks are not needed or deemed (by external logic) not important enough to consume kernel memory. A 2-level kmalloc'd memory scheme can save memory in sparsely-allocated situations.

3) Userland values and counters

The "user saved" values are useful for debugging purposes. For example, we had been using the 0th field for a timestamp so it's easy to manually validate when the last page table upload succeeded, and the other field for a count of the number of page table entries uploaded so far, but these could be used for other checkpointing or checksumming by userland processes.

Also, while we had not yet implemented a mechanism to retrieve the per-chunk hit counters, this would be valuable to have for a userland process to decide which chunks of the page table are "hot" for a sparsely-populated situation.

4) num_discard_requests, num_flush_requests, and iterate_devices

I have a slightly updated version of driver that implements these DM target features as well. I was actually preparing to submit the changes to this list when this conversation began, and will be doing so shortly.

--
Jim Ramsay

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

Mikulas Patocka 08-20-2012 07:20 PM

reworked dm-switch target
 
Hi

On Fri, 17 Aug 2012, Jim_Ramsay@DELL.com wrote:

> Hi Alasdair and Mikulas. It's great to hear this feedback, thanks!
>
> We had a few comments about the latest changes:
>
> 1) Uploading large page tables
>
> As Alasdair mentioned, a more compact method of sending the page table
> will be necessary. Consider a volume with a page table that consists of
> 1572864 entries in total. On our storage solution, the pages are spread
> out among different group members (i.e. underlying DM devices) and it is
> uncommon to see long stretches mapping to the same device. The reason for
> this is similar to the principle of striping, attempting to maximize the
> chance of simultaneous accesses on multiple group members. This means
> that there is little gain in having a message format that allows a
> contiguous range of pages to be set to the same value.
>
> Assuming a fairly well-distributed layout of 1572864 pages where 50% of
> the pages are different every other page, 20% are different every 2 pages,
> 10% every 5 pages, 10% every 10 pages, and 10% every 20 pages, this would
> leave us with a dmsetup message with argc=998768
>
> dmsetup message switch 0 set-table 0-0:1 1-1:0 2-2:2 3-3:1 4-4:0 5-5:2 6-6:0 7-8:1 9-15:2 16-16:1 ... (plus almost 1000000 more arguments...)

You don't have to use the dash, you can send:
dmsetup message switch 0 set-table 0:1 1:0 2:2 3:1 4:0 ... etc.

You don't have to send the whole table at once in one message. Using
message with 998768 arguments is bad (it can trigger allocation failures
in the kernel).

But you can split the initial table load into several messages, each
having up to 4096 bytes, so that it fits into a single page.

> We agree that using 'dmestup message' is a lot cleaner than using a
> netlink socket in a number of ways, but the fact that it's argc/argv
> space-delimited shell-parsed data makes it more difficult to send large
> amounts of binary data like the bit-compressed page table. We would be
> fine leaving in the proposed syntax for setting specific pages, as it may
> be useful to others and in small device testing scenarios, but an
> additional mechanism to upload larger chunks of binary data all at once
> would be important for our use of the device.
>
> Perhaps we can work with you on designing alternate non-netlink mechanism
> to achieve the same goal... A sysfs file per DM device for userland
> processes to do direct I/O with? Base64-encoding larger chunks of the
> binary page tables and passing those values through 'dmsetup message'?

As I said, you don't have to upload the whole table with one message ...
or if you really need to update the whole table at once, explain why.

> 2) vmalloc and TLB performance
>
> Having a (virtually) contiguous memory range certainly simplifies the
> allocation and lookup algorithms, but what about the concerns about
> vmalloc() that are summarized nicely in Documentation/flexible-arrays.txt:
>
> "Large contiguous memory allocations can be unreliable in the Linux kernel.
> Kernel programmers will sometimes respond to this problem by allocating
> pages with vmalloc(). This solution not ideal, though. On 32-bit systems,
> memory from vmalloc() must be mapped into a relatively small address space;
> it's easy to run out.

The original code uses a simple kmalloc to allocate the whole table.

The maximum size allocatable with kmalloc is 4MB.

The minimum vmalloc arena is 128MB (on x86) - so the switch from kmalloc
to vmalloc makes it no worse.

> On SMP systems, the page table changes required by
> vmalloc() allocations can require expensive cross-processor interrupts on
> all CPUs.

vmalloc is used only once when the target is loaded, so performance is not
an issue here.

> And, on all systems, use of space in the vmalloc() range
> increases pressure on the translation lookaside buffer (TLB), reducing the
> performance of the system."
>
> The page table lookup is in the I/O path, so performance is an important
> consideration. Do you have any performance comparisons between our
> existing 2-level lookup of kmalloc'd memory versus a single vmalloc'd

There was just 1-level lookup in the original dm-switch patch. Did you add
2-level lookup recently?

> memory lookup? Multiple devices of similarly large table size may be in
> use simultaneously, so this needs consideration as well.
>
> Also, in the example above with 1572864 page table entries, assuming 2
> bits per entry requires a table of 384KB. Would this be a problem for the
> vmalloc system, especially on 32-bit systems, if there are multiple
> devices of similarly large size in use at the same time?

384KB is not a problem, the whole vmalloc space has 128MB.

> It can also be desirable to allow sparsely-populated page tables, when it
> is known that large chunks are not needed or deemed (by external logic)
> not important enough to consume kernel memory. A 2-level kmalloc'd memory
> scheme can save memory in sparsely-allocated situations.

> 3) Userland values and counters
>
> The "user saved" values are useful for debugging purposes. For example,
> we had been using the 0th field for a timestamp so it's easy to manually
> validate when the last page table upload succeeded, and the other field
> for a count of the number of page table entries uploaded so far, but these
> could be used for other checkpointing or checksumming by userland
> processes.
>
> Also, while we had not yet implemented a mechanism to retrieve the
> per-chunk hit counters, this would be valuable to have for a userland
> process to decide which chunks of the page table are "hot" for a
> sparsely-populated situation.
>
> 4) num_discard_requests, num_flush_requests, and iterate_devices
>
> I have a slightly updated version of driver that implements these DM
> target features as well. I was actually preparing to submit the changes
> to this list when this conversation began, and will be doing so shortly.
>
> --
> Jim Ramsay

Mikulas

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

Alasdair G Kergon 08-20-2012 08:48 PM

reworked dm-switch target
 
On Fri, Aug 17, 2012 at 02:18:15PM +0000, Jim_Ramsay@dell.com wrote:
> dmsetup message switch 0 set-table 0-0:1 1-1:0 2-2:2 3-3:1 4-4:0 5-5:2 6-6:0 7-8:1 9-15:2 16-16:1 ... (plus almost 1000000 more arguments...)

Or 0,3,16:1 1,4:0 2,5:2 6:0 -8:1 -15:2
with three short-hands:
0-0 -> 0
6:0 7-8:1 -> 6:0 -8:1 (missing start of range assumes continues from last one)
0:0 3:0 -> 0,3:0 (list)

> an additional mechanism to upload larger chunks
> of binary data all at once would be important for our use of the device.

The message mechanism could probably be extended to accept blobs of binary data
if parsing so many numbers turns out to be too inefficient. Now we have an
example to work with, we can check the speed.

Alasdair

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

Jim Ramsay 08-21-2012 04:33 PM

reworked dm-switch target
 
On Mon, Aug 20, 2012 at 03:20:42PM -0400, Mikulas Patocka wrote:
> On Fri, 17 Aug 2012, Jim_Ramsay@DELL.com wrote:
> > 1) Uploading large page tables
<snip>
> > Assuming a fairly well-distributed layout of 1572864 pages where 50% of
> > the pages are different every other page, 20% are different every 2 pages,
> > 10% every 5 pages, 10% every 10 pages, and 10% every 20 pages, this would
> > leave us with a dmsetup message with argc=998768
> >
> > dmsetup message switch 0 set-table 0-0:1 1-1:0 2-2:2 3-3:1 4-4:0 5-5:2 6-6:0 7-8:1 9-15:2 16-16:1 ... (plus almost 1000000 more arguments...)
>
> You don't have to use the dash, you can send:
> dmsetup message switch 0 set-table 0:1 1:0 2:2 3:1 4:0 ... etc.
>
> You don't have to send the whole table at once in one message. Using
> message with 998768 arguments is bad (it can trigger allocation failures
> in the kernel).
>
> But you can split the initial table load into several messages, each
> having up to 4096 bytes, so that it fits into a single page.

Even removing the '-' for single-page sets, you're looking at having to
send 4 bytes minimum per page (and as the index of the page you're
indexing increases significantly, it takes many more bytes to represent
a page), which means that each 4096-byte run would have maybe 1000 page
table entries in it at most.

This would mean that to upload an entire page table for my example
volume, we would have to run 'dmsetup message ...' almost 1000 times.

I'm sure we can come up with other syntactical shortcuts like those
Alasdair came up with, but encoding into any ascii format will always be
less space-efficient than a pure binary transfer.

> > Perhaps we can work with you on designing alternate non-netlink mechanism
> > to achieve the same goal... A sysfs file per DM device for userland
> > processes to do direct I/O with? Base64-encoding larger chunks of the
> > binary page tables and passing those values through 'dmsetup message'?
>
> As I said, you don't have to upload the whole table with one message ...
> or if you really need to update the whole table at once, explain why.

At the very least, we would need to update the whole page table in the
following scenarios:

1) When we first learn the geometry of the volume

2) When the volume layout changes significantly (for example, if it was
previously represented by 2 devices and is then later moved onto 3
devices, or the underlying LUN is resized)

3) When the protocol used to fetch the data can fetch segments of the
page table in a dense binary format, it is considerably more work
for a userland process to keep its own persistent copy of the
page table, compare a new version with the old version, calculate
the differences, and send only those differences. It is much
simpler to have a binary conduit to upload the entire table at
once, provided it does not occur too frequently.

Furthermore, if a userland process already has an internal binary
representation of a page map, what is the value in converting this into
a complicated human-readable ascii representation then having the kernel
do the opposite de-conversion when it receives the data?

> > 2) vmalloc and TLB performance
<snip>

> The original code uses a simple kmalloc to allocate the whole table.
>
> The maximum size allocatable with kmalloc is 4MB.
>
> The minimum vmalloc arena is 128MB (on x86) - so the switch from kmalloc
> to vmalloc makes it no worse.
>
> > On SMP systems, the page table changes required by
> > vmalloc() allocations can require expensive cross-processor interrupts on
> > all CPUs.
>
> vmalloc is used only once when the target is loaded, so performance is not
> an issue here.

The table would also have to be reallocated on LUN resize or if the data
is moved to be across a different number of devices (provided the change
is such that it causes the number of bits-per-page to be changed), such
as if you had a 2-device setup represented by 1-bit-per-page change to a
3-device setup represented by 2-bit-per-page.

Granted these are not frequent operations, but we need to continue to
properly handle these cases.

We also need to keep the multiple device scenario in mind (perhaps 100s of
targets in use or being created simultaneously).

> > And, on all systems, use of space in the vmalloc() range
> > increases pressure on the translation lookaside buffer (TLB), reducing the
> > performance of the system."
> >
> > The page table lookup is in the I/O path, so performance is an important
> > consideration. Do you have any performance comparisons between our
> > existing 2-level lookup of kmalloc'd memory versus a single vmalloc'd
>
> There was just 1-level lookup in the original dm-switch patch. Did you add
> 2-level lookup recently?

In October 2011 I posted a 'v3' version of our driver to the dm-devel
list that did this 2-stage lookup to the dm-devel list:

http://www.redhat.com/archives/dm-devel/2011-October/msg00109.html

The main consideration was to avoid single large kmalloc allocations,
but to also support sparse allocations in the future.

> > memory lookup? Multiple devices of similarly large table size may be in
> > use simultaneously, so this needs consideration as well.
> >
> > Also, in the example above with 1572864 page table entries, assuming 2
> > bits per entry requires a table of 384KB. Would this be a problem for the
> > vmalloc system, especially on 32-bit systems, if there are multiple
> > devices of similarly large size in use at the same time?
>
> 384KB is not a problem, the whole vmalloc space has 128MB.

This means we could allow ~375 similarly-sized devices in the system,
assuming no other kernel objects are consuming any vmalloc space. This
could be okay, provided our performance considerations are also
addressed, but allowing sparse allocation may be a good enough reason
to use a 2-level allocation scheme.

> > It can also be desirable to allow sparsely-populated page tables, when it
> > is known that large chunks are not needed or deemed (by external logic)
> > not important enough to consume kernel memory. A 2-level kmalloc'd memory
> > scheme can save memory in sparsely-allocated situations.

This ability to do sparse allocations may be important depending on what
else is going on in the kernel and using vmalloc space.

Thanks for your comments, and I do hope to send our 'v4' driver code as
well as a demonstration application with the netlink socket interface to
this list in the very near future.

--
Jim Ramsay

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

Alasdair G Kergon 08-21-2012 06:14 PM

reworked dm-switch target
 
Well my order of preference for the interface at the moment would be:
ioctl: ASCII dm messages
ioctl: binary dm messages (i.e. the message content is treated as a binary blob)
mmap: lookups in shared memory (lockless preferably)
netlink

Alasdair

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

Mikulas Patocka 08-22-2012 01:02 AM

reworked dm-switch target
 
On Tue, 21 Aug 2012, Jim Ramsay wrote:

> On Mon, Aug 20, 2012 at 03:20:42PM -0400, Mikulas Patocka wrote:
> > On Fri, 17 Aug 2012, Jim_Ramsay@DELL.com wrote:
> > > 1) Uploading large page tables
> <snip>
> > > Assuming a fairly well-distributed layout of 1572864 pages where 50% of
> > > the pages are different every other page, 20% are different every 2 pages,
> > > 10% every 5 pages, 10% every 10 pages, and 10% every 20 pages, this would
> > > leave us with a dmsetup message with argc=998768
> > >
> > > dmsetup message switch 0 set-table 0-0:1 1-1:0 2-2:2 3-3:1 4-4:0 5-5:2 6-6:0 7-8:1 9-15:2 16-16:1 ... (plus almost 1000000 more arguments...)
> >
> > You don't have to use the dash, you can send:
> > dmsetup message switch 0 set-table 0:1 1:0 2:2 3:1 4:0 ... etc.
> >
> > You don't have to send the whole table at once in one message. Using
> > message with 998768 arguments is bad (it can trigger allocation failures
> > in the kernel).
> >
> > But you can split the initial table load into several messages, each
> > having up to 4096 bytes, so that it fits into a single page.
>
> Even removing the '-' for single-page sets, you're looking at having to
> send 4 bytes minimum per page (and as the index of the page you're
> indexing increases significantly, it takes many more bytes to represent
> a page), which means that each 4096-byte run would have maybe 1000 page
> table entries in it at most.
>
> This would mean that to upload an entire page table for my example
> volume, we would have to run 'dmsetup message ...' almost 1000 times.
>
> I'm sure we can come up with other syntactical shortcuts like those
> Alasdair came up with, but encoding into any ascii format will always be
> less space-efficient than a pure binary transfer.

I converted the format to use hexadecimal numbers (they are faster to
produce and faster to parse) and made an option to omit the page number
(in this case, the previous page plus one is used) - and it takes 0.05s
to load a table with one million entries on 2.3GHz Opteron.

The table is loaded with 67 dm message calls, each having 45000 bytes
(the number 45000 was experimentally found to be near the optimum).

So I don't think there are performance problems with this.

I'll send you the program that updates the table with messages.

> > > Perhaps we can work with you on designing alternate non-netlink mechanism
> > > to achieve the same goal... A sysfs file per DM device for userland
> > > processes to do direct I/O with? Base64-encoding larger chunks of the
> > > binary page tables and passing those values through 'dmsetup message'?
> >
> > As I said, you don't have to upload the whole table with one message ...
> > or if you really need to update the whole table at once, explain why.
>
> At the very least, we would need to update the whole page table in the
> following scenarios:
>
> 1) When we first learn the geometry of the volume
>
> 2) When the volume layout changes significantly (for example, if it was
> previously represented by 2 devices and is then later moved onto 3
> devices, or the underlying LUN is resized)
>
> 3) When the protocol used to fetch the data can fetch segments of the
> page table in a dense binary format, it is considerably more work
> for a userland processes to keep its own persistent copy of the
> page table, compare a new version with the old version, calculate
> the differences, and send only those differences. It is much
> simpler to have a binary conduit to upload the entire table at
> once, provided it does not occur too frequently.

But you don't have to upload the table at once - you can upload the table
incrementally with several dm messages.

> Furthermore, if a userland process already has an internal binary
> representation of a page map, what is the value in converting this into
> a complicated human-readable ascii representation then having the kernel
> do the opposite de-conversion when it receives the data?

The reason is simplicity - the dm message code is noticeably smaller than
the netlink code. It is also less bug-prone because no structures are
allocated or freed there.

> > > 2) vmalloc and TLB performance
> <snip>
>
> > The original code uses a simple kmalloc to allocate the whole table.
> >
> > The maximum size allocatable with kmalloc is 4MB.
> >
> > The minimum vmalloc arena is 128MB (on x86) - so the switch from kmalloc
> > to vmalloc makes it no worse.
> >
> > > On SMP systems, the page table changes required by
> > > vmalloc() allocations can require expensive cross-processor interrupts on
> > > all CPUs.
> >
> > vmalloc is used only once when the target is loaded, so performance is not
> > an issue here.
>
> The table would also have to be reallocated on LUN resize or if the data
> is moved to be across a different number of devices (provided the change
> is such that it causes the number of bits-per-page to be changed), such
> as if you had a 2-device setup represented by 1-bit-per-page change to a
> 3-device setup represented by 2-bit-per-page.
>
> Granted these are not frequent operations, but we need to continue to
> properly handle these cases.
>
> We also need to keep the multiple device scenario in mind (perhaps 100s of
> targets in use or being created simultaneously).

For these operations (resizing the device or changing the number of
underlying devices), you can load a new table, suspend the device and
resume the device. It will switch to the new table and destroy the old
one.

You have to reload the table anyway if you change device size, so there is
no need to include code to change table size in the target driver.

> > > And, on all systems, use of space in the vmalloc() range
> > > increases pressure on the translation lookaside buffer (TLB), reducing the
> > > performance of the system."
> > >
> > > The page table lookup is in the I/O path, so performance is an important
> > > consideration. Do you have any performance comparisons between our
> > > existing 2-level lookup of kmalloc'd memory versus a single vmalloc'd
> >
> > There was just 1-level lookup in the original dm-switch patch. Did you add
> > 2-level lookup recently?
>
> In October 2011 I posted a 'v3' version of our driver to the dm-devel
> list that did this 2-stage lookup to the dm-devel list:
>
> http://www.redhat.com/archives/dm-devel/2011-October/msg00109.html
>
> The main consideration was to avoid single large kmalloc allocations,
> but to also support sparse allocations in the future.
>
> > > memory lookup? Multiple devices of similarly large table size may be in
> > > use simultaneously, so this needs consideration as well.
> > >
> > > Also, in the example above with 1572864 page table entries, assuming 2
> > > bits per entry requires a table of 384KB. Would this be a problem for the
> > > vmalloc system, especially on 32-bit systems, if there are multiple
> > > devices of similarly large size in use at the same time?
> >
> > 384KB is not a problem, the whole vmalloc space has 128MB.
>
> This means we could allow ~375 similarly-sized devices in the system,
> assuming no other kernel objects are consuming any vmalloc space. This
> could be okay, provided our performance considerations are also
> addressed, but allowing sparse allocation may be a good enough reason
> to use a 2-level allocation scheme.
>
> > > It can also be desirable to allow sparsely-populated page tables, when it
> > > is known that large chunks are not needed or deemed (by external logic)
> > > not important enough to consume kernel memory. A 2-level kmalloc'd memory
> > > scheme can save memory in sparsely-allocated situations.
>
> This ability to do sparse allocations may be important depending on what
> else is going on in the kernel and using vmalloc space.

It may be possible to use radix tree and do sparse allocations, but given
the current usage (tables with million entries, each entry having a few
bits), it doesn't seem as a problem now.

> Thanks for your comments, and I do hope to send our 'v4' driver code as
> well as a demonstration application with the netlink socket interface to
> this list in the very near future.
>
> --
> Jim Ramsay

Mikulas

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

Mikulas Patocka 08-22-2012 01:03 AM

reworked dm-switch target
 
This is a new version that uses hex numbers.

---

This is simplified dm-switch target, originally written by Jim Ramsay.

Changes from the original:

Removed netlink interface and added dm message interface to change
mapping table because the message interface is noticeably simpler.
The table is changed by sending dm message:
"dmsetup message <device-name> 0 set-table <commands...>"
The message can have multiple commands, each command has format
"<page>:<device index>" (sets specified page) or ":<device index>" (sets
previous page plus 1 to the specified index). <page> and <device index>
are in hexadecimal format.
For example "dmsetup message switch 0 set-table 3:0 :2 :7 F:4"
sets page 3 to device 0, page 4 to device 2, page 5 to device 7, page 15
to device 4.

The dm-switch.h file was removed (if the netlink was removed, there is
no need for this file).

Page table is allocated using vmalloc instead of kmalloc. kmalloc
allocates physically contiguous memory and it can fail if memory is
fragmented. vmalloc allocates discontiguous memory and maps it to a
contiguous virtual address range using MMU.

RCU and page table reallocation was removed. The page table is allocated
in the constructor and stays the same for the lifetime of the device.
The page table can be read and modified at the same time, so there is no
need to use RCU.

The page table is initialized with a repetitive pattern that uses all
the devices.

One page table entry has 64-bit size on 64-bit processors and 32-bit
size on 32-bit processors (in the original it was always 32-bit). Making
it 64-bit makes it consume slightly less space in some cases.

Removed dm status:
- ios_remapped/ios_unmapped counting was removed because all the IOs are
mapped when statically allocated page table is used.
- Userspace-supplied numbers that are reported in the status were
removed because it is not clear what they were used for.
- The device list with 'A' statuses was removed (it could be added back
if we implement device error tracking); there was just mock code that
returned 'A' for all devices.

Device limit check was simplified to use i_size_read and fixed to take
account of 'start' value as well.

do_div was replaced with sector_div - if we have 32-bit sectors, we
don't need to do slow 64-bit math.

The divisions were optimized if the divisor is a power of two.

Set dm_set_target_max_io_len. The original code didn't set it, so it
could issue IOs that span page boundaries.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
drivers/md/Kconfig | 11 +
drivers/md/Makefile | 1
drivers/md/dm-switch.c | 485 +++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 497 insertions(+)

Index: linux-3.5.2-fast/drivers/md/Kconfig
================================================== =================
--- linux-3.5.2-fast.orig/drivers/md/Kconfig 2012-08-22 02:03:19.000000000 +0200
+++ linux-3.5.2-fast/drivers/md/Kconfig 2012-08-22 02:04:01.000000000 +0200
@@ -417,4 +417,15 @@ config DM_VERITY2

source "drivers/md/enhanceio/Kconfig"

+config DM_SWITCH
+ tristate "Switch target support (EXPERIMENTAL)"
+ depends on BLK_DEV_DM && EXPERIMENTAL
+ ---help---
+ Help text needs writing
+
+ To compile this code as a module, choose M here: the module will
+ be called dm-switch.
+
+ If unsure, say N.
+
endif # MD
Index: linux-3.5.2-fast/drivers/md/Makefile
================================================== =================
--- linux-3.5.2-fast.orig/drivers/md/Makefile 2012-08-22 02:03:19.000000000 +0200
+++ linux-3.5.2-fast/drivers/md/Makefile 2012-08-22 02:04:01.000000000 +0200
@@ -48,6 +48,7 @@ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm
obj-$(CONFIG_DM_VERITY) += dm-verity.o
obj-$(CONFIG_DM_ZEROED) += dm-zeroed.o
obj-$(CONFIG_DM_ENHANCEIO) += enhanceio/
+obj-$(CONFIG_DM_SWITCH) += dm-switch.o

ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
Index: linux-3.5.2-fast/drivers/md/dm-switch.c
================================================== =================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-3.5.2-fast/drivers/md/dm-switch.c 2012-08-22 03:00:50.000000000 +0200
@@ -0,0 +1,485 @@
+/*
+ * Copyright (c) 2010-2011 by Dell, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ *
+ * Description:
+ *
+ * file: dm-switch.c
+ * authors: Kevin_OKelley@dell.com
+ * Jim_Ramsay@dell.com
+ * Narendran_Ganapathy@dell.com
+ * mpatocka@redhat.com
+ *
+ * This file implements a "switch" target which efficiently implements a
+ * mapping of IOs to underlying block devices in scenarios where there are:
+ * (1) a large number of address regions
+ * (2) a fixed size equal across all address regions
+ * (3) no pattern that allows for a compact description with something like
+ * the dm-stripe target.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device-mapper.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "switch"
+
/*
 * Switch device context block: A new one is created for each dm device.
 * Contains an array of devices from which we have taken references.
 */
struct switch_dev {
	struct dm_dev *dmdev;		/* underlying device (reference held via dm_get_device) */
	sector_t start;			/* start offset within dmdev, in 512B sectors */
};

/* One machine word of the packed page table; entries are bit-fields inside it. */
typedef unsigned long pt_entry;

/* Switch context header */
struct switch_ctx {
	unsigned dev_count;		/* Number of devices */
	unsigned page_size;		/* Page size in 512B sectors */
	unsigned long n_pages;		/* Number of pages */
	signed char page_size_bits;	/* log2 of page_size or -1 */

	unsigned char pte_size;		/* Page table entry size in bits */
	unsigned char pte_fields;	/* Number of entries per pt_entry */
	signed char pte_fields_bits;	/* log2 of pte_fields or -1 */
	pt_entry *page_table;		/* Page table */

	/* Array of dm devices to switch between */
	struct switch_dev dev_list[0];	/* pre-C99 flexible array member; sized at kmalloc time */
};
+
+static inline void switch_get_position(struct switch_ctx *pctx,
+ unsigned long page,
+ unsigned long *index,
+ unsigned *bit)
+
+{
+ if (pctx->pte_fields_bits >= 0) {
+ *index = page >> pctx->pte_fields_bits;
+ *bit = page & (pctx->pte_fields - 1);
+ } else {
+ *index = page / pctx->pte_fields;
+ *bit = page % pctx->pte_fields;
+ }
+ *bit *= pctx->pte_size;
+
+}
+
+static void switch_page_table_write(struct switch_ctx *pctx, unsigned long page,
+ unsigned value)
+{
+ unsigned long index;
+ unsigned bit;
+ pt_entry pte;
+
+ switch_get_position(pctx, page, &index, &bit);
+
+ pte = pctx->page_table[index];
+ pte &= ~((((pt_entry)1 << pctx->pte_size) - 1) << bit);
+ pte |= (pt_entry)value << bit;
+ pctx->page_table[index] = pte;
+}
+
/*
 * Constructor: Called each time a dmsetup command creates a dm device. The
 * target parameter will already have the table, type, begin and len fields
 * filled in. Arguments are in pairs: <dev_path> <offset>. Therefore, we get
 * multiple constructor calls, but we will need to build a list of switch_ctx
 * blocks so that the page table information gets matched to the correct
 * device.
 *
 * argv layout: <dev_count> <page_size> then dev_count pairs of
 * <dev_path> <offset>.  The page table is vmalloc'ed here and initialized
 * with a repetitive pattern cycling through all devices.
 */
static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	unsigned a;
	int n;
	int r;
	unsigned dev_count;
	struct switch_ctx *pctx;
	sector_t dev_size;
	unsigned long e;

	if (argc < 4) {
		ti->error = "Insufficient arguments";
		r = -EINVAL;
		goto error;
	}
	/* Bound dev_count so the context block stays kmalloc-able. */
	if (kstrtouint(argv[0], 10, &dev_count) ||
	    !dev_count ||
	    dev_count > (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_dev)) {
		ti->error = "Invalid device count";
		r = -EINVAL;
		goto error;
	}
	if (dev_count != (argc - 2) / 2) {
		ti->error = "Invalid argument count";
		r = -EINVAL;
		goto error;
	}
	/* Header plus trailing flexible array of per-device slots. */
	pctx = kmalloc(sizeof(struct switch_ctx) + (dev_count * sizeof(struct switch_dev)),
		       GFP_KERNEL);
	if (!pctx) {
		ti->error = "Cannot allocate redirect context";
		r = -ENOMEM;
		goto error;
	}
	pctx->dev_count = dev_count;
	if (kstrtouint(argv[1], 10, &pctx->page_size) ||
	    !pctx->page_size) {
		ti->error = "Invalid page size";
		r = -EINVAL;
		goto error_kfree;
	}

	/* (x & (x - 1)) == 0 <=> x is a power of two (x != 0 checked above). */
	if (!(pctx->page_size & (pctx->page_size - 1)))
		pctx->page_size_bits = __ffs(pctx->page_size);
	else
		pctx->page_size_bits = -1;

	/* Smallest entry width (in bits) that can represent all device indices. */
	pctx->pte_size = 1;
	while (pctx->pte_size < sizeof(pt_entry) * 8 &&
	       (pt_entry)1 << pctx->pte_size < pctx->dev_count)
		pctx->pte_size++;

	pctx->pte_fields = (sizeof(pt_entry) * 8) / pctx->pte_size;
	if (!(pctx->pte_fields & (pctx->pte_fields - 1)))
		pctx->pte_fields_bits = __ffs(pctx->pte_fields);
	else
		pctx->pte_fields_bits = -1;

	/* Number of pages = ceil(target length / page_size). */
	dev_size = ti->len;
	if (sector_div(dev_size, pctx->page_size))
		dev_size++;

	pctx->n_pages = dev_size;
	/* Guard against truncation when sector_t is wider than unsigned long. */
	if (pctx->n_pages != dev_size || pctx->n_pages >= ULONG_MAX) {
		ti->error = "Too long page table";
		r = -EINVAL;
		goto error_kfree;
	}

	/* Number of pt_entry words = ceil(n_pages / entries-per-word). */
	if (sector_div(dev_size, pctx->pte_fields))
		dev_size++;

	if (dev_size > ULONG_MAX / sizeof(pt_entry)) {
		ti->error = "Too long page table";
		r = -EINVAL;
		goto error_kfree;
	}

	/* Keep each bio within one page so switch_map never splits an IO. */
	r = dm_set_target_max_io_len(ti, pctx->page_size);
	if (r)
		goto error_kfree;

	pctx->page_table = vmalloc(dev_size * sizeof(pt_entry));
	if (!pctx->page_table) {
		ti->error = "Cannot allocate page table";
		r = -ENOMEM;
		goto error_kfree;
	}

	/* Default mapping: round-robin the pages across all devices. */
	a = 0;
	for (e = 0; e < pctx->n_pages; e++) {
		switch_page_table_write(pctx, e, a);
		a++;
		if (a >= pctx->dev_count)
			a = 0;
	}

	/*
	 * Check each device beneath the target to ensure that the limits are
	 * consistent.
	 */
	for (n = 0, a = 2; n < pctx->dev_count; n++, a += 2) {
		struct dm_dev *dm;
		sector_t dev_size;
		unsigned long long start;

		if (kstrtoull(argv[a + 1], 10, &start) ||
		    start != (sector_t)start) {
			ti->error = "Invalid device starting offset";
			r = -EINVAL;
			/* Slot n was never acquired; step back before unwinding. */
			n--;
			goto error_release_n;
		}
		r = dm_get_device
			(ti, argv[a], dm_table_get_mode(ti->table), &dm);
		if (r) {
			ti->error = "Device lookup failed";
			n--;
			goto error_release_n;
		}
		pctx->dev_list[n].dmdev = dm;
		pctx->dev_list[n].start = start;

		/* Device capacity in 512B sectors. */
		dev_size = i_size_read(dm->bdev->bd_inode) >> SECTOR_SHIFT;

		/* Each device must cover the whole target past its start offset. */
		if (ti->len > start + dev_size) {
			ti->error = "Device is too small";
			r = -EINVAL;
			goto error_release_n;
		}
	}

	ti->private = pctx;

	return 0;

error_release_n: /* De-reference all devices */
	for (; n >= 0; n--)
		dm_put_device(ti, pctx->dev_list[n].dmdev);

	vfree(pctx->page_table);
error_kfree:
	kfree(pctx);

error:
	return r;
}
+
+/*
+ * Destructor: Don't free the dm_target, just the ti->private data (if any).
+ */
+static void switch_dtr(struct dm_target *ti)
+{
+ int n;
+ struct switch_ctx *pctx = ti->private;
+
+ for (n = 0; n < pctx->dev_count; n++)
+ dm_put_device(ti, pctx->dev_list[n].dmdev);
+
+ vfree(pctx->page_table);
+ kfree(pctx);
+}
+
/*
 * Map: route a bio to the underlying device owning the page it falls in.
 * Runs in the I/O path with no locking; the page-table word is read once
 * via ACCESS_ONCE while messages may be updating the table concurrently.
 */
static int switch_map(struct dm_target *ti, struct bio *bio,
		      union map_info *map_context)
{
	struct switch_ctx *pctx = ti->private;

	/* Offset of the bio relative to the start of this target. */
	sector_t offset = bio->bi_sector - ti->begin;
	sector_t p;
	unsigned long index;
	unsigned bit, idev;

	/* Convert the sector offset into a page number. */
	p = offset;
	if (pctx->page_size_bits >= 0)
		p >>= pctx->page_size_bits;	/* power-of-two fast path */
	else
		sector_div(p, pctx->page_size);

	switch_get_position(pctx, p, &index, &bit);

	idev = (ACCESS_ONCE(pctx->page_table[index]) >> bit) & ((1 << pctx->pte_size) - 1);
	/* This can only happen if the processor uses non-atomic stores. */
	if (unlikely(idev >= pctx->dev_count))
		idev = 0;

	/* Redirect to the chosen device, applying its start offset. */
	bio->bi_bdev = pctx->dev_list[idev].dmdev->bdev;
	bio->bi_sector = pctx->dev_list[idev].start + offset;

	return DM_MAPIO_REMAPPED;
}
+
+/*
+ * We need to parse hex numbers as fast as possible.
+ * Message is used to load the whole table.
+ *
+ * This table-based hex parser improves performance.
+ * It improves a time to load 1000000 entries compared to the condition-based
+ * parser.
+ * table-based parser condition-based parser
+ * PA-RISC 0.29s 0.31s
+ * Opteron 0.0495s 0.0498s
+ */
+
/*
 * ASCII -> hex-digit lookup table: '0'-'9' map to 0-9, 'A'-'F' and
 * 'a'-'f' map to 10-15, every other character maps to 255 (invalid).
 * (The archived copy had several entries corrupted into "2 55"; this is
 * the reconstructed table.)
 */
static const unsigned char hex_table[256] = {
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
  0,  1,  2,  3,  4,  5,  6,  7,  8,  9,255,255,255,255,255,255,
255, 10, 11, 12, 13, 14, 15,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255, 10, 11, 12, 13, 14, 15,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
};
+
+static inline void parse_hex(const char *string, sector_t *result, const char **end)
+{
+ unsigned char d;
+ sector_t r = 0;
+#if 1
+ while ((d = hex_table[(unsigned char)*string]) < 16) {
+ r = (r << 4) | d;
+ string++;
+ }
+#else
+ while (1) {
+ d = *string;
+ if (d >= '0' && d <= '9')
+ d -= '0';
+ else if (d >= 'A' && d <= 'F')
+ d -= 'A' - 10;
+ else if (d >= 'a' && d <= 'f')
+ d -= 'a' - 10;
+ else
+ break;
+ r = (r << 4) | d;
+ string++;
+ }
+#endif
+ *end = string;
+ *result = r;
+}
+
/*
 * Message handler.  Supported message:
 *   set-table <cmd>...   where each <cmd> is "<page>:<device>" or
 *   ":<device>" (meaning previous page + 1); both numbers are hex.
 * A single static mutex serializes all messages; readers (switch_map)
 * run concurrently without taking it.
 */
static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
{
	static DEFINE_MUTEX(message_mutex);

	struct switch_ctx *pctx = ti->private;
	int r;

	mutex_lock(&message_mutex);

	if (!argc) {
		goto invalid_message;
	} else if (!strcasecmp(argv[0], "set-table")) {
		unsigned i;
		sector_t table_index = 0;
		for (i = 1; i < argc; i++) {
			sector_t device;
			const char *string = argv[i];
			/* Leading ':' means "previous page + 1". */
			if (*string == ':')
				table_index++;
			else {
				parse_hex(string, &table_index, &string);
				if (unlikely(*string != ':')) {
invalid_table:
					DMWARN("invalid set-table argument");
					r = -EINVAL;
					goto ret;
				}
			}
			/* string now points at the ':'; the device index follows. */
			string++;
			if (unlikely(!*string))
				goto invalid_table;
			parse_hex(string, &device, &string);
			if (unlikely(*string))
				goto invalid_table;
			/* Range-check both numbers before touching the table. */
			if (unlikely(table_index >= pctx->n_pages)) {
				DMWARN("invalid set-table page");
				r = -EINVAL;
				goto ret;
			}
			if (unlikely(device >= pctx->dev_count)) {
				DMWARN("invalid set-table device");
				r = -EINVAL;
				goto ret;
			}
			switch_page_table_write(pctx, table_index, device);
		}
		r = 0;
	} else {
invalid_message:
		DMWARN("unrecognised message received.");
		r = -EINVAL;
	}
ret:
	mutex_unlock(&message_mutex);
	return r;
}
+
+static int switch_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ struct switch_ctx *pctx = ti->private;
+ unsigned sz = 0;
+ int n;
+
+ result[0] = '';
+ switch (type) {
+ case STATUSTYPE_INFO:
+ result[0] = 0;
+ break;
+
+ case STATUSTYPE_TABLE:
+ DMEMIT("%u %u", pctx->dev_count, pctx->page_size);
+ for (n = 0; n < pctx->dev_count; n++) {
+ DMEMIT(" %s %llu", pctx->dev_list[n].dmdev->name,
+ (unsigned long long)pctx->dev_list[n].start);
+ }
+ break;
+
+ default:
+ return 0;
+ }
+ return 0;
+}
+
+/*
+ * Switch ioctl:
+ *
+ * Passthrough all ioctls to the first path.
+ */
+static int switch_ioctl(struct dm_target *ti, unsigned cmd,
+ unsigned long arg)
+{
+ struct switch_ctx *pctx = ti->private;
+ struct block_device *bdev;
+ fmode_t mode;
+
+ bdev = pctx->dev_list[0].dmdev->bdev;
+ mode = pctx->dev_list[0].dmdev->mode;
+
+ return __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+}
+
/*
 * Registration record wiring the switch callbacks into device-mapper.
 */
static struct target_type switch_target = {
	.name = "switch",
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.ctr = switch_ctr,
	.dtr = switch_dtr,
	.map = switch_map,
	.message = switch_message,
	.status = switch_status,
	.ioctl = switch_ioctl,
};
+
+int __init dm_switch_init(void)
+{
+ int r;
+
+ r = dm_register_target(&switch_target);
+ if (r) {
+ DMERR("dm_register_target() failed %d", r);
+ return r;
+ }
+
+ return 0;
+}
+
/* Module exit: unregister the "switch" target type. */
void dm_switch_exit(void)
{
	dm_unregister_target(&switch_target);
}
+
+module_init(dm_switch_init);
+module_exit(dm_switch_exit);
+
+MODULE_DESCRIPTION(DM_NAME " fixed-size address-region-mapping throughput-oriented path selector");
+MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
+MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
+MODULE_LICENSE("GPL");

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

Mikulas Patocka 08-22-2012 01:04 AM

reworked dm-switch target
 
/* a sample program that makes a table of million entries and uploads it
via dm message */

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <libdevmapper.h>

#define N_DEVICES 3
#define TABLE_SIZE 1000000

#define MSG_SIZE_LIMIT 45000
#define MSG_SIZE_RESERVED 20

static int table[TABLE_SIZE];
static char buffer[MSG_SIZE_LIMIT];

/*
 * Append the uppercase hexadecimal representation of 'hex' at *s
 * (no leading zeros, no terminating NUL) and advance *s past the
 * last digit written.
 */
static void print_hex(char **s, unsigned hex)
{
	char digits[2 * sizeof(unsigned)];
	unsigned n = 0;
	char *out = *s;

	/* Collect digits least-significant first... */
	do {
		unsigned d = hex & 0xf;
		digits[n++] = d < 10 ? '0' + d : 'A' + (d - 10);
		hex >>= 4;
	} while (hex);

	/* ...then emit them most-significant first. */
	while (n)
		*out++ = digits[--n];

	*s = out;
}

/*
 * Build a TABLE_SIZE-entry mapping (round-robin over N_DEVICES, with a
 * time-dependent starting device) and upload it to the dm device named
 * "switch" via a sequence of "set-table" dm messages, each at most
 * MSG_SIZE_LIMIT bytes.
 *
 * Fix: the archived copy had every "\n" escape in the string literals
 * corrupted into a literal line break (invalid C); restored here.  The
 * comma-operator error one-liners are expanded into braced blocks.
 */
int main(void)
{
	int i, index;
	int val;

	val = time(NULL) % N_DEVICES;
	for (i = 0; i < TABLE_SIZE; i++) {
		table[i] = val;
		if (++val >= N_DEVICES)
			val = 0;
	}

	index = 0;
	while (index < TABLE_SIZE) {
		struct dm_task *dmt;
		char *ptr;

		strcpy(buffer, "set-table ");
		ptr = strchr(buffer, 0);
		/* First entry of each message carries an explicit page number;
		 * subsequent ":<device>" entries mean "previous page + 1". */
		print_hex(&ptr, index);
		while (index < TABLE_SIZE &&
		       ptr - buffer < MSG_SIZE_LIMIT - MSG_SIZE_RESERVED) {
			*ptr++ = ':';
			print_hex(&ptr, table[index]);
			*ptr++ = ' ';
			index++;
		}
		if (ptr - buffer > MSG_SIZE_LIMIT) {
			fprintf(stderr, "buffer overflow\n");
			exit(1);
		}
		ptr[-1] = 0;	/* replace the trailing space with a NUL */

		dmt = dm_task_create(DM_DEVICE_TARGET_MSG);
		if (!dmt) {
			fprintf(stderr, "dm_task_create failed\n");
			exit(1);
		}
		if (!dm_task_set_name(dmt, "switch")) {
			fprintf(stderr, "dm_task_set_name failed\n");
			exit(1);
		}
		if (!dm_task_set_sector(dmt, 0)) {
			fprintf(stderr, "dm_task_set_sector failed\n");
			exit(1);
		}
		if (!dm_task_set_message(dmt, buffer)) {
			fprintf(stderr, "dm_task_set_message failed\n");
			exit(1);
		}
		if (!dm_task_run(dmt)) {
			fprintf(stderr, "dm_task_run failed\n");
			exit(1);
		}

		dm_task_destroy(dmt);
	}

	return 0;
}

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel


All times are GMT. The time now is 05:45 AM.

VBulletin, Copyright ©2000 - 2014, Jelsoft Enterprises Ltd.
Content Relevant URLs by vBSEO ©2007, Crawlability, Inc.