Linux Archive

Linux Archive (http://www.linux-archive.org/)
-   Device-mapper Development (http://www.linux-archive.org/device-mapper-development/)
-   -   dm-switch target (http://www.linux-archive.org/device-mapper-development/707349-dm-switch-target.html)

Mikulas Patocka 09-25-2012 09:50 PM

dm-switch target
 
Hi

This is the dm-switch target to be included in the next kernel.

It is equivalent to the last code sent by Jim Ramsay with the exception
that REQ_FLUSH processing was removed (because hardware has no write-back
cache).

Mikulas

---

dm-switch target

Originally developed by Jim Ramsay. Simplified by Mikulas Patocka.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Jim Ramsay <jim_ramsay@dell.com>

---
drivers/md/Kconfig | 11 +
drivers/md/Makefile | 1
drivers/md/dm-switch.c | 520 +++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 532 insertions(+)

Index: linux-3.5.4-fast/drivers/md/Kconfig
===================================================================
--- linux-3.5.4-fast.orig/drivers/md/Kconfig 2012-09-25 22:15:36.000000000 +0200
+++ linux-3.5.4-fast/drivers/md/Kconfig 2012-09-25 22:21:56.000000000 +0200
@@ -417,4 +417,15 @@ config DM_VERITY2

source "drivers/md/enhanceio/Kconfig"

+config DM_SWITCH
+ tristate "Switch target support (EXPERIMENTAL)"
+ depends on BLK_DEV_DM && EXPERIMENTAL
+ ---help---
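+ The switch target splits the mapped device into fixed-size pages and
+ maintains a page table that maps each page to one of several
+ underlying devices. The page table may be changed with messages
+ while the target is running.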
+
+ To compile this code as a module, choose M here: the module will
+ be called dm-switch.
+
+ If unsure, say N.
+
endif # MD
Index: linux-3.5.4-fast/drivers/md/Makefile
===================================================================
--- linux-3.5.4-fast.orig/drivers/md/Makefile 2012-09-25 22:15:36.000000000 +0200
+++ linux-3.5.4-fast/drivers/md/Makefile 2012-09-25 22:21:56.000000000 +0200
@@ -48,6 +48,7 @@ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm
obj-$(CONFIG_DM_VERITY) += dm-verity.o
obj-$(CONFIG_DM_ZEROED) += dm-zeroed.o
obj-$(CONFIG_DM_ENHANCEIO) += enhanceio/
+obj-$(CONFIG_DM_SWITCH) += dm-switch.o

ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
Index: linux-3.5.4-fast/drivers/md/dm-switch.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-3.5.4-fast/drivers/md/dm-switch.c 2012-09-25 22:21:53.000000000 +0200
@@ -0,0 +1,520 @@
+/*
+ * Copyright (c) 2010-2012 by Dell Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ *
+ * Description:
+ *
+ * file: dm-switch.c
+ * authors: Kevin_OKelley@dell.com
+ * Jim_Ramsay@dell.com
+ * Narendran_Ganapathy@dell.com
+ * mpatocka@redhat.com
+ *
+ * This file implements a "switch" target which efficiently implements a
+ * mapping of IOs to underlying block devices in scenarios where there are:
+ * (1) a large number of address regions
+ * (2) a fixed size equal across all address regions
+ * (3) no pattern that allows for a compact description with something like
+ * the dm-stripe target.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device-mapper.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "switch"
+
+/*
+ * One underlying device that the switch target can route I/O to.
+ * An array of these (dev_list) trails struct switch_ctx, one entry per
+ * <dev_path> <offset> pair given to the constructor.
+ */
+struct switch_dev {
+ struct dm_dev *dmdev; /* reference obtained with dm_get_device() in switch_ctr() */
+ sector_t start; /* added to the target-relative sector in switch_map() */
+};
+
+/* Storage word of the packed page table; one word holds pte_fields entries. */
+typedef unsigned long pt_entry;
+
+/*
+ * Switch context header: allocated by the constructor in one piece together
+ * with the trailing dev_list array and stored in ti->private.
+ */
+struct switch_ctx {
+	unsigned dev_count;		/* Number of devices */
+	unsigned page_size;		/* Page size in 512B sectors */
+	unsigned long n_pages;		/* Number of pages */
+	signed char page_size_bits;	/* log2 of page_size or -1 */
+
+	unsigned char pte_size;		/* Page table entry size in bits */
+	unsigned char pte_fields;	/* Number of entries per pt_entry */
+	signed char pte_fields_bits;	/* log2 of pte_fields or -1 */
+	pt_entry *page_table;		/* Page table */
+
+	/*
+	 * Array of dm devices to switch between (dev_count entries).
+	 * Declared as a C99 flexible array member instead of a zero-length
+	 * array; sizeof(struct switch_ctx) still does not include it.
+	 */
+	struct switch_dev dev_list[];
+};
+
+/*
+ * Locate the page table slot that belongs to @page.
+ *
+ * On return *index is the pt_entry word containing the slot and *bit is the
+ * bit offset of the slot inside that word.  Shift/mask arithmetic is used
+ * when pte_fields is a power of two (pte_fields_bits >= 0), plain division
+ * otherwise.
+ */
+static inline void switch_get_position(struct switch_ctx *pctx,
+				       unsigned long page,
+				       unsigned long *index,
+				       unsigned *bit)
+{
+	unsigned field;
+
+	if (pctx->pte_fields_bits < 0) {
+		*index = page / pctx->pte_fields;
+		field = page % pctx->pte_fields;
+	} else {
+		*index = page >> pctx->pte_fields_bits;
+		field = page & (pctx->pte_fields - 1);
+	}
+
+	/* Convert the field number inside the word into a bit offset. */
+	*bit = field * pctx->pte_size;
+}
+
+/*
+ * Return the index into dev_list of the device that serves @sector.
+ *
+ * @sector is relative to the start of the target.  It is converted to a page
+ * number and the page table entry of that page is read without taking any
+ * lock.
+ */
+static inline unsigned switch_get_deviceidx(struct switch_ctx *pctx,
+					    sector_t sector)
+{
+	unsigned long index;
+	unsigned bit, idev;
+	sector_t p;
+
+	/* Sector -> page number; shift when page_size is a power of two. */
+	p = sector;
+	if (pctx->page_size_bits >= 0)
+		p >>= pctx->page_size_bits;
+	else
+		sector_div(p, pctx->page_size);
+
+	switch_get_position(pctx, p, &index, &bit);
+
+	/*
+	 * Build the mask in pt_entry width, the same way
+	 * switch_page_table_write() does, instead of shifting a plain int.
+	 */
+	idev = (ACCESS_ONCE(pctx->page_table[index]) >> bit) &
+		(((pt_entry)1 << pctx->pte_size) - 1);
+
+	/* This can only happen if the processor uses non-atomic stores. */
+	if (unlikely(idev >= pctx->dev_count))
+		idev = 0;
+
+	return idev;
+}
+
+/*
+ * Store device index @value in the page table slot of @page.
+ * This is a plain read-modify-write of the pt_entry word holding the slot.
+ */
+static void switch_page_table_write(struct switch_ctx *pctx, unsigned long page,
+				    unsigned value)
+{
+	unsigned long word;
+	unsigned shift;
+	pt_entry mask, updated;
+
+	switch_get_position(pctx, page, &word, &shift);
+
+	/* Clear the old field, then merge in the new device index. */
+	mask = (((pt_entry)1 << pctx->pte_size) - 1) << shift;
+	updated = (pctx->page_table[word] & ~mask) | ((pt_entry)value << shift);
+	pctx->page_table[word] = updated;
+}
+
+/*
+ * Constructor: builds the switch context for one target.  The begin and len
+ * fields of @ti are already filled in when this is called.
+ *
+ * Arguments:
+ *	<dev_count> <page_size> [<dev_path> <offset>]{dev_count}
+ *
+ * <dev_count>, <page_size> and <offset> are decimal.  <page_size> is in
+ * 512-byte sectors; <offset> is the sector on <dev_path> that corresponds to
+ * the first sector of the target.
+ *
+ * The page table is sized to cover ti->len and is initially filled so that
+ * consecutive pages go to consecutive devices, wrapping around after the
+ * last one.
+ *
+ * Returns 0 on success with ti->private pointing at the new context.  On
+ * failure every device reference taken so far is dropped, all memory is
+ * freed and a negative errno is returned; ti->error is set for every failure
+ * detected here (the result of dm_set_target_max_io_len() is returned as is).
+ */
+static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	unsigned a;
+	int n;
+	int r;
+	unsigned dev_count;
+	struct switch_ctx *pctx;
+	sector_t dev_size;
+	unsigned long e;
+
+	if (argc < 4) {
+		ti->error = "Insufficient arguments";
+		r = -EINVAL;
+		goto error;
+	}
+	/* The upper bound keeps the kmalloc() size below within limits. */
+	if (kstrtouint(argv[0], 10, &dev_count) ||
+	    !dev_count ||
+	    dev_count > (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_dev)) {
+		ti->error = "Invalid device count";
+		r = -EINVAL;
+		goto error;
+	}
+	if (dev_count != (argc - 2) / 2) {
+		ti->error = "Invalid argument count";
+		r = -EINVAL;
+		goto error;
+	}
+	pctx = kmalloc(sizeof(struct switch_ctx) + (dev_count * sizeof(struct switch_dev)),
+		       GFP_KERNEL);
+	if (!pctx) {
+		ti->error = "Cannot allocate redirect context";
+		r = -ENOMEM;
+		goto error;
+	}
+	pctx->dev_count = dev_count;
+	if (kstrtouint(argv[1], 10, &pctx->page_size) ||
+	    !pctx->page_size) {
+		ti->error = "Invalid page size";
+		r = -EINVAL;
+		goto error_kfree;
+	}
+
+	/* Remember log2(page_size) when it is a power of two. */
+	if (!(pctx->page_size & (pctx->page_size - 1)))
+		pctx->page_size_bits = __ffs(pctx->page_size);
+	else
+		pctx->page_size_bits = -1;
+
+	/* Smallest entry width (in bits) able to hold every device index. */
+	pctx->pte_size = 1;
+	while (pctx->pte_size < sizeof(pt_entry) * 8 &&
+	       (pt_entry)1 << pctx->pte_size < pctx->dev_count)
+		pctx->pte_size++;
+
+	pctx->pte_fields = (sizeof(pt_entry) * 8) / pctx->pte_size;
+	if (!(pctx->pte_fields & (pctx->pte_fields - 1)))
+		pctx->pte_fields_bits = __ffs(pctx->pte_fields);
+	else
+		pctx->pte_fields_bits = -1;
+
+	/* Number of pages needed to cover the target, rounded up. */
+	dev_size = ti->len;
+	if (sector_div(dev_size, pctx->page_size))
+		dev_size++;
+
+	/* The first test catches truncation when assigning to unsigned long. */
+	pctx->n_pages = dev_size;
+	if (pctx->n_pages != dev_size || pctx->n_pages >= ULONG_MAX) {
+		ti->error = "Too long page table";
+		r = -EINVAL;
+		goto error_kfree;
+	}
+
+	/* Number of pt_entry words needed for n_pages entries, rounded up. */
+	if (sector_div(dev_size, pctx->pte_fields))
+		dev_size++;
+
+	if (dev_size > ULONG_MAX / sizeof(pt_entry)) {
+		ti->error = "Too long page table";
+		r = -EINVAL;
+		goto error_kfree;
+	}
+
+	r = dm_set_target_max_io_len(ti, pctx->page_size);
+	if (r)
+		goto error_kfree;
+
+	pctx->page_table = vmalloc(dev_size * sizeof(pt_entry));
+	if (!pctx->page_table) {
+		ti->error = "Cannot allocate page table";
+		r = -ENOMEM;
+		goto error_kfree;
+	}
+
+	/* Default mapping: page e goes to device e % dev_count. */
+	a = 0;
+	for (e = 0; e < pctx->n_pages; e++) {
+		switch_page_table_write(pctx, e, a);
+		a++;
+		if (a >= pctx->dev_count)
+			a = 0;
+	}
+
+	/*
+	 * Acquire each device beneath the target and check that it is large
+	 * enough.
+	 */
+	for (n = 0, a = 2; n < pctx->dev_count; n++, a += 2) {
+		struct dm_dev *dm;
+		sector_t bdev_sectors;
+		unsigned long long start;
+
+		if (kstrtoull(argv[a + 1], 10, &start) ||
+		    start != (sector_t)start) {
+			ti->error = "Invalid device starting offset";
+			r = -EINVAL;
+			n--;	/* device n was never acquired */
+			goto error_release_n;
+		}
+		r = dm_get_device(ti, argv[a], dm_table_get_mode(ti->table), &dm);
+		if (r) {
+			ti->error = "Device lookup failed";
+			n--;	/* device n was never acquired */
+			goto error_release_n;
+		}
+		pctx->dev_list[n].dmdev = dm;
+		pctx->dev_list[n].start = start;
+
+		bdev_sectors = i_size_read(dm->bdev->bd_inode) >> SECTOR_SHIFT;
+
+		/*
+		 * switch_map() sends target sector s to device sector
+		 * start + s, so the device must hold ti->len sectors beyond
+		 * start.  Written as two comparisons so that the sum cannot
+		 * wrap around.
+		 */
+		if (start > bdev_sectors || ti->len > bdev_sectors - start) {
+			ti->error = "Device is too small";
+			r = -EINVAL;
+			goto error_release_n;
+		}
+	}
+
+	/* For UNMAP, sending the request down any path is sufficient */
+	ti->num_discard_requests = 1;
+
+	ti->private = pctx;
+
+	return 0;
+
+error_release_n: /* De-reference devices 0..n */
+	for (; n >= 0; n--)
+		dm_put_device(ti, pctx->dev_list[n].dmdev);
+
+	vfree(pctx->page_table);
+error_kfree:
+	kfree(pctx);
+
+error:
+	return r;
+}
+
+/*
+ * Destructor: drop every device reference held in dev_list, then free the
+ * page table and the context stored in ti->private.  The dm_target itself is
+ * not freed here.
+ */
+static void switch_dtr(struct dm_target *ti)
+{
+	struct switch_ctx *pctx = ti->private;
+	int n = 0;
+
+	while (n < pctx->dev_count) {
+		dm_put_device(ti, pctx->dev_list[n].dmdev);
+		n++;
+	}
+
+	vfree(pctx->page_table);
+	kfree(pctx);
+}
+
+/*
+ * Map function: redirect @bio to the device that the page table selects for
+ * the bio's starting sector, and translate that sector into the selected
+ * device's address space.  Always returns DM_MAPIO_REMAPPED.
+ */
+static int switch_map(struct dm_target *ti, struct bio *bio,
+		      union map_info *map_context)
+{
+	struct switch_ctx *pctx = ti->private;
+	sector_t rel = bio->bi_sector - ti->begin;	/* target-relative */
+	struct switch_dev *dev = &pctx->dev_list[switch_get_deviceidx(pctx, rel)];
+
+	bio->bi_bdev = dev->dmdev->bdev;
+	bio->bi_sector = dev->start + rel;
+
+	return DM_MAPIO_REMAPPED;
+}
+
+/*
+ * We need to parse hex numbers as fast as possible.
+ * Message is used to load the whole table.
+ *
+ * This table-based hex parser improves performance.
+ * It reduces the time to load 1000000 entries compared to the
+ * condition-based parser:
+ *
+ *		table-based parser	condition-based parser
+ * PA-RISC	0.29s			0.31s
+ * Opteron	0.0495s			0.0498s
+ *
+ * The table is indexed by character code and yields the digit value (0-15)
+ * for '0'-'9', 'A'-'F' and 'a'-'f', and 255 for every other character.
+ */
+
+static const unsigned char hex_table[256] = {
+	/* 0x00 */ 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+	/* 0x10 */ 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+	/* 0x20 */ 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+	/* 0x30 */ 0,1,2,3,4,5,6,7,8,9,255,255,255,255,255,255,
+	/* 0x40 */ 255,10,11,12,13,14,15,255,255,255,255,255,255,255,255,255,
+	/* 0x50 */ 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+	/* 0x60 */ 255,10,11,12,13,14,15,255,255,255,255,255,255,255,255,255,
+	/* 0x70 */ 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+	/* 0x80 */ 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+	/* 0x90 */ 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+	/* 0xa0 */ 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+	/* 0xb0 */ 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+	/* 0xc0 */ 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+	/* 0xd0 */ 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+	/* 0xe0 */ 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+	/* 0xf0 */ 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
+};
+
+/*
+ * Parse the run of hexadecimal digits that starts at @string.
+ *
+ * *result receives the accumulated value (0 when there is no digit at all;
+ * high digits that do not fit into sector_t are shifted out without any
+ * error) and *end is set to the first character that is not a hex digit.
+ */
+static inline void parse_hex(const char *string, sector_t *result, const char **end)
+{
+	const char *p = string;
+	sector_t value = 0;
+
+	for (;;) {
+		unsigned char digit = hex_table[(unsigned char)*p];
+
+		/* hex_table yields 255 for anything that is not a hex digit */
+		if (digit >= 16)
+			break;
+		value = (value << 4) | digit;
+		p++;
+	}
+
+	*end = p;
+	*result = value;
+}
+
+/*
+ * Message handler.  The only message accepted is
+ *
+ * set-table [<index>]:<device> ...
+ *
+ * (the keyword is compared case-insensitively).  <index> and <device> are
+ * hexadecimal.  Each argument stores <device> in the page table entry
+ * <index>; when <index> is omitted the index of the previous argument plus
+ * one is used (the running index starts at 0, so a leading ":<device>"
+ * addresses entry 1).
+ *
+ * Arguments are applied one by one; entries written before a bad argument
+ * is found stay modified.  Returns 0 on success or -EINVAL for an unknown
+ * message, a malformed argument or an out-of-range index or device.
+ */
+static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+ /* Function-local lock: all callers of this handler are serialised. */
+ static DEFINE_MUTEX(message_mutex);
+
+ struct switch_ctx *pctx = ti->private;
+ int r;
+
+ mutex_lock(&message_mutex);
+
+ if (!argc) {
+ goto invalid_message;
+ } else if (!strcasecmp(argv[0], "set-table")) {
+ unsigned i;
+ sector_t table_index = 0;
+ for (i = 1; i < argc; i++) {
+ sector_t device;
+ const char *string = argv[i];
+ /* No explicit index: continue after the previous one. */
+ if (*string == ':')
+ table_index++;
+ else {
+ parse_hex(string, &table_index, &string);
+ if (unlikely(*string != ':')) {
+/* Shared exit for every syntax error in a set-table argument. */
+invalid_table:
+ DMWARN("invalid set-table argument");
+ r = -EINVAL;
+ goto ret;
+ }
+ }
+ /* Skip the ':'; a device number must follow ... */
+ string++;
+ if (unlikely(!*string))
+ goto invalid_table;
+ parse_hex(string, &device, &string);
+ /* ... and must extend to the end of the argument. */
+ if (unlikely(*string))
+ goto invalid_table;
+ if (unlikely(table_index >= pctx->n_pages)) {
+ DMWARN("invalid set-table page");
+ r = -EINVAL;
+ goto ret;
+ }
+ if (unlikely(device >= pctx->dev_count)) {
+ DMWARN("invalid set-table device");
+ r = -EINVAL;
+ goto ret;
+ }
+ switch_page_table_write(pctx, table_index, device);
+ }
+ r = 0;
+ } else {
+/* Also reached by goto when no argument was given at all. */
+invalid_message:
+ DMWARN("unrecognised message received.");
+ r = -EINVAL;
+ }
+ret:
+ mutex_unlock(&message_mutex);
+ return r;
+}
+
+/*
+ * Status function.
+ *
+ * STATUSTYPE_INFO produces an empty string.  STATUSTYPE_TABLE produces the
+ * constructor arguments: the device count and the page size, followed by the
+ * name and start offset of every device.  Any other type also leaves the
+ * result empty.  Always returns 0.
+ */
+static int switch_status(struct dm_target *ti, status_type_t type,
+			 unsigned status_flags, char *result, unsigned maxlen)
+{
+	struct switch_ctx *pctx = ti->private;
+	unsigned sz = 0;	/* NOTE(review): presumably DMEMIT's write position - confirm */
+	int n;
+
+	/* Start from an empty string for every status type. */
+	result[0] = '\0';
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		break;
+
+	case STATUSTYPE_TABLE:
+		DMEMIT("%u %u", pctx->dev_count, pctx->page_size);
+		for (n = 0; n < pctx->dev_count; n++) {
+			DMEMIT(" %s %llu", pctx->dev_list[n].dmdev->name,
+			       (unsigned long long)pctx->dev_list[n].start);
+		}
+		break;
+
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Switch ioctl:
+ *
+ * Every ioctl is handed unchanged to the device that the page table
+ * currently selects for sector 0 of the target.
+ */
+static int switch_ioctl(struct dm_target *ti, unsigned cmd,
+			unsigned long arg)
+{
+	struct switch_ctx *pctx = ti->private;
+	struct dm_dev *dev = pctx->dev_list[switch_get_deviceidx(pctx, 0)].dmdev;
+
+	return __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
+}
+
+/*
+ * Call @fn once for every underlying device, stopping at the first non-zero
+ * result, which is then returned; returns 0 if every call returned 0.
+ *
+ * The range reported for a device begins at that device's own start offset,
+ * i.e. the value switch_map() adds to every remapped sector, and is ti->len
+ * sectors long.  ti->begin is the target's position inside the mapped
+ * device and says nothing about where the data lives on the underlying one.
+ */
+static int switch_iterate_devices(struct dm_target *ti,
+				  iterate_devices_callout_fn fn, void *data)
+{
+	struct switch_ctx *pctx = ti->private;
+	int n, ret = 0;
+
+	for (n = 0; n < pctx->dev_count; n++) {
+		ret = fn(ti, pctx->dev_list[n].dmdev,
+			 pctx->dev_list[n].start, ti->len, data);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+/* Operations table passed to dm_register_target() for the "switch" target. */
+static struct target_type switch_target = {
+	.name		 = "switch",
+	.version	 = {1, 0, 0},
+	.module		 = THIS_MODULE,
+	.ctr		 = switch_ctr,
+	.dtr		 = switch_dtr,
+	.map		 = switch_map,
+	.message	 = switch_message,
+	.status		 = switch_status,
+	.ioctl		 = switch_ioctl,
+	.iterate_devices = switch_iterate_devices,
+};
+
+/*
+ * Module init: register the switch target.  A registration failure is
+ * logged and its error code is returned; 0 is returned on success.
+ */
+int __init dm_switch_init(void)
+{
+	int r = dm_register_target(&switch_target);
+
+	if (r)
+		DMERR("dm_register_target() failed %d", r);
+
+	return r;
+}
+
+/* Module exit: unregister the switch target again. */
+void dm_switch_exit(void)
+{
+	dm_unregister_target(&switch_target);
+}
+
+/* Module entry and exit hooks. */
+module_init(dm_switch_init);
+module_exit(dm_switch_exit);
+
+/* Module description, authors and licence. */
+MODULE_DESCRIPTION(DM_NAME " fixed-size address-region-mapping throughput-oriented path selector");
+MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
+MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
+MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
+MODULE_LICENSE("GPL");

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

Alasdair G Kergon 09-25-2012 11:10 PM

dm-switch target
 
On Tue, Sep 25, 2012 at 05:50:31PM -0400, Mikulas Patocka wrote:
> drivers/md/Kconfig | 11 +
> drivers/md/Makefile | 1
> drivers/md/dm-switch.c | 520 +++++++++++++++++++++++++++++++++++++++++++++++++

Please would one of you also propose a Documentation/device-mapper/switch.txt file?

Alasdair

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

Mikulas Patocka 09-25-2012 11:35 PM

dm-switch target
 
On Wed, 26 Sep 2012, Alasdair G Kergon wrote:

> On Tue, Sep 25, 2012 at 05:50:31PM -0400, Mikulas Patocka wrote:
> > drivers/md/Kconfig | 11 +
> > drivers/md/Makefile | 1
> > drivers/md/dm-switch.c | 520 +++++++++++++++++++++++++++++++++++++++++++++++++
>
> Please would one of you also propose a Documentation/device-mapper/switch.txt file?
>
> Alasdair

Here it is:

The dm-switch target is suitable for Dell EqualLogic storage systems.

An EqualLogic storage system consists of several nodes. Each host is
connected to each node. The host may send I/O requests to any node; the
node that receives a request forwards it to the node where the data is
stored.

However, there is a performance advantage in sending I/O requests directly
to the node where the data is stored, because this avoids forwarding. The
dm-switch target was created to exploit this performance advantage.

The dm-switch target splits the device into fixed-size pages. It maintains
a page table that maps pages to storage nodes. Every request is
forwarded to the corresponding storage node specified in the page table.
The table may be changed with messages while the dm-switch target is
running.

DM table arguments:
- number of storage nodes
- page size
for every storage node:
- the underlying block device
- offset to the start of data in 512-byte sectors

DM message:
set-table index1:node1 index2:node2 index3:node3 ...
- modify the page table: set the entry at each index to point to the given
node. Index and node numbers are hexadecimal. The index may be omitted,
in which case the previous index plus 1 is used.

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel


All times are GMT. The time now is 06:53 AM.

VBulletin, Copyright ©2000 - 2014, Jelsoft Enterprises Ltd.
Content Relevant URLs by vBSEO ©2007, Crawlability, Inc.