This patch removes "io_lock" and "map_lock" from struct mapped_device and
"holders" from struct dm_table, and replaces these mechanisms with
sleepable RCU (SRCU).
Previously, the code would call "dm_get_live_table" and "dm_table_put" to
get and release the table. Now the code calls "dm_get_live_table" and
"dm_put_live_table" instead. dm_get_live_table enters an SRCU read-side
critical section and dm_put_live_table leaves it.
dm_get_live_table_fast/dm_put_live_table_fast can be used instead of
dm_get_live_table/dm_put_live_table. These *_fast functions use
non-sleepable RCU, so the caller must not block between them.
If the code changes the active or inactive dm table, it must call
dm_sync_table before destroying the old table.
/*
* duplicate new.
@@ -418,11 +435,10 @@ static struct mapped_device *dm_hash_ren
/*
* Wake up any dm event waiters.
*/
- table = dm_get_live_table(hc->md);
- if (table) {
+ table = dm_get_live_table(hc->md, &srcu_idx);
+ if (table)
dm_table_event(table);
- dm_table_put(table);
- }
+ dm_put_live_table(hc->md, srcu_idx);
if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, param->event_nr))
param->flags |= DM_UEVENT_GENERATED_FLAG;
@@ -620,11 +636,14 @@ static int check_name(const char *name)
* _hash_lock without first calling dm_table_put, because dm_table_destroy
* waits for this dm_table_put and could be called under this lock.
*/
-static struct dm_table *dm_get_inactive_table(struct mapped_device *md)
+static struct dm_table *dm_get_inactive_table(struct mapped_device *md, int *srcu_idx)
{
struct hash_cell *hc;
struct dm_table *table = NULL;
+ /* increment rcu count, we don't care about the table pointer */
+ dm_get_live_table(md, srcu_idx);
+
down_read(&_hash_lock);
hc = dm_get_mdptr(md);
if (!hc || hc->md != md) {
@@ -633,8 +652,6 @@ static struct dm_table *dm_get_inactive_
}
table = hc->new_map;
- if (table)
- dm_table_get(table);
+ /*
+ * Since dm_swap_table synchronizes RCU, nobody should still be in a
+ * read-side critical section by this point.
+ */
if (old_map)
dm_table_destroy(old_map);
@@ -1118,6 +1149,7 @@ static int dev_wait(struct dm_ioctl *par
int r = 0;
struct mapped_device *md;
struct dm_table *table;
+ int srcu_idx;
md = find_device(param);
if (!md)
@@ -1138,11 +1170,10 @@ static int dev_wait(struct dm_ioctl *par
*/
__dev_status(md, param);
@@ -1270,14 +1301,14 @@ static int table_load(struct dm_ioctl *p
hc = dm_get_mdptr(md);
if (!hc || hc->md != md) {
DMWARN("device has been removed from the dev hash table.");
- dm_table_destroy(t);
up_write(&_hash_lock);
+ dm_table_destroy(t);
r = -ENXIO;
goto out;
}
-/*
- * The table has always exactly one reference from either mapped_device->map
- * or hash_cell->new_map. This reference is not counted in table->holders.
- * A pair of dm_create_table/dm_destroy_table functions is used for table
- * creation/destruction.
- *
- * Temporary references from the other code increase table->holders. A pair
- * of dm_table_get/dm_table_put functions is used to manipulate it.
- *
- * When the table is about to be destroyed, we wait for table->holders to
- * drop to zero.
- */
-
struct dm_table {
struct mapped_device *md;
- atomic_t holders;
unsigned type;
return r;
}
@@ -541,20 +551,38 @@ static void queue_io(struct mapped_devic
/*
* Everyone (including functions in this file), should use this
* function to access the md->map field, and make sure they call
- * dm_table_put() when finished.
+ * dm_put_live_table() when finished.
*/
-struct dm_table *dm_get_live_table(struct mapped_device *md)
+struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx)
{
- struct dm_table *t;
- unsigned long flags;
+ *srcu_idx = srcu_read_lock(&md->io_barrier);
+ return srcu_dereference(md->map, &md->io_barrier);
+}
- read_lock_irqsave(&md->map_lock, flags);
- t = md->map;
- if (t)
- dm_table_get(t);
- read_unlock_irqrestore(&md->map_lock, flags);
+void dm_put_live_table(struct mapped_device *md, int srcu_idx)
+{
+ srcu_read_unlock(&md->io_barrier, srcu_idx);
+}
- return t;
+void dm_sync_table(struct mapped_device *md)
+{
+ synchronize_srcu(&md->io_barrier);
+ synchronize_rcu_expedited();
+}
+
+/*
+ * A fast alternative to dm_get_live_table/dm_put_live_table.
+ * The caller must not block between these two functions.
+ */
+static struct dm_table *dm_get_live_table_fast(struct mapped_device *md)
+{
+ rcu_read_lock();
+ return rcu_dereference(md->map);
+}
+
+static void dm_put_live_table_fast(struct mapped_device *md)
+{
+ rcu_read_unlock();
}
/*
@@ -1298,17 +1326,18 @@ static int __clone_and_map(struct clone_
/*
* Split the bio into several clones and submit it to targets.
*/
-static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
+static void __split_and_process_bio(struct mapped_device *md,
+ struct dm_table *map, struct bio *bio)
{
struct clone_info ci;
int error = 0;
- ci.map = dm_get_live_table(md);
- if (unlikely(!ci.map)) {
+ if (unlikely(!map)) {
bio_io_error(bio);
return;
}
/* if we're suspended, we have to queue this io for later */
if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
- up_read(&md->io_lock);
+ dm_put_live_table(md, srcu_idx);
+ /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
+ dm_put_live_table(md, srcu_idx);
+
/*
* Rare, but there may be I/O requests still going to complete,
* for example. Wait for all references to disappear.
@@ -2296,7 +2329,6 @@ static void __dm_destroy(struct mapped_d
dm_device_name(md), atomic_read(&md->holders));
/*
* DMF_NOFLUSH_SUSPENDING must be set before presuspend.
@@ -2495,7 +2525,7 @@ int dm_suspend(struct mapped_device *md,
if (!noflush && do_lockfs) {
r = lock_fs(md);
if (r)
- goto out;
+ goto out_unlock;
}
/*
@@ -2510,9 +2540,8 @@ int dm_suspend(struct mapped_device *md,
* (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
* flush_workqueue(md->wq).
*/
- down_write(&md->io_lock);
set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
- up_write(&md->io_lock);
+ synchronize_srcu(&md->io_barrier);
/*
* Stop md->queue before flushing md->wq in case request-based
@@ -2530,10 +2559,9 @@ int dm_suspend(struct mapped_device *md,
*/
r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
- down_write(&md->io_lock);
if (noflush)
clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
- up_write(&md->io_lock);
+ synchronize_srcu(&md->io_barrier);
/* were we interrupted ? */
if (r < 0) {
@@ -2543,7 +2571,7 @@ int dm_suspend(struct mapped_device *md,
start_queue(md->queue);
unlock_fs(md);
- goto out; /* pushback list is already flushed, so skip flush */
+ goto out_unlock; /* pushback list is already flushed, so skip flush */
}
/*
@@ -2556,9 +2584,6 @@ int dm_suspend(struct mapped_device *md,
dm_table_postsuspend_targets(map);
-out:
- dm_table_put(map);
-
out_unlock:
mutex_unlock(&md->suspend_lock);
return r;
@@ -2573,7 +2598,7 @@ int dm_resume(struct mapped_device *md)
if (!dm_suspended_md(md))
goto out;