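When umount calls sync_filesystem(), we first do a WB_SYNC_NONE
writeback to kick off writeback of pending dirty inodes, then follow
that up with a WB_SYNC_ALL to wait for it. Since umount already holds
the sb s_umount semaphore, the writeback code cannot pin the sb for
the WB_SYNC_NONE pass, so that pass ends up doing nothing and all
writeback happens as WB_SYNC_ALL. This can greatly slow down umount,
since WB_SYNC_ALL writeback is a data integrity operation and thus
a bigger hammer than simple WB_SYNC_NONE. For barrier-aware file
systems it's a lot slower.

Fix this by letting the caller tell the writeback code that it already
holds s_umount: add an sb_pinned flag to the writeback arguments and
to struct writeback_control, skip pinning in pin_sb_for_writeback()
when it is set, and add writeback_inodes_sb_locked() for
__sync_filesystem() to use in its non-waiting pass.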
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index bff5f77..3ec332d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -44,6 +44,7 @@ struct wb_writeback_args {
int for_kupdate:1;
int range_cyclic:1;
int for_background:1;
+ int sb_pinned:1;
};
/*
@@ -229,6 +230,11 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
.sync_mode = WB_SYNC_ALL,
.nr_pages = LONG_MAX,
.range_cyclic = 0,
+ /*
+ * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
+ * let's make it explicit.
+ */
+ .sb_pinned = 1,
};
struct bdi_work work;
@@ -243,21 +249,23 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
* bdi_start_writeback - start writeback
* @bdi: the backing device to write from
* @nr_pages: the number of pages to write
+ * @sb_locked: non-zero if the caller already holds the sb s_umount sem
*
* Description:
* This does WB_SYNC_NONE opportunistic writeback. The IO is only
* started when this function returns, we make no guarentees on
- * completion. Caller need not hold sb s_umount semaphore.
+ * completion. Caller specifies whether sb umount sem is held already or not.
*
*/
void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
- long nr_pages)
+ long nr_pages, int sb_locked)
{
struct wb_writeback_args args = {
.sb = sb,
.sync_mode = WB_SYNC_NONE,
.nr_pages = nr_pages,
.range_cyclic = 1,
+ .sb_pinned = sb_locked,
};
/*
@@ -584,7 +592,7 @@ static int pin_sb_for_writeback(struct writeback_control *wbc,
/*
* Caller must already hold the ref for this
*/
- if (wbc->sync_mode == WB_SYNC_ALL) {
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) {
WARN_ON(!rwsem_is_locked(&sb->s_umount));
return 0;
}
@@ -757,6 +765,7 @@ static long wb_writeback(struct bdi_writeback *wb,
.older_than_this = NULL,
.for_kupdate = args->for_kupdate,
.range_cyclic = args->range_cyclic,
+ .sb_pinned = args->sb_pinned,
};
unsigned long oldest_jif;
long wrote = 0;
@@ -1190,6 +1199,18 @@ static void wait_sb_inodes(struct super_block *sb)
iput(old_inode);
}
/**
+ * writeback_inodes_sb_locked - writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Like writeback_inodes_sb(), except the caller already holds the
+ * sb umount sem.
+ */
+void writeback_inodes_sb_locked(struct super_block *sb)
+{
+ __writeback_inodes_sb(sb, 1);
+}
+
+/**
* writeback_inodes_sb_if_idle - start writeback if none underway
* @sb: the superblock
*
diff --git a/fs/sync.c b/fs/sync.c
index d104591..8932a3e 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -37,7 +37,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
/* Avoid doing twice syncing and cache pruning for quota sync */
if (!wait) {
writeout_quota_sb(sb, -1);
- writeback_inodes_sb(sb);
+ writeback_inodes_sb_locked(sb);
} else {
sync_quota_sb(sb, -1);
sync_inodes_sb(sb);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index b449e73..f257a23 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -102,7 +102,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
void bdi_unregister(struct backing_dev_info *bdi);
void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
- long nr_pages);
+ long nr_pages, int sb_locked);
int bdi_writeback_task(struct bdi_writeback *wb);
int bdi_has_dirty_io(struct backing_dev_info *bdi);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index dc52482..b62f517 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -61,6 +61,15 @@ struct writeback_control {
* so we use a single control to update them
*/
unsigned no_nrwrite_index_update:1;
+
+ /*
+ * For WB_SYNC_ALL, the sb must always be pinned. For WB_SYNC_NONE,
+ * the writeback code will pin the sb for the caller. However,
+ * in some cases (e.g. umount) the caller does WB_SYNC_NONE but
+ * already has the sb pinned. If the below is set, the writeback
+ * code does not try to pin the sb itself.
+ */
+ unsigned sb_pinned:1;
};