This patch reduces GFS2 file fragmentation by pre-reserving blocks. The
resulting improved on disk layout greatly speeds up operations in cases
which would have resulted in interlaced allocation of blocks previously.
A typical example of this is 10 parallel dd processes, each writing to a
file in a common dirctory.
The implementation uses an rbtree of reservations attached to each
resource group (and each inode).
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6d957a8..49cd7dd 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -785,6 +785,9 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
if (error)
goto out_rlist;
+ if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */
+ gfs2_rs_deltree(ip->i_res);
+
error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
RES_INDIRECT + RES_STATFS + RES_QUOTA,
revokes);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6fbf3cb..9f94832 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -383,6 +383,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (ret)
return ret;
+/* Resource group multi-block reservation, in order of appearance:
+
+ Step 1. Function prepares to write, allocates a mb, sets the size hint.
+ Step 2. User calls inplace_reserve to target an rgrp, sets the rgrp info
+ Step 3. Function get_local_rgrp locks the rgrp, determines which bits to use
+ Step 4. Bits are assigned from the rgrp based on either the reservation
+ or wherever it can.
+*/
+
+struct gfs2_blkreserv {
+ /* components used during write (step 1): */
+ atomic_t rs_sizehint; /* hint of the write size */
+
+ /* components used during inplace_reserve (step 2): */
+ u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
+
+ /* components used during get_local_rgrp (step 3): */
+ struct gfs2_rgrpd *rs_rgd; /* pointer to the gfs2_rgrpd */
+ struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */
+ struct rb_node rs_node; /* link to other block reservations */
+
+ /* components used during block searches and assignments (step 4): */
+ struct gfs2_bitmap *rs_bi; /* bitmap for the current allocation */
+ u32 rs_biblk; /* start block relative to the bi */
+ u32 rs_free; /* how many blocks are still free */
+
+ /* ancillary quota stuff */
+ struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
+ struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
+ unsigned int rs_qa_qd_num;
+};
+
enum {
GLF_LOCK = 1,
GLF_DEMOTE = 3,
@@ -290,16 +326,6 @@ struct gfs2_glock {
#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */
+ /* We need a reservation to allocate the new dinode block. The
+ directory ip temporarily points to the reservation, but this is
+ being done to get a set of contiguous blocks for the new dinode.
+ Since this is a create, we don't have a sizehint yet, so it will
+ have to use the minimum reservation size. */
error = gfs2_rs_alloc(dip);
if (error)
return error;
@@ -694,24 +707,29 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (IS_ERR(inode))
goto fail_gunlock2;
- error = gfs2_inode_refresh(GFS2_I(inode));
+ ip = GFS2_I(inode);
+ error = gfs2_inode_refresh(ip);
if (error)
goto fail_gunlock2;
- /* the new inode needs a reservation so it can allocate xattrs. */
- error = gfs2_rs_alloc(GFS2_I(inode));
- if (error)
- goto fail_gunlock2;
+ /* The newly created inode needs a reservation so it can allocate
+ xattrs. At the same time, we want new blocks allocated to the new
+ dinode to be as contiguous as possible. Since we allocated the
+ dinode block under the directory's reservation, we transfer
+ ownership of that reservation to the new inode. The directory
+ doesn't need a reservation unless it needs a new allocation. */
+ ip->i_res = dip->i_res;
+ dip->i_res = NULL;
error = gfs2_acl_create(dip, inode);
if (error)
goto fail_gunlock2;
/**
+ * rs_cmp - multi-block reservation range compare
+ * @blk: absolute file system block number of the new reservation
+ * @len: number of blocks in the new reservation
+ * @rs: existing reservation to compare against
+ *
+ * returns: 1 if the block range is beyond the reach of the reservation
+ * -1 if the block range is before the start of the reservation
+ * 0 if the block range overlaps with the reservation
+ */
+static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
+{
+ u64 startblk = gfs2_rs_startblk(rs);
+
+ if (blk >= startblk + rs->rs_free)
+ return 1;
+ if (blk + len - 1 < startblk)
+ return -1;
+ return 0;
+}
+
+/**
+ * rs_find - Find a rgrp multi-block reservation that contains a given block
+ * @rgd: The rgrp
+ * @rgblk: The block we're looking for, relative to the rgrp
+ */
+static struct gfs2_blkreserv *rs_find(struct gfs2_rgrpd *rgd, u32 rgblk)
+{
+ struct rb_node **newn;
+ int rc;
+ u64 fsblk = rgblk + rgd->rd_data0;
+
+ spin_lock(&rgd->rd_rsspin);
+ newn = &rgd->rd_rstree.rb_node;
+ while (*newn) {
+ struct gfs2_blkreserv *cur =
+ rb_entry(*newn, struct gfs2_blkreserv, rs_node);
+ rc = rs_cmp(fsblk, 1, cur);
+ if (rc < 0)
+ newn = &((*newn)->rb_left);
+ else if (rc > 0)
+ newn = &((*newn)->rb_right);
+ else {
+ spin_unlock(&rgd->rd_rsspin);
+ return cur;
+ }
+ }
+ spin_unlock(&rgd->rd_rsspin);
+ return NULL;
+}
+
+/**
* gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
* a block in a given allocation state.
* @buf: the buffer that holds the bitmaps
@@ -424,19 +478,93 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd)
int gfs2_rs_alloc(struct gfs2_inode *ip)
{
int error = 0;
+ struct gfs2_blkreserv *res;
+
+ if (ip->i_res)
+ return 0;
+
+ res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
+ if (!res)
+ error = -ENOMEM;
+static void dump_rs(struct seq_file *seq, struct gfs2_blkreserv *rs)
+{
+ gfs2_print_dbg(seq, " r: %llu s:%llu b:%u f:%u
",
+ rs->rs_rgd->rd_addr, gfs2_rs_startblk(rs), rs->rs_biblk,
+ rs->rs_free);
+}
+
/**
- * gfs2_rs_delete - delete a reservation
+ * __rs_deltree - remove a multi-block reservation from the rgd tree
+ * @rs: The reservation to remove
+ *
+ */
+static void __rs_deltree(struct gfs2_blkreserv *rs)
+{
+ struct gfs2_rgrpd *rgd;
+
+ if (!gfs2_rs_active(rs))
+ return;
+
+ rgd = rs->rs_rgd;
+ /* We can't do this: The reason is that when the rgrp is invalidated,
+ it's in the "middle" of acquiring the glock, but the HOLDER bit
+ isn't set yet:
+ BUG_ON(!gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl));*/
+ trace_gfs2_rs(NULL, rs, TRACE_RS_TREEDEL);
+
+ if (!RB_EMPTY_ROOT(&rgd->rd_rstree))
+ rb_erase(&rs->rs_node, &rgd->rd_rstree);
+ BUG_ON(!rgd->rd_rs_cnt);
+ rgd->rd_rs_cnt--;
+
+ if (rs->rs_free) {
+ /* return reserved blocks to the rgrp and the ip */
+ BUG_ON(rs->rs_rgd->rd_reserved < rs->rs_free);
+ rs->rs_rgd->rd_reserved -= rs->rs_free;
+ rs->rs_free = 0;
+ clear_bit(GBF_FULL, &rs->rs_bi->bi_flags);
+ smp_mb__after_clear_bit();
+ }
+ /* We can't change any of the step 1 or step 2 components of the rs.
+ E.g. We can't set rs_rgd to NULL because the rgd glock is held and
+ dequeued through this pointer.
+ Can't: atomic_set(&rs->rs_sizehint, 0);
+ Can't: rs->rs_requested = 0;
+ Can't: rs->rs_rgd = NULL;*/
+ rs->rs_bi = NULL;
+ rs->rs_biblk = 0;
+}
+
+/**
+ * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
+ * @rs: The reservation to remove
+ *
+ */
+void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
+{
+ struct gfs2_rgrpd *rgd;
+
+ if (!gfs2_rs_active(rs))
+ return;
+
+ rgd = rs->rs_rgd;
+ spin_lock(&rgd->rd_rsspin);
+ __rs_deltree(rs);
+ spin_unlock(&rgd->rd_rsspin);
+}
+
+/**
+ * gfs2_rs_delete - delete a multi-block reservation
* @ip: The inode for this reservation
*
*/
@@ -444,12 +572,36 @@ void gfs2_rs_delete(struct gfs2_inode *ip)
{
down_write(&ip->i_rw_mutex);
if (ip->i_res) {
+ gfs2_rs_deltree(ip->i_res);
+ trace_gfs2_rs(ip, ip->i_res, TRACE_RS_DELETE);
+ BUG_ON(ip->i_res->rs_free);
kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
ip->i_res = NULL;
}
up_write(&ip->i_rw_mutex);
}
+/**
+ * return_all_reservations - return all reserved blocks back to the rgrp.
+ * @rgd: the rgrp that needs its space back
+ *
+ * We previously reserved a bunch of blocks for allocation. Now we need to
+ * give them back. This leave the reservation structures in tact, but removes
+ * all of their corresponding "no-fly zones".
+ */
+static void return_all_reservations(struct gfs2_rgrpd *rgd)
+{
+ struct rb_node *n;
+ struct gfs2_blkreserv *rs;
+
+ spin_lock(&rgd->rd_rsspin);
+ while ((n = rb_first(&rgd->rd_rstree))) {
+ rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
+ __rs_deltree(rs);
+ }
+ spin_unlock(&rgd->rd_rsspin);
+}
+
void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
{
struct rb_node *n;
@@ -472,6 +624,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
+/* Since each block in the file system is represented by two bits in the
+ * bitmap, one 64-bit word in the bitmap will represent 32 blocks.
+ * By reserving 32 blocks at a time, we can optimize / shortcut how we search
+ * through the bitmaps by looking a word at a time.
+ */
+#define RGRP_RSRV_MINBYTES 8
+#define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY))
+
struct gfs2_rgrpd;
struct gfs2_sbd;
struct gfs2_holder;
@@ -29,6 +37,8 @@ extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
-/* This is how to tell if a reservation is "inplace" reserved: */
+/* This is how to tell if a multi-block reservation is "inplace" reserved: */
static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
{
if (ip->i_res && ip->i_res->rs_requested)
@@ -70,4 +81,22 @@ static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
return 0;
}
+/* This is how to tell if a multi-block reservation is in the rgrp tree: */
+static inline int gfs2_rs_active(struct gfs2_blkreserv *rs)
+{
+ if (rs && rs->rs_bi)
+ return 1;
+ return 0;
+}
+
+static inline u32 gfs2_bi2rgd_blk(const struct gfs2_bitmap *bi, u32 blk)
+{
+ return (bi->bi_start * GFS2_NBBY) + blk;
+}
+
+static inline u64 gfs2_rs_startblk(const struct gfs2_blkreserv *rs)
+{
+ return gfs2_bi2rgd_blk(rs->rs_bi, rs->rs_biblk) + rs->rs_rgd->rd_data0;
+}
+
#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 7880687..b1502c4 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1420,6 +1420,10 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
return -EIO;
}