FAQ Search Today's Posts Mark Forums Read
» Video Reviews

» Linux Archive

Linux-archive is a website aiming to archive linux email lists and to make them easily accessible for linux users/developers.


» Sponsor

» Partners

» Sponsor

Go Back   Linux Archive > Redhat > Cluster Development

 
 
LinkBack Thread Tools
 
Old 10-04-2011, 04:39 PM
Bob Peterson
 
Default GFS2: Add readahead to sequential directory traversal

Hi,

This patch adds read-ahead capability to GFS2's
directory hash table management. It greatly improves
performance for some directory operations. For example:
In one of my file systems that has 1000 directories, each
of which has 1000 files, time to execute a recursive
ls (time ls -fR /mnt/gfs2 > /dev/null) was reduced
from 2m2.814s on a stock kernel to 0m45.938s.

Regards,

Bob Peterson
Red Hat File Systems

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
--
fs/gfs2/dir.c | 33 +++++++++++++++++++++++++++++++++
1 files changed, 33 insertions(+), 0 deletions(-)

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 2045d70..9b4262e 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1376,6 +1376,37 @@ out:
return error;
}

+static void gfs2_dir_readahead(struct inode *inode, __be64 *ht, unsigned hsize,
+ u32 index)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_glock *gl = ip->i_gl;
+ struct buffer_head *bh;
+ u64 blocknr = 0, last;
+ unsigned count = 0;
+
+ while (index < hsize) {
+ last = blocknr;
+ blocknr = be64_to_cpu(ht[index++]);
+ if (blocknr == last)
+ continue;
+ count++;
+ if (count > 128)
+ break;
+ bh = gfs2_getbuf(gl, blocknr, 1);
+ if (trylock_buffer(bh)) {
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ brelse(bh);
+ continue;
+ }
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READA | REQ_META, bh);
+ continue;
+ }
+ brelse(bh);
+ }
+}

/**
* dir_e_read - Reads the entries from a directory into a filldir buffer
@@ -1406,6 +1437,8 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
if (IS_ERR(lp))
return PTR_ERR(lp);

+ gfs2_dir_readahead(inode, lp, hsize, index);
+
while (index < hsize) {
error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
&copied, &depth,
 
Old 10-06-2011, 04:15 PM
Bob Peterson
 
Default GFS2: Add readahead to sequential directory traversal

----- Original Message -----
| Hi,
|
| Were we not intending to make this turn itself off in cases where it
| produces no benefit? I thought the plan was to track the state via
| the
| file descriptor in order to avoid readingahead the same blocks over
| and
| over again too,
|
| Steve.

Hi Steve,

Based on our discussion, I revised the patch to implement a
scheme whereby we don't duplicate read-ahead requests. However,
I decided against using the file descriptor in favor of a
simple bitmap that keeps track of where we've done read-ahead.

So now instead of a hash table in the inode, I have a structure
that contains two pointers: (1) hash table as before, and
(2) the bitmap to keep track of read-ahead requests.

I'm very happy with the results. I've got a file system with
500,000 files in a single directory. On my RHEL5 system, when
I perform the command: "time ls -fR /mnt/gfs2 > /dev/null" I
get these times:

Stock pre-release 5.8: 0m34.481s
Previously posted patch: 0m18.811s
With this patch: 0m9.843s

which is more than three times faster, and nearly twice as
fast as the version I previously posted. I haven't tried my
"1000x1000" test on it yet.

The upstream version of the patch is given below.
There is one noticeable difference between RHEL5 and upstream:
I changed kmalloc to vmalloc. Not sure your feelings on that.
Also, I have a feeling you're going to ask me to remove the
#define MAX_RA_BLOCKS and hard-code it to 32. What can I say?

Regards,

Bob Peterson
Red Hat File Systems
--
commit 19ec5ffb5eaf915944861f8ebd4c065818a238c5
Author: Bob Peterson <rpeterso@redhat.com>
Date: Thu Oct 6 10:34:05 2011 -0500

GFS2: Add readahead to sequential directory traversal

This patch adds read-ahead capability to GFS2's
directory hash table management. It greatly improves
performance for some directory operations. For example:
In one of my file systems that has 1000 directories, each
of which has 1000 files, time to execute a recursive
ls (time ls -fR /mnt/gfs2 > /dev/null) was reduced
from 2m2.814s on a stock kernel to 0m45.938s.

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 2045d70..1c89ae0 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -329,48 +329,58 @@ fail:
* gfs2_dir_get_hash_table - Get pointer to the dir hash table
* @ip: The inode in question
*
- * Returns: The hash table or an error
+ * Returns: an error code
*/

-static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
+static int gfs2_dir_get_hash_table(struct gfs2_inode *ip)
{
struct inode *inode = &ip->i_inode;
int ret;
- u32 hsize;
+ u32 hsize, rasize;
__be64 *hc;
+ u8 *ra;

BUG_ON(!(ip->i_diskflags & GFS2_DIF_EXHASH));

- hc = ip->i_hash_cache;
- if (hc)
- return hc;
+ if (ip->i_hash_cache.h_ht)
+ return 0;

hsize = 1 << ip->i_depth;
+ rasize = hsize / 8; /* bit array */
hsize *= sizeof(__be64);
if (hsize != i_size_read(&ip->i_inode)) {
gfs2_consist_inode(ip);
- return ERR_PTR(-EIO);
+ return -EIO;
}

- hc = kmalloc(hsize, GFP_NOFS);
- ret = -ENOMEM;
+ hc = vmalloc(hsize);
if (hc == NULL)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
+
+ ra = vzalloc(rasize);
+ if (ra == NULL) {
+ vfree(hc);
+ return -ENOMEM;
+ }

ret = gfs2_dir_read_data(ip, hc, hsize);
if (ret < 0) {
- kfree(hc);
- return ERR_PTR(ret);
+ vfree(hc);
+ vfree(ra);
+ return ret;
}

spin_lock(&inode->i_lock);
- if (ip->i_hash_cache)
- kfree(hc);
- else
- ip->i_hash_cache = hc;
+ if (ip->i_hash_cache.h_ht) {
+ vfree(hc);
+ vfree(ra);
+ } else {
+ ip->i_hash_cache.h_ht = hc;
+ ip->i_hash_cache.h_ra = ra;
+ }
spin_unlock(&inode->i_lock);

- return ip->i_hash_cache;
+ return 0;
}

/**
@@ -381,9 +391,12 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
*/
void gfs2_dir_hash_inval(struct gfs2_inode *ip)
{
- __be64 *hc = ip->i_hash_cache;
- ip->i_hash_cache = NULL;
- kfree(hc);
+ __be64 *hc = ip->i_hash_cache.h_ht;
+ u8 *h_ra = ip->i_hash_cache.h_ra;
+ ip->i_hash_cache.h_ht = NULL;
+ ip->i_hash_cache.h_ra = NULL;
+ vfree(hc);
+ vfree(h_ra);
}

static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent)
@@ -730,14 +743,15 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
* Returns: 0 on success, error code otherwise
*/

-static int get_leaf_nr(struct gfs2_inode *dip, u32 index,
- u64 *leaf_out)
+static int get_leaf_nr(struct gfs2_inode *dip, u32 index, u64 *leaf_out)
{
+ int error;
__be64 *hash;

- hash = gfs2_dir_get_hash_table(dip);
- if (IS_ERR(hash))
- return PTR_ERR(hash);
+ error = gfs2_dir_get_hash_table(dip);
+ if (error)
+ return error;
+ hash = dip->i_hash_cache.h_ht;
*leaf_out = be64_to_cpu(*(hash + index));
return 0;
}
@@ -1097,27 +1111,35 @@ fail_brelse:
static int dir_double_exhash(struct gfs2_inode *dip)
{
struct buffer_head *dibh;
- u32 hsize;
+ u32 hsize, rasize;
u32 hsize_bytes;
- __be64 *hc;
- __be64 *hc2, *h;
+ __be64 *hc, *hc2, *h;
+ u8 *ra;
int x;
- int error = 0;
+ int error;

hsize = 1 << dip->i_depth;
+ rasize = hsize / 8; /* bit array */
hsize_bytes = hsize * sizeof(__be64);

- hc = gfs2_dir_get_hash_table(dip);
- if (IS_ERR(hc))
- return PTR_ERR(hc);
+ error = gfs2_dir_get_hash_table(dip);
+ if (error)
+ return error;

- h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS);
- if (!hc2)
+ hc = dip->i_hash_cache.h_ht;
+ h = hc2 = vmalloc(hsize_bytes * 2);
+ if (h == NULL)
return -ENOMEM;

+ ra = vzalloc(rasize);
+ if (ra == NULL) {
+ error = -ENOMEM;
+ goto out_vfree;
+ }
+
error = gfs2_meta_inode_buffer(dip, &dibh);
if (error)
- goto out_kfree;
+ goto out_rafree;

for (x = 0; x < hsize; x++) {
*h++ = *hc;
@@ -1130,7 +1152,8 @@ static int dir_double_exhash(struct gfs2_inode *dip)
goto fail;

gfs2_dir_hash_inval(dip);
- dip->i_hash_cache = hc2;
+ dip->i_hash_cache.h_ht = hc2;
+ dip->i_hash_cache.h_ra = ra;
dip->i_depth++;
gfs2_dinode_out(dip, dibh->b_data);
brelse(dibh);
@@ -1142,8 +1165,11 @@ fail:
i_size_write(&dip->i_inode, hsize_bytes);
gfs2_dinode_out(dip, dibh->b_data);
brelse(dibh);
-out_kfree:
- kfree(hc2);
+
+out_rafree:
+ vfree(ra);
+out_vfree:
+ vfree(hc2);
return error;
}

@@ -1376,6 +1402,53 @@ out:
return error;
}

+#define MAX_RA_BLOCKS 32
+
+/* gfs2_dir_readahead - Issue read-ahead requests for leaf blocks.
+ *
+ * Note: we can't calculate each index like dir_e_read can because we don't
+ * have the leaf, and therefore we don't have the depth, and therefore we
+ * don't have the length. So we have to just read enough ahead to make up
+ * for the loss of information. */
+static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_glock *gl = ip->i_gl;
+ struct buffer_head *bh;
+ u64 blocknr = 0, last;
+ unsigned count = 0;
+ int ra_byte, ra_bit;
+
+ while (index < hsize) {
+ ra_byte = index / 8;
+ ra_bit = index % 8;
+ last = blocknr;
+ blocknr = be64_to_cpu(ip->i_hash_cache.h_ht[index++]);
+ if (blocknr == last)
+ continue;
+ count++;
+ if (count > MAX_RA_BLOCKS)
+ break;
+
+ /* figure out if we've already requested this block */
+ if ((ip->i_hash_cache.h_ra[ra_byte] >> ra_bit) & 0x01)
+ continue;
+
+ ip->i_hash_cache.h_ra[ra_byte] |= (1 << ra_bit);
+ bh = gfs2_getbuf(gl, blocknr, 1);
+ if (trylock_buffer(bh)) {
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ brelse(bh);
+ continue;
+ }
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READA | REQ_META, bh);
+ continue;
+ }
+ brelse(bh);
+ }
+}

/**
* dir_e_read - Reads the entries from a directory into a filldir buffer
@@ -1391,20 +1464,23 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
filldir_t filldir)
{
struct gfs2_inode *dip = GFS2_I(inode);
- u32 hsize, len = 0;
+ u32 hsize, len;
u32 hash, index;
__be64 *lp;
int copied = 0;
- int error = 0;
+ int error;
unsigned depth = 0;

hsize = 1 << dip->i_depth;
hash = gfs2_dir_offset2hash(*offset);
index = hash >> (32 - dip->i_depth);

- lp = gfs2_dir_get_hash_table(dip);
- if (IS_ERR(lp))
- return PTR_ERR(lp);
+ error = gfs2_dir_get_hash_table(dip);
+ if (error)
+ return error;
+
+ lp = dip->i_hash_cache.h_ht;
+ gfs2_dir_readahead(inode, hsize, index);

while (index < hsize) {
error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
@@ -1925,14 +2001,13 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
u32 index = 0, next_index;
__be64 *lp;
u64 leaf_no;
- int error = 0, last;
+ int error, last;

hsize = 1 << dip->i_depth;
-
- lp = gfs2_dir_get_hash_table(dip);
- if (IS_ERR(lp))
- return PTR_ERR(lp);
-
+ error = gfs2_dir_get_hash_table(dip);
+ if (error)
+ return error;
+ lp = dip->i_hash_cache.h_ht;
while (index < hsize) {
leaf_no = be64_to_cpu(lp[index]);
if (leaf_no) {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 892ac37..731d763 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -271,6 +271,11 @@ enum {
};


+struct gfs2_hash_table {
+ __be64 *h_ht; /* hash table data */
+ u8 *h_ra; /* array indicating whether hash table block was read */
+};
+
struct gfs2_inode {
struct inode i_inode;
u64 i_no_addr;
@@ -285,7 +290,7 @@ struct gfs2_inode {
u64 i_goal; /* goal block for allocations */
struct rw_semaphore i_rw_mutex;
struct list_head i_trunc_list;
- __be64 *i_hash_cache;
+ struct gfs2_hash_table i_hash_cache;
u32 i_entries;
u32 i_diskflags;
u8 i_height;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 8a139ff..a834b88 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -41,7 +41,8 @@ static void gfs2_init_inode_once(void *foo)
init_rwsem(&ip->i_rw_mutex);
INIT_LIST_HEAD(&ip->i_trunc_list);
ip->i_alloc = NULL;
- ip->i_hash_cache = NULL;
+ ip->i_hash_cache.h_ht = NULL;
+ ip->i_hash_cache.h_ra = NULL;
}

static void gfs2_init_glock_once(void *foo)
 
Old 10-21-2011, 04:53 PM
Bob Peterson
 
Default GFS2: Add readahead to sequential directory traversal

----- Original Message -----
| Well it is not called directly from NFS though, only from our getname
| function. I'd suggest we do something along the following lines:
|
| Create a new structure, something like this:
|
| struct gfs2_dir_ra_state {
| /* Readahead state variables... */
| };
|
| and then pass that to gfs2_dir_read() in addition to passing the
| offset.
| The getname call is trivial, since it always starts at the beginning
| and
| reads through sequentially, so it just needs a gfs2_dir_ra_state to
| be
| set up to take account of that. The struct gfs2_dir_ra_state can be
| embedded into the struct gfs2_file, so that gfs2_readdir() just
| passes a
| pointer to that.
|
| That will allow us to reset the readahead state upon lseek(SEEK_SET,
| 0);
| and also flag whether the accesses become non-sequential.
|
| So I think keeping the state in the struct file and passing it to our
| gfs2_file_read() function should not be too tricky,
|
| Steve.

Hi Steve,

Thanks for the feedback. How about something like this (follows):
Instead of creating a new read-ahead structure gfs2_dir_ra_state
I'm making (non-standard) use of vfs's read-ahead struct in the
file.

Regards,

Bob Peterson
Red Hat File System
--
commit 14e64999c89c7e2801d69b9e877f2fa0bf10ad70
Author: Bob Peterson <rpeterso@redhat.com>
Date: Fri Oct 7 11:55:31 2011 -0500

GFS2: Add readahead to sequential directory traversal

This patch adds read-ahead capability to GFS2's
directory hash table management. It greatly improves
performance for some directory operations. For example:
In one of my file systems that has 1000 directories, each
of which has 1000 files, time to execute a recursive
ls (time ls -fR /mnt/gfs2 > /dev/null) was reduced
from 2m2.814s on a stock kernel to 0m45.938s.

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 2045d70..30405d4 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -76,6 +76,8 @@
#define IS_LEAF 1 /* Hashed (leaf) directory */
#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */

+#define MAX_RA_BLOCKS 32 /* max read-ahead blocks */
+
#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))

@@ -1376,6 +1378,50 @@ out:
return error;
}

+/* gfs2_dir_readahead - Issue read-ahead requests for leaf blocks.
+ *
+ * Note: we can't calculate each index like dir_e_read can because we don't
+ * have the leaf, and therefore we don't have the depth, and therefore we
+ * don't have the length. So we have to just read enough ahead to make up
+ * for the loss of information. */
+static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
+ struct file_ra_state *f_ra)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_glock *gl = ip->i_gl;
+ struct buffer_head *bh;
+ u64 blocknr = 0, last;
+ unsigned count;
+
+ /* First check if we've already read-ahead for the whole range. */
+ if (!f_ra || index + MAX_RA_BLOCKS < f_ra->start)
+ return;
+
+ f_ra->start = max((pgoff_t)index, f_ra->start);
+ for (count = 0; count < MAX_RA_BLOCKS; count++) {
+ if (f_ra->start >= hsize) /* if exceeded the hash table */
+ break;
+
+ last = blocknr;
+ blocknr = be64_to_cpu(ip->i_hash_cache[f_ra->start]);
+ f_ra->start++;
+ if (blocknr == last)
+ continue;
+
+ bh = gfs2_getbuf(gl, blocknr, 1);
+ if (trylock_buffer(bh)) {
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ brelse(bh);
+ continue;
+ }
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READA | REQ_META, bh);
+ continue;
+ }
+ brelse(bh);
+ }
+}

/**
* dir_e_read - Reads the entries from a directory into a filldir buffer
@@ -1388,7 +1434,7 @@ out:
*/

static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir)
+ filldir_t filldir, struct file_ra_state *f_ra)
{
struct gfs2_inode *dip = GFS2_I(inode);
u32 hsize, len = 0;
@@ -1402,10 +1448,14 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
hash = gfs2_dir_offset2hash(*offset);
index = hash >> (32 - dip->i_depth);

+ if (f_ra && dip->i_hash_cache == NULL)
+ f_ra->start = 0;
lp = gfs2_dir_get_hash_table(dip);
if (IS_ERR(lp))
return PTR_ERR(lp);

+ gfs2_dir_readahead(inode, hsize, index, f_ra);
+
while (index < hsize) {
error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
&copied, &depth,
@@ -1423,7 +1473,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
}

int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir)
+ filldir_t filldir, struct file_ra_state *f_ra)
{
struct gfs2_inode *dip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -1437,7 +1487,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
return 0;

if (dip->i_diskflags & GFS2_DIF_EXHASH)
- return dir_e_read(inode, offset, opaque, filldir);
+ return dir_e_read(inode, offset, opaque, filldir, f_ra);

if (!gfs2_is_stuffed(dip)) {
gfs2_consist_inode(dip);
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index ff5772f..98c960b 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -25,7 +25,7 @@ extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
const struct gfs2_inode *ip);
extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir);
+ filldir_t filldir, struct file_ra_state *f_ra);
extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
const struct gfs2_inode *nip, unsigned int new_type);

diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index fe9945f..a581cd2 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -118,7 +118,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
if (error)
return error;

- error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir);
+ error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, NULL);

gfs2_glock_dq_uninit(&gh);

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 92c3db4..c9cbb5f 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -96,7 +96,7 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
return error;
}

- error = gfs2_dir_read(dir, &offset, dirent, filldir);
+ error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra);

gfs2_glock_dq_uninit(&d_gh);
 
Old 10-24-2011, 07:56 PM
Bob Peterson
 
Default GFS2: Add readahead to sequential directory traversal

----- Original Message -----
| Hi,
|
| That looks much better, but why not create & pass a file_ra_state in
| the
| NFS getname function so that you don't have to check for it being
| NULL?
| Since that function always reads everything up until the name it is
| looking for, it is an ideal place to do readahead as the access will
| always be sequential.
|
| Otherwise, it looks good to me,
|
| Steve.

Hi,

There isn't a file struct passed in to the nfs get_name function, but
I can do something like the following. Is this what you mean?

diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index a581cd2..70ba891 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -99,6 +99,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
struct gfs2_holder gh;
u64 offset = 0;
int error;
+ struct file_ra_state f_ra = { .start = 0 };

if (!dir)
return -EINVAL;
@@ -118,7 +119,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
if (error)
return error;

- error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, NULL);
+ error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, &f_ra);

gfs2_glock_dq_uninit(&gh);

Regards,

Bob Peterson
Red Hat File Systems
 
Old 10-27-2011, 04:16 PM
Bob Peterson
 
Default GFS2: Add readahead to sequential directory traversal

----- Original Message -----
| Hi,
|
| > Hi,
| >
| > There isn't a file struct passed in to the nfs get_name function,
| > but
| > I can do something like the following. Is this what you mean?
| >
| > diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
| > index a581cd2..70ba891 100644
| > --- a/fs/gfs2/export.c
| > +++ b/fs/gfs2/export.c
| > @@ -99,6 +99,7 @@ static int gfs2_get_name(struct dentry *parent,
| > char *name,
| > struct gfs2_holder gh;
| > u64 offset = 0;
| > int error;
| > + struct file_ra_state f_ra = { .start = 0 };
| >
| > if (!dir)
| > return -EINVAL;
| > @@ -118,7 +119,7 @@ static int gfs2_get_name(struct dentry *parent,
| > char *name,
| > if (error)
| > return error;
| >
| > - error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir,
| > NULL);
| > + error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir,
| > &f_ra);
| >
| > gfs2_glock_dq_uninit(&gh);
| >
| > Regards,
| >
| > Bob Peterson
| > Red Hat File Systems
|
| Yes, thats the kind of things. Looks like we can use
| file_ra_state_init() since it is exported too,
|
| Steve.

Hi,

I'd rather not call file_ra_state_init in this case.
Here is my latest patch for upstream GFS2 then.

Regards,

Bob Peterson
Red Hat File Systems
--
commit 17c4434902790347b4dd6abfdc158aa5a443f91b
Author: Bob Peterson <rpeterso@redhat.com>
Date: Fri Oct 7 11:55:31 2011 -0500

GFS2: Add readahead to sequential directory traversal

This patch adds read-ahead capability to GFS2's
directory hash table management. It greatly improves
performance for some directory operations. For example:
In one of my file systems that has 1000 directories, each
of which has 1000 files, time to execute a recursive
ls (time ls -fR /mnt/gfs2 > /dev/null) was reduced
from 2m2.814s on a stock kernel to 0m45.938s.

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 2045d70..30405d4 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -76,6 +76,8 @@
#define IS_LEAF 1 /* Hashed (leaf) directory */
#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */

+#define MAX_RA_BLOCKS 32 /* max read-ahead blocks */
+
#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))

@@ -1376,6 +1378,50 @@ out:
return error;
}

+/* gfs2_dir_readahead - Issue read-ahead requests for leaf blocks.
+ *
+ * Note: we can't calculate each index like dir_e_read can because we don't
+ * have the leaf, and therefore we don't have the depth, and therefore we
+ * don't have the length. So we have to just read enough ahead to make up
+ * for the loss of information. */
+static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
+ struct file_ra_state *f_ra)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_glock *gl = ip->i_gl;
+ struct buffer_head *bh;
+ u64 blocknr = 0, last;
+ unsigned count;
+
+ /* First check if we've already read-ahead for the whole range. */
+ if (!f_ra || index + MAX_RA_BLOCKS < f_ra->start)
+ return;
+
+ f_ra->start = max((pgoff_t)index, f_ra->start);
+ for (count = 0; count < MAX_RA_BLOCKS; count++) {
+ if (f_ra->start >= hsize) /* if exceeded the hash table */
+ break;
+
+ last = blocknr;
+ blocknr = be64_to_cpu(ip->i_hash_cache[f_ra->start]);
+ f_ra->start++;
+ if (blocknr == last)
+ continue;
+
+ bh = gfs2_getbuf(gl, blocknr, 1);
+ if (trylock_buffer(bh)) {
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ brelse(bh);
+ continue;
+ }
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READA | REQ_META, bh);
+ continue;
+ }
+ brelse(bh);
+ }
+}

/**
* dir_e_read - Reads the entries from a directory into a filldir buffer
@@ -1388,7 +1434,7 @@ out:
*/

static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir)
+ filldir_t filldir, struct file_ra_state *f_ra)
{
struct gfs2_inode *dip = GFS2_I(inode);
u32 hsize, len = 0;
@@ -1402,10 +1448,14 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
hash = gfs2_dir_offset2hash(*offset);
index = hash >> (32 - dip->i_depth);

+ if (f_ra && dip->i_hash_cache == NULL)
+ f_ra->start = 0;
lp = gfs2_dir_get_hash_table(dip);
if (IS_ERR(lp))
return PTR_ERR(lp);

+ gfs2_dir_readahead(inode, hsize, index, f_ra);
+
while (index < hsize) {
error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
&copied, &depth,
@@ -1423,7 +1473,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
}

int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir)
+ filldir_t filldir, struct file_ra_state *f_ra)
{
struct gfs2_inode *dip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -1437,7 +1487,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
return 0;

if (dip->i_diskflags & GFS2_DIF_EXHASH)
- return dir_e_read(inode, offset, opaque, filldir);
+ return dir_e_read(inode, offset, opaque, filldir, f_ra);

if (!gfs2_is_stuffed(dip)) {
gfs2_consist_inode(dip);
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index ff5772f..98c960b 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -25,7 +25,7 @@ extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
const struct gfs2_inode *ip);
extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir);
+ filldir_t filldir, struct file_ra_state *f_ra);
extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
const struct gfs2_inode *nip, unsigned int new_type);

diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index fe9945f..70ba891 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -99,6 +99,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
struct gfs2_holder gh;
u64 offset = 0;
int error;
+ struct file_ra_state f_ra = { .start = 0 };

if (!dir)
return -EINVAL;
@@ -118,7 +119,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
if (error)
return error;

- error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir);
+ error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, &f_ra);

gfs2_glock_dq_uninit(&gh);

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 92c3db4..c9cbb5f 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -96,7 +96,7 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
return error;
}

- error = gfs2_dir_read(dir, &offset, dirent, filldir);
+ error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra);

gfs2_glock_dq_uninit(&d_gh);
 
Old 01-05-2012, 10:51 AM
Steven Whitehouse
 
Default GFS2: Add readahead to sequential directory traversal

From: Bob Peterson <rpeterso@redhat.com>

This patch adds read-ahead capability to GFS2's
directory hash table management. It greatly improves
performance for some directory operations. For example:
In one of my file systems that has 1000 directories, each
of which has 1000 files, time to execute a recursive
ls (time ls -fR /mnt/gfs2 > /dev/null) was reduced
from 2m2.814s on a stock kernel to 0m45.938s.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 8ccad24..9144117 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -76,6 +76,8 @@
#define IS_LEAF 1 /* Hashed (leaf) directory */
#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */

+#define MAX_RA_BLOCKS 32 /* max read-ahead blocks */
+
#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))

@@ -1376,6 +1378,50 @@ out:
return error;
}

+/* gfs2_dir_readahead - Issue read-ahead requests for leaf blocks.
+ *
+ * Note: we can't calculate each index like dir_e_read can because we don't
+ * have the leaf, and therefore we don't have the depth, and therefore we
+ * don't have the length. So we have to just read enough ahead to make up
+ * for the loss of information. */
+static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
+ struct file_ra_state *f_ra)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_glock *gl = ip->i_gl;
+ struct buffer_head *bh;
+ u64 blocknr = 0, last;
+ unsigned count;
+
+ /* First check if we've already read-ahead for the whole range. */
+ if (!f_ra || index + MAX_RA_BLOCKS < f_ra->start)
+ return;
+
+ f_ra->start = max((pgoff_t)index, f_ra->start);
+ for (count = 0; count < MAX_RA_BLOCKS; count++) {
+ if (f_ra->start >= hsize) /* if exceeded the hash table */
+ break;
+
+ last = blocknr;
+ blocknr = be64_to_cpu(ip->i_hash_cache[f_ra->start]);
+ f_ra->start++;
+ if (blocknr == last)
+ continue;
+
+ bh = gfs2_getbuf(gl, blocknr, 1);
+ if (trylock_buffer(bh)) {
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ brelse(bh);
+ continue;
+ }
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READA | REQ_META, bh);
+ continue;
+ }
+ brelse(bh);
+ }
+}

/**
* dir_e_read - Reads the entries from a directory into a filldir buffer
@@ -1388,7 +1434,7 @@ out:
*/

static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir)
+ filldir_t filldir, struct file_ra_state *f_ra)
{
struct gfs2_inode *dip = GFS2_I(inode);
u32 hsize, len = 0;
@@ -1402,10 +1448,14 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
hash = gfs2_dir_offset2hash(*offset);
index = hash >> (32 - dip->i_depth);

+ if (f_ra && dip->i_hash_cache == NULL)
+ f_ra->start = 0;
lp = gfs2_dir_get_hash_table(dip);
if (IS_ERR(lp))
return PTR_ERR(lp);

+ gfs2_dir_readahead(inode, hsize, index, f_ra);
+
while (index < hsize) {
error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
&copied, &depth,
@@ -1423,7 +1473,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
}

int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir)
+ filldir_t filldir, struct file_ra_state *f_ra)
{
struct gfs2_inode *dip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -1437,7 +1487,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
return 0;

if (dip->i_diskflags & GFS2_DIF_EXHASH)
- return dir_e_read(inode, offset, opaque, filldir);
+ return dir_e_read(inode, offset, opaque, filldir, f_ra);

if (!gfs2_is_stuffed(dip)) {
gfs2_consist_inode(dip);
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index ff5772f..98c960b 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -25,7 +25,7 @@ extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
const struct gfs2_inode *ip);
extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir);
+ filldir_t filldir, struct file_ra_state *f_ra);
extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
const struct gfs2_inode *nip, unsigned int new_type);

diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index fe9945f..70ba891 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -99,6 +99,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
struct gfs2_holder gh;
u64 offset = 0;
int error;
+ struct file_ra_state f_ra = { .start = 0 };

if (!dir)
return -EINVAL;
@@ -118,7 +119,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
if (error)
return error;

- error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir);
+ error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, &f_ra);

gfs2_glock_dq_uninit(&gh);

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index ce36a56..46f6f9a 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -105,7 +105,7 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
return error;
}

- error = gfs2_dir_read(dir, &offset, dirent, filldir);
+ error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra);

gfs2_glock_dq_uninit(&d_gh);

--
1.7.4
 

Thread Tools




All times are GMT. The time now is 08:58 AM.

VBulletin, Copyright ©2000 - 2014, Jelsoft Enterprises Ltd.
Content Relevant URLs by vBSEO ©2007, Crawlability, Inc.
Copyright 2007 - 2008, www.linux-archive.org