FAQ Search Today's Posts Mark Forums Read
» Video Reviews

» Linux Archive

Linux-archive is a website aiming to archive linux email lists and to make them easily accessible for linux users/developers.


» Sponsor

» Partners

» Sponsor

Go Back   Linux Archive > Redhat > Cluster Development

 
 
LinkBack Thread Tools
 
Old 03-17-2011, 04:08 AM
Benjamin Marzinski
 
Default GFS2: directly write blocks past i_size

GFS2 was relying on the writepage code to write out the zeroed data for
fallocate. However, with FALLOC_FL_KEEP_SIZE set, this may be past i_size.
If it is, it will be ignored. To work around this, gfs2 now calls
write_dirty_buffer directly on the buffer_heads when FALLOC_FL_KEEP_SIZE
is set, and it's writing past i_size.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
---
fs/gfs2/file.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 52 insertions(+), 10 deletions(-)

Index: gfs2-2.6-nmw/fs/gfs2/file.c
================================================== =================
--- gfs2-2.6-nmw.orig/fs/gfs2/file.c
+++ gfs2-2.6-nmw/fs/gfs2/file.c
@@ -617,18 +617,55 @@ static ssize_t gfs2_file_aio_write(struc
return generic_file_aio_write(iocb, iov, nr_segs, pos);
}

-static void empty_write_end(struct page *page, unsigned from,
- unsigned to)
+static int empty_write_end(struct page *page, unsigned from,
+ unsigned to, int mode)
{
- struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+ struct inode *inode = page->mapping->host;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct buffer_head *bh;
+ int waiting = 0;
+ unsigned offset, blksize = 1 << inode->i_blkbits;
+ loff_t i_size = i_size_read(inode);
+ pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;

zero_user(page, from, to-from);
mark_page_accessed(page);

- if (!gfs2_is_writeback(ip))
- gfs2_page_add_databufs(ip, page, from, to);
+ if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
+ if (!gfs2_is_writeback(ip))
+ gfs2_page_add_databufs(ip, page, from, to);

- block_commit_write(page, from, to);
+ block_commit_write(page, from, to);
+ return 0;
+ }
+
+second_pass:
+ offset = 0;
+ bh = page_buffers(page);
+ while (offset < from) {
+ offset += blksize;
+ bh = bh->b_this_page;
+ }
+ while (offset < to) {
+ if (waiting){
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ return -EIO;
+ }
+ else {
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ clear_buffer_new(bh);
+ write_dirty_buffer(bh, WRITE);
+ }
+ offset += blksize;
+ bh = bh->b_this_page;
+ }
+ if (!waiting) {
+ waiting = 1;
+ goto second_pass;
+ }
+ return 0;
}

static int needs_empty_write(sector_t block, struct inode *inode)
@@ -643,7 +680,8 @@ static int needs_empty_write(sector_t bl
return !buffer_mapped(&bh_map);
}

-static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
+static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
+ int mode)
{
struct inode *inode = page->mapping->host;
unsigned start, end, next, blksize;
@@ -668,7 +706,9 @@ static int write_empty_blocks(struct pag
gfs2_block_map);
if (unlikely(ret))
return ret;
- empty_write_end(page, start, end);
+ ret = empty_write_end(page, start, end, mode);
+ if (unlikely(ret))
+ return ret;
end = 0;
}
start = next;
@@ -682,7 +722,9 @@ static int write_empty_blocks(struct pag
ret = __block_write_begin(page, start, end - start, gfs2_block_map);
if (unlikely(ret))
return ret;
- empty_write_end(page, start, end);
+ ret = empty_write_end(page, start, end, mode);
+ if (unlikely(ret))
+ return ret;
}

return 0;
@@ -731,7 +773,7 @@ static int fallocate_chunk(struct inode

if (curr == end)
to = end_offset;
- error = write_empty_blocks(page, from, to);
+ error = write_empty_blocks(page, from, to, mode);
if (!error && offset + to > inode->i_size &&
!(mode & FALLOC_FL_KEEP_SIZE)) {
i_size_write(inode, offset + to);
 
Old 03-17-2011, 08:26 AM
Steven Whitehouse
 
Default GFS2: directly write blocks past i_size

Hi,

On Thu, 2011-03-17 at 00:08 -0500, Benjamin Marzinski wrote:
> GFS2 was relying on the writepage code to write out the zeroed data for
> fallocate. However, with FALLOC_FL_KEEP_SIZE set, this may be past i_size.
> If it is, it will be ignored. To work around this, gfs2 now calls
> write_dirty_buffer directly on the buffer_heads when FALLOC_FL_KEEP_SIZE
> is set, and it's writing past i_size.
>
> Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
> ---
> fs/gfs2/file.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++----------
> 1 file changed, 52 insertions(+), 10 deletions(-)
>
Generally looks good, but a few minor points....

> Index: gfs2-2.6-nmw/fs/gfs2/file.c
> ================================================== =================
> --- gfs2-2.6-nmw.orig/fs/gfs2/file.c
> +++ gfs2-2.6-nmw/fs/gfs2/file.c
> @@ -617,18 +617,55 @@ static ssize_t gfs2_file_aio_write(struc
> return generic_file_aio_write(iocb, iov, nr_segs, pos);
> }
>
> -static void empty_write_end(struct page *page, unsigned from,
> - unsigned to)
> +static int empty_write_end(struct page *page, unsigned from,
> + unsigned to, int mode)
> {
> - struct gfs2_inode *ip = GFS2_I(page->mapping->host);
> + struct inode *inode = page->mapping->host;
> + struct gfs2_inode *ip = GFS2_I(inode);
> + struct buffer_head *bh;
> + int waiting = 0;
> + unsigned offset, blksize = 1 << inode->i_blkbits;
> + loff_t i_size = i_size_read(inode);
> + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
>
> zero_user(page, from, to-from);
> mark_page_accessed(page);
>
> - if (!gfs2_is_writeback(ip))
> - gfs2_page_add_databufs(ip, page, from, to);
> + if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
> + if (!gfs2_is_writeback(ip))
> + gfs2_page_add_databufs(ip, page, from, to);
>
> - block_commit_write(page, from, to);
> + block_commit_write(page, from, to);
> + return 0;
> + }
> +
> +second_pass:
> + offset = 0;
> + bh = page_buffers(page);
> + while (offset < from) {
> + offset += blksize;
> + bh = bh->b_this_page;
> + }
> + while (offset < to) {
> + if (waiting){
> + wait_on_buffer(bh);
> + if (!buffer_uptodate(bh))
> + return -EIO;
> + }
> + else {
The else should start on the line above to make it more obvious.

> + set_buffer_uptodate(bh);
> + mark_buffer_dirty(bh);
> + clear_buffer_new(bh);
> + write_dirty_buffer(bh, WRITE);
Should this be WRITE_SYNC or WRITE_SYNC_PLUG I wonder?

> + }
> + offset += blksize;
> + bh = bh->b_this_page;
> + }
> + if (!waiting) {
> + waiting = 1;
> + goto second_pass;
> + }
I think the code might be a bit cleaner if it was just written as two
loops, one after the other since most of the loop content seems to be
different according to weather "waiting" is set or not.

Otherwise I think this is a good solution,

Steve.

> + return 0;
> }
>
> static int needs_empty_write(sector_t block, struct inode *inode)
> @@ -643,7 +680,8 @@ static int needs_empty_write(sector_t bl
> return !buffer_mapped(&bh_map);
> }
>
> -static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
> +static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
> + int mode)
> {
> struct inode *inode = page->mapping->host;
> unsigned start, end, next, blksize;
> @@ -668,7 +706,9 @@ static int write_empty_blocks(struct pag
> gfs2_block_map);
> if (unlikely(ret))
> return ret;
> - empty_write_end(page, start, end);
> + ret = empty_write_end(page, start, end, mode);
> + if (unlikely(ret))
> + return ret;
> end = 0;
> }
> start = next;
> @@ -682,7 +722,9 @@ static int write_empty_blocks(struct pag
> ret = __block_write_begin(page, start, end - start, gfs2_block_map);
> if (unlikely(ret))
> return ret;
> - empty_write_end(page, start, end);
> + ret = empty_write_end(page, start, end, mode);
> + if (unlikely(ret))
> + return ret;
> }
>
> return 0;
> @@ -731,7 +773,7 @@ static int fallocate_chunk(struct inode
>
> if (curr == end)
> to = end_offset;
> - error = write_empty_blocks(page, from, to);
> + error = write_empty_blocks(page, from, to, mode);
> if (!error && offset + to > inode->i_size &&
> !(mode & FALLOC_FL_KEEP_SIZE)) {
> i_size_write(inode, offset + to);
>
 
Old 03-17-2011, 01:05 PM
Benjamin Marzinski
 
Default GFS2: directly write blocks past i_size

On Thu, Mar 17, 2011 at 09:26:08AM +0000, Steven Whitehouse wrote:
> Hi,
>
> On Thu, 2011-03-17 at 00:08 -0500, Benjamin Marzinski wrote:
> > GFS2 was relying on the writepage code to write out the zeroed data for
> > fallocate. However, with FALLOC_FL_KEEP_SIZE set, this may be past i_size.
> > If it is, it will be ignored. To work around this, gfs2 now calls
> > write_dirty_buffer directly on the buffer_heads when FALLOC_FL_KEEP_SIZE
> > is set, and it's writing past i_size.
> >
> > Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
> > ---
> > fs/gfs2/file.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++----------
> > 1 file changed, 52 insertions(+), 10 deletions(-)
> >
> Generally looks good, but a few minor points....
>
> > Index: gfs2-2.6-nmw/fs/gfs2/file.c
> > ================================================== =================
> > --- gfs2-2.6-nmw.orig/fs/gfs2/file.c
> > +++ gfs2-2.6-nmw/fs/gfs2/file.c
> > @@ -617,18 +617,55 @@ static ssize_t gfs2_file_aio_write(struc
> > return generic_file_aio_write(iocb, iov, nr_segs, pos);
> > }
> >
> > -static void empty_write_end(struct page *page, unsigned from,
> > - unsigned to)
> > +static int empty_write_end(struct page *page, unsigned from,
> > + unsigned to, int mode)
> > {
> > - struct gfs2_inode *ip = GFS2_I(page->mapping->host);
> > + struct inode *inode = page->mapping->host;
> > + struct gfs2_inode *ip = GFS2_I(inode);
> > + struct buffer_head *bh;
> > + int waiting = 0;
> > + unsigned offset, blksize = 1 << inode->i_blkbits;
> > + loff_t i_size = i_size_read(inode);
> > + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
> >
> > zero_user(page, from, to-from);
> > mark_page_accessed(page);
> >
> > - if (!gfs2_is_writeback(ip))
> > - gfs2_page_add_databufs(ip, page, from, to);
> > + if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
> > + if (!gfs2_is_writeback(ip))
> > + gfs2_page_add_databufs(ip, page, from, to);
> >
> > - block_commit_write(page, from, to);
> > + block_commit_write(page, from, to);
> > + return 0;
> > + }
> > +
> > +second_pass:
> > + offset = 0;
> > + bh = page_buffers(page);
> > + while (offset < from) {
> > + offset += blksize;
> > + bh = bh->b_this_page;
> > + }
> > + while (offset < to) {
> > + if (waiting){
> > + wait_on_buffer(bh);
> > + if (!buffer_uptodate(bh))
> > + return -EIO;
> > + }
> > + else {
> The else should start on the line above to make it more obvious.
>
> > + set_buffer_uptodate(bh);
> > + mark_buffer_dirty(bh);
> > + clear_buffer_new(bh);
> > + write_dirty_buffer(bh, WRITE);
> Should this be WRITE_SYNC or WRITE_SYNC_PLUG I wonder?
>
> > + }
> > + offset += blksize;
> > + bh = bh->b_this_page;
> > + }
> > + if (!waiting) {
> > + waiting = 1;
> > + goto second_pass;
> > + }
> I think the code might be a bit cleaner if it was just written as two
> loops, one after the other since most of the loop content seems to be
> different according to weather "waiting" is set or not.
>
> Otherwise I think this is a good solution,
>
> Steve.
>


Sure. This patch kept evolving, and I probably should have given it a
better cleanup once I was satisfied that it worked. I send off a new
version.

-Ben

> > + return 0;
> > }
> >
> > static int needs_empty_write(sector_t block, struct inode *inode)
> > @@ -643,7 +680,8 @@ static int needs_empty_write(sector_t bl
> > return !buffer_mapped(&bh_map);
> > }
> >
> > -static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
> > +static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
> > + int mode)
> > {
> > struct inode *inode = page->mapping->host;
> > unsigned start, end, next, blksize;
> > @@ -668,7 +706,9 @@ static int write_empty_blocks(struct pag
> > gfs2_block_map);
> > if (unlikely(ret))
> > return ret;
> > - empty_write_end(page, start, end);
> > + ret = empty_write_end(page, start, end, mode);
> > + if (unlikely(ret))
> > + return ret;
> > end = 0;
> > }
> > start = next;
> > @@ -682,7 +722,9 @@ static int write_empty_blocks(struct pag
> > ret = __block_write_begin(page, start, end - start, gfs2_block_map);
> > if (unlikely(ret))
> > return ret;
> > - empty_write_end(page, start, end);
> > + ret = empty_write_end(page, start, end, mode);
> > + if (unlikely(ret))
> > + return ret;
> > }
> >
> > return 0;
> > @@ -731,7 +773,7 @@ static int fallocate_chunk(struct inode
> >
> > if (curr == end)
> > to = end_offset;
> > - error = write_empty_blocks(page, from, to);
> > + error = write_empty_blocks(page, from, to, mode);
> > if (!error && offset + to > inode->i_size &&
> > !(mode & FALLOC_FL_KEEP_SIZE)) {
> > i_size_write(inode, offset + to);
> >
>
 
Old 03-18-2011, 01:53 AM
Benjamin Marzinski
 
Default GFS2: directly write blocks past i_size

> Should this be WRITE_SYNC or WRITE_SYNC_PLUG I wonder?
>

I tried these, but they didn't cause any improvement. That doesn't seem
terribly surprising, since they both set NOIDLE, which tells the cfq
scheduler not to wait for more IO from this process. I event tried
sending all the buffers except the last with WRITE_ODIRECT_PLUG, and
then the last buffer from each page with WRITE_SYNC, which seems like it
should work better, but in all the tests I did, it didn't appear to make
any difference. I'm going to come back to this code after the release,
and see if I can find any way to make back some of the lost performance.

As it is right now, fallocates that grow i_size are doing better than
double the speed of dd. fallocates that go past i_size are an order of
magnitude slower than dd. But at least they work correctly.

> > + }
> > + offset += blksize;
> > + bh = bh->b_this_page;
> > + }
> > + if (!waiting) {
> > + waiting = 1;
> > + goto second_pass;
> > + }
> I think the code might be a bit cleaner if it was just written as two
> loops, one after the other since most of the loop content seems to be
> different according to weather "waiting" is set or not.
>
> Otherwise I think this is a good solution,
>
> Steve.
>
> > + return 0;
> > }
> >
> > static int needs_empty_write(sector_t block, struct inode *inode)
> > @@ -643,7 +680,8 @@ static int needs_empty_write(sector_t bl
> > return !buffer_mapped(&bh_map);
> > }
> >
> > -static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
> > +static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
> > + int mode)
> > {
> > struct inode *inode = page->mapping->host;
> > unsigned start, end, next, blksize;
> > @@ -668,7 +706,9 @@ static int write_empty_blocks(struct pag
> > gfs2_block_map);
> > if (unlikely(ret))
> > return ret;
> > - empty_write_end(page, start, end);
> > + ret = empty_write_end(page, start, end, mode);
> > + if (unlikely(ret))
> > + return ret;
> > end = 0;
> > }
> > start = next;
> > @@ -682,7 +722,9 @@ static int write_empty_blocks(struct pag
> > ret = __block_write_begin(page, start, end - start, gfs2_block_map);
> > if (unlikely(ret))
> > return ret;
> > - empty_write_end(page, start, end);
> > + ret = empty_write_end(page, start, end, mode);
> > + if (unlikely(ret))
> > + return ret;
> > }
> >
> > return 0;
> > @@ -731,7 +773,7 @@ static int fallocate_chunk(struct inode
> >
> > if (curr == end)
> > to = end_offset;
> > - error = write_empty_blocks(page, from, to);
> > + error = write_empty_blocks(page, from, to, mode);
> > if (!error && offset + to > inode->i_size &&
> > !(mode & FALLOC_FL_KEEP_SIZE)) {
> > i_size_write(inode, offset + to);
> >
>
 
Old 03-18-2011, 01:54 AM
Benjamin Marzinski
 
Default GFS2: directly write blocks past i_size

GFS2 was relying on the writepage code to write out the zeroed data for
fallocate. However, with FALLOC_FL_KEEP_SIZE set, this may be past i_size.
If it is, it will be ignored. To work around this, gfs2 now calls
write_dirty_buffer directly on the buffer_heads when FALLOC_FL_KEEP_SIZE
is set, and it's writing past i_size.

This version is just a cleanup of my last version

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
---
fs/gfs2/file.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 48 insertions(+), 10 deletions(-)

Index: gfs2-2.6-nmw/fs/gfs2/file.c
================================================== =================
--- gfs2-2.6-nmw.orig/fs/gfs2/file.c
+++ gfs2-2.6-nmw/fs/gfs2/file.c
@@ -617,18 +617,51 @@ static ssize_t gfs2_file_aio_write(struc
return generic_file_aio_write(iocb, iov, nr_segs, pos);
}

-static void empty_write_end(struct page *page, unsigned from,
- unsigned to)
+static int empty_write_end(struct page *page, unsigned from,
+ unsigned to, int mode)
{
- struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+ struct inode *inode = page->mapping->host;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct buffer_head *bh;
+ unsigned offset, blksize = 1 << inode->i_blkbits;
+ pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;

zero_user(page, from, to-from);
mark_page_accessed(page);

- if (!gfs2_is_writeback(ip))
- gfs2_page_add_databufs(ip, page, from, to);
+ if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
+ if (!gfs2_is_writeback(ip))
+ gfs2_page_add_databufs(ip, page, from, to);
+
+ block_commit_write(page, from, to);
+ return 0;
+ }
+
+ offset = 0;
+ bh = page_buffers(page);
+ while (offset < to) {
+ if (offset >= from) {
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ clear_buffer_new(bh);
+ write_dirty_buffer(bh, WRITE);
+ }
+ offset += blksize;
+ bh = bh->b_this_page;
+ }

- block_commit_write(page, from, to);
+ offset = 0;
+ bh = page_buffers(page);
+ while (offset < to) {
+ if (offset >= from) {
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ return -EIO;
+ }
+ offset += blksize;
+ bh = bh->b_this_page;
+ }
+ return 0;
}

static int needs_empty_write(sector_t block, struct inode *inode)
@@ -643,7 +676,8 @@ static int needs_empty_write(sector_t bl
return !buffer_mapped(&bh_map);
}

-static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
+static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
+ int mode)
{
struct inode *inode = page->mapping->host;
unsigned start, end, next, blksize;
@@ -668,7 +702,9 @@ static int write_empty_blocks(struct pag
gfs2_block_map);
if (unlikely(ret))
return ret;
- empty_write_end(page, start, end);
+ ret = empty_write_end(page, start, end, mode);
+ if (unlikely(ret))
+ return ret;
end = 0;
}
start = next;
@@ -682,7 +718,9 @@ static int write_empty_blocks(struct pag
ret = __block_write_begin(page, start, end - start, gfs2_block_map);
if (unlikely(ret))
return ret;
- empty_write_end(page, start, end);
+ ret = empty_write_end(page, start, end, mode);
+ if (unlikely(ret))
+ return ret;
}

return 0;
@@ -731,7 +769,7 @@ static int fallocate_chunk(struct inode

if (curr == end)
to = end_offset;
- error = write_empty_blocks(page, from, to);
+ error = write_empty_blocks(page, from, to, mode);
if (!error && offset + to > inode->i_size &&
!(mode & FALLOC_FL_KEEP_SIZE)) {
i_size_write(inode, offset + to);
 
Old 03-21-2011, 09:08 AM
Steven Whitehouse
 
Default GFS2: directly write blocks past i_size

Hi,

That looks much better, thanks for fixing that. I've pushed it into the
-fixes tree,

Steve.

On Thu, 2011-03-17 at 21:54 -0500, Benjamin Marzinski wrote:
> GFS2 was relying on the writepage code to write out the zeroed data for
> fallocate. However, with FALLOC_FL_KEEP_SIZE set, this may be past i_size.
> If it is, it will be ignored. To work around this, gfs2 now calls
> write_dirty_buffer directly on the buffer_heads when FALLOC_FL_KEEP_SIZE
> is set, and it's writing past i_size.
>
> This version is just a cleanup of my last version
>
> Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
> ---
> fs/gfs2/file.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++----------
> 1 file changed, 48 insertions(+), 10 deletions(-)
>
> Index: gfs2-2.6-nmw/fs/gfs2/file.c
> ================================================== =================
> --- gfs2-2.6-nmw.orig/fs/gfs2/file.c
> +++ gfs2-2.6-nmw/fs/gfs2/file.c
> @@ -617,18 +617,51 @@ static ssize_t gfs2_file_aio_write(struc
> return generic_file_aio_write(iocb, iov, nr_segs, pos);
> }
>
> -static void empty_write_end(struct page *page, unsigned from,
> - unsigned to)
> +static int empty_write_end(struct page *page, unsigned from,
> + unsigned to, int mode)
> {
> - struct gfs2_inode *ip = GFS2_I(page->mapping->host);
> + struct inode *inode = page->mapping->host;
> + struct gfs2_inode *ip = GFS2_I(inode);
> + struct buffer_head *bh;
> + unsigned offset, blksize = 1 << inode->i_blkbits;
> + pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
>
> zero_user(page, from, to-from);
> mark_page_accessed(page);
>
> - if (!gfs2_is_writeback(ip))
> - gfs2_page_add_databufs(ip, page, from, to);
> + if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
> + if (!gfs2_is_writeback(ip))
> + gfs2_page_add_databufs(ip, page, from, to);
> +
> + block_commit_write(page, from, to);
> + return 0;
> + }
> +
> + offset = 0;
> + bh = page_buffers(page);
> + while (offset < to) {
> + if (offset >= from) {
> + set_buffer_uptodate(bh);
> + mark_buffer_dirty(bh);
> + clear_buffer_new(bh);
> + write_dirty_buffer(bh, WRITE);
> + }
> + offset += blksize;
> + bh = bh->b_this_page;
> + }
>
> - block_commit_write(page, from, to);
> + offset = 0;
> + bh = page_buffers(page);
> + while (offset < to) {
> + if (offset >= from) {
> + wait_on_buffer(bh);
> + if (!buffer_uptodate(bh))
> + return -EIO;
> + }
> + offset += blksize;
> + bh = bh->b_this_page;
> + }
> + return 0;
> }
>
> static int needs_empty_write(sector_t block, struct inode *inode)
> @@ -643,7 +676,8 @@ static int needs_empty_write(sector_t bl
> return !buffer_mapped(&bh_map);
> }
>
> -static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
> +static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
> + int mode)
> {
> struct inode *inode = page->mapping->host;
> unsigned start, end, next, blksize;
> @@ -668,7 +702,9 @@ static int write_empty_blocks(struct pag
> gfs2_block_map);
> if (unlikely(ret))
> return ret;
> - empty_write_end(page, start, end);
> + ret = empty_write_end(page, start, end, mode);
> + if (unlikely(ret))
> + return ret;
> end = 0;
> }
> start = next;
> @@ -682,7 +718,9 @@ static int write_empty_blocks(struct pag
> ret = __block_write_begin(page, start, end - start, gfs2_block_map);
> if (unlikely(ret))
> return ret;
> - empty_write_end(page, start, end);
> + ret = empty_write_end(page, start, end, mode);
> + if (unlikely(ret))
> + return ret;
> }
>
> return 0;
> @@ -731,7 +769,7 @@ static int fallocate_chunk(struct inode
>
> if (curr == end)
> to = end_offset;
> - error = write_empty_blocks(page, from, to);
> + error = write_empty_blocks(page, from, to, mode);
> if (!error && offset + to > inode->i_size &&
> !(mode & FALLOC_FL_KEEP_SIZE)) {
> i_size_write(inode, offset + to);
>
 
Old 04-19-2011, 08:40 AM
Steven Whitehouse
 
Default GFS2: directly write blocks past i_size

From: Benjamin Marzinski <bmarzins@redhat.com>

GFS2 was relying on the writepage code to write out the zeroed data for
fallocate. However, with FALLOC_FL_KEEP_SIZE set, this may be past i_size.
If it is, it will be ignored. To work around this, gfs2 now calls
write_dirty_buffer directly on the buffer_heads when FALLOC_FL_KEEP_SIZE
is set, and it's writing past i_size.

This version is just a cleanup of my last version

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index b2682e0..e483108 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -617,18 +617,51 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
return generic_file_aio_write(iocb, iov, nr_segs, pos);
}

-static void empty_write_end(struct page *page, unsigned from,
- unsigned to)
+static int empty_write_end(struct page *page, unsigned from,
+ unsigned to, int mode)
{
- struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+ struct inode *inode = page->mapping->host;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct buffer_head *bh;
+ unsigned offset, blksize = 1 << inode->i_blkbits;
+ pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;

zero_user(page, from, to-from);
mark_page_accessed(page);

- if (!gfs2_is_writeback(ip))
- gfs2_page_add_databufs(ip, page, from, to);
+ if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
+ if (!gfs2_is_writeback(ip))
+ gfs2_page_add_databufs(ip, page, from, to);
+
+ block_commit_write(page, from, to);
+ return 0;
+ }
+
+ offset = 0;
+ bh = page_buffers(page);
+ while (offset < to) {
+ if (offset >= from) {
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ clear_buffer_new(bh);
+ write_dirty_buffer(bh, WRITE);
+ }
+ offset += blksize;
+ bh = bh->b_this_page;
+ }

- block_commit_write(page, from, to);
+ offset = 0;
+ bh = page_buffers(page);
+ while (offset < to) {
+ if (offset >= from) {
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ return -EIO;
+ }
+ offset += blksize;
+ bh = bh->b_this_page;
+ }
+ return 0;
}

static int needs_empty_write(sector_t block, struct inode *inode)
@@ -643,7 +676,8 @@ static int needs_empty_write(sector_t block, struct inode *inode)
return !buffer_mapped(&bh_map);
}

-static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
+static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
+ int mode)
{
struct inode *inode = page->mapping->host;
unsigned start, end, next, blksize;
@@ -668,7 +702,9 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
gfs2_block_map);
if (unlikely(ret))
return ret;
- empty_write_end(page, start, end);
+ ret = empty_write_end(page, start, end, mode);
+ if (unlikely(ret))
+ return ret;
end = 0;
}
start = next;
@@ -682,7 +718,9 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
ret = __block_write_begin(page, start, end - start, gfs2_block_map);
if (unlikely(ret))
return ret;
- empty_write_end(page, start, end);
+ ret = empty_write_end(page, start, end, mode);
+ if (unlikely(ret))
+ return ret;
}

return 0;
@@ -731,7 +769,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,

if (curr == end)
to = end_offset;
- error = write_empty_blocks(page, from, to);
+ error = write_empty_blocks(page, from, to, mode);
if (!error && offset + to > inode->i_size &&
!(mode & FALLOC_FL_KEEP_SIZE)) {
i_size_write(inode, offset + to);
--
1.7.4
 

Thread Tools




All times are GMT. The time now is 10:14 PM.

VBulletin, Copyright ©2000 - 2014, Jelsoft Enterprises Ltd.
Content Relevant URLs by vBSEO ©2007, Crawlability, Inc.
Copyright 2007 - 2008, www.linux-archive.org