08-06-2010, 04:04 PM
Tejun Heo

relaxed barriers

Hello,

So, here's my shot at it. After this patch, barrier no longer
dictates the ordering of other requests. The block layer sequences
the barrier request without interfering with other requests (not even
elevator draining). Multiple pending barriers are handled by saving
them in a separate queue and servicing them one by one. Basically,
barrier sequences form a separate FIFO command stream independent of
other requests, and all ordering between the two streams is the
filesystem's responsibility.
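
For illustration only (a sketch, not something this patch adds): a
journalling filesystem commit would now do its own waiting where it
used to rely on the block layer draining the queue. Roughly, assuming
hypothetical fs-side helpers and a made-up struct journal:

	static void journal_commit(struct journal *journal)
	{
		struct bio *bio;

		/* data writes first; the fs itself waits for them now */
		submit_journal_data(journal);	/* hypothetical helper */
		wait_journal_data(journal);	/* hypothetical helper */

		/* empty barrier bio, sequenced by the barrier FIFO */
		bio = bio_alloc(GFP_NOFS, 0);
		bio->bi_bdev = journal->j_dev;
		bio->bi_end_io = commit_end_io;	/* hypothetical callback */
		bio->bi_private = journal;
		submit_bio(WRITE_BARRIER, bio);
	}

The block layer only guarantees the PREFLUSH/BAR/POSTFLUSH sequence of
that one request; ordering against the data writes comes solely from
the explicit wait above.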

Ordered tag support is dropped as no one seems to be making any
meaningful use of it. I'm fairly skeptical about its usefulness
anyway. The only thing ordered tags save is the latency between command
completions and issues in barrier sequences, which isn't much to begin
with, and they impose additional ordering restrictions compared to
ordering in software (ordered tag commands will unnecessarily affect
the processing of simple tag commands).

Lightly tested for all three BAR (!WC), FLUSH and FUA cases. The
multiple pending barrier code path isn't tested yet.

Christoph, does this look like something the filesystems can use or
have I misunderstood something?

Thanks.

NOT_SIGNED_OFF_YET
---
block/blk-barrier.c | 253 +++++++++++++++----------------------------
block/blk-core.c | 31 ++---
block/blk.h | 5
block/elevator.c | 80 +------------
drivers/block/brd.c | 2
drivers/block/loop.c | 2
drivers/block/osdblk.c | 2
drivers/block/pktcdvd.c | 1
drivers/block/ps3disk.c | 3
drivers/block/virtio_blk.c | 4
drivers/block/xen-blkfront.c | 2
drivers/ide/ide-disk.c | 4
drivers/md/dm.c | 3
drivers/mmc/card/queue.c | 2
drivers/s390/block/dasd.c | 2
drivers/scsi/sd.c | 8 -
include/linux/blkdev.h | 59 +++-------
include/linux/elevator.h | 6 -
18 files changed, 154 insertions(+), 315 deletions(-)

Index: work/block/blk-barrier.c
===================================================================
--- work.orig/block/blk-barrier.c
+++ work/block/blk-barrier.c
@@ -9,6 +9,8 @@

#include "blk.h"

+static struct request *queue_next_ordseq(struct request_queue *q);
+
/**
* blk_queue_ordered - does this queue support ordered writes
* @q: the request queue
@@ -31,13 +33,8 @@ int blk_queue_ordered(struct request_que
return -EINVAL;
}

- if (ordered != QUEUE_ORDERED_NONE &&
- ordered != QUEUE_ORDERED_DRAIN &&
- ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
- ordered != QUEUE_ORDERED_DRAIN_FUA &&
- ordered != QUEUE_ORDERED_TAG &&
- ordered != QUEUE_ORDERED_TAG_FLUSH &&
- ordered != QUEUE_ORDERED_TAG_FUA) {
+ if (ordered != QUEUE_ORDERED_NONE && ordered != QUEUE_ORDERED_BAR &&
+ ordered != QUEUE_ORDERED_FLUSH && ordered != QUEUE_ORDERED_FUA) {
printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
return -EINVAL;
}
@@ -60,38 +57,10 @@ unsigned blk_ordered_cur_seq(struct requ
return 1 << ffz(q->ordseq);
}

-unsigned blk_ordered_req_seq(struct request *rq)
+static struct request *blk_ordered_complete_seq(struct request_queue *q,
+ unsigned seq, int error)
{
- struct request_queue *q = rq->q;
-
- BUG_ON(q->ordseq == 0);
-
- if (rq == &q->pre_flush_rq)
- return QUEUE_ORDSEQ_PREFLUSH;
- if (rq == &q->bar_rq)
- return QUEUE_ORDSEQ_BAR;
- if (rq == &q->post_flush_rq)
- return QUEUE_ORDSEQ_POSTFLUSH;
-
- /*
- * !fs requests don't need to follow barrier ordering. Always
- * put them at the front. This fixes the following deadlock.
- *
- * http://thread.gmane.org/gmane.linux.kernel/537473
- */
- if (!blk_fs_request(rq))
- return QUEUE_ORDSEQ_DRAIN;
-
- if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
- (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
- return QUEUE_ORDSEQ_DRAIN;
- else
- return QUEUE_ORDSEQ_DONE;
-}
-
-bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
-{
- struct request *rq;
+ struct request *rq = NULL;

if (error && !q->orderr)
q->orderr = error;
@@ -99,16 +68,22 @@ bool blk_ordered_complete_seq(struct req
BUG_ON(q->ordseq & seq);
q->ordseq |= seq;

- if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
- return false;
-
- /*
- * Okay, sequence complete.
- */
- q->ordseq = 0;
- rq = q->orig_bar_rq;
- __blk_end_request_all(rq, q->orderr);
- return true;
+ if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) {
+ /* not complete yet, queue the next ordered sequence */
+ rq = queue_next_ordseq(q);
+ } else {
+ /* complete this barrier request */
+ __blk_end_request_all(q->orig_bar_rq, q->orderr);
+ q->orig_bar_rq = NULL;
+ q->ordseq = 0;
+
+ /* dispatch the next barrier if there's one */
+ if (!list_empty(&q->pending_barriers)) {
+ rq = list_entry_rq(q->pending_barriers.next);
+ list_move(&rq->queuelist, &q->queue_head);
+ }
+ }
+ return rq;
}

static void pre_flush_end_io(struct request *rq, int error)
@@ -129,21 +104,10 @@ static void post_flush_end_io(struct req
blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
}

-static void queue_flush(struct request_queue *q, unsigned which)
+static void queue_flush(struct request_queue *q, struct request *rq,
+ rq_end_io_fn *end_io)
{
- struct request *rq;
- rq_end_io_fn *end_io;
-
- if (which == QUEUE_ORDERED_DO_PREFLUSH) {
- rq = &q->pre_flush_rq;
- end_io = pre_flush_end_io;
- } else {
- rq = &q->post_flush_rq;
- end_io = post_flush_end_io;
- }
-
blk_rq_init(q, rq);
- rq->cmd_flags = REQ_HARDBARRIER;
rq->rq_disk = q->bar_rq.rq_disk;
rq->end_io = end_io;
q->prepare_flush_fn(q, rq);
@@ -151,130 +115,93 @@ static void queue_flush(struct request_q
elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
}

-static inline bool start_ordered(struct request_queue *q, struct request **rqp)
+static struct request *queue_next_ordseq(struct request_queue *q)
{
- struct request *rq = *rqp;
- unsigned skip = 0;
-
- q->orderr = 0;
- q->ordered = q->next_ordered;
- q->ordseq |= QUEUE_ORDSEQ_STARTED;
-
- /*
- * For an empty barrier, there's no actual BAR request, which
- * in turn makes POSTFLUSH unnecessary. Mask them off.
- */
- if (!blk_rq_sectors(rq)) {
- q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
- QUEUE_ORDERED_DO_POSTFLUSH);
- /*
- * Empty barrier on a write-through device w/ ordered
- * tag has no command to issue and without any command
- * to issue, ordering by tag can't be used. Drain
- * instead.
- */
- if ((q->ordered & QUEUE_ORDERED_BY_TAG) &&
- !(q->ordered & QUEUE_ORDERED_DO_PREFLUSH))
- q->ordered &= ~QUEUE_ORDERED_BY_TAG;
- }
-
- /* stash away the original request */
- blk_dequeue_request(rq);
- q->orig_bar_rq = rq;
- rq = NULL;
-
- /*
- * Queue ordered sequence. As we stack them at the head, we
- * need to queue in reverse order. Note that we rely on that
- * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
- * request gets inbetween ordered sequence.
- */
- if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) {
- queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
- rq = &q->post_flush_rq;
- } else
- skip |= QUEUE_ORDSEQ_POSTFLUSH;
+ struct request *rq = &q->bar_rq;

- if (q->ordered & QUEUE_ORDERED_DO_BAR) {
- rq = &q->bar_rq;
+ switch (blk_ordered_cur_seq(q)) {
+ case QUEUE_ORDSEQ_PREFLUSH:
+ queue_flush(q, rq, pre_flush_end_io);
+ break;

+ case QUEUE_ORDSEQ_BAR:
/* initialize proxy request and queue it */
blk_rq_init(q, rq);
- if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
- rq->cmd_flags |= REQ_RW;
+ init_request_from_bio(rq, q->orig_bar_rq->bio);
+ rq->cmd_flags &= ~REQ_HARDBARRIER;
if (q->ordered & QUEUE_ORDERED_DO_FUA)
rq->cmd_flags |= REQ_FUA;
- init_request_from_bio(rq, q->orig_bar_rq->bio);
rq->end_io = bar_end_io;

elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
- } else
- skip |= QUEUE_ORDSEQ_BAR;
+ break;

- if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) {
- queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH);
- rq = &q->pre_flush_rq;
- } else
- skip |= QUEUE_ORDSEQ_PREFLUSH;
+ case QUEUE_ORDSEQ_POSTFLUSH:
+ queue_flush(q, rq, post_flush_end_io);
+ break;

- if (!(q->ordered & QUEUE_ORDERED_BY_TAG) && queue_in_flight(q))
- rq = NULL;
- else
- skip |= QUEUE_ORDSEQ_DRAIN;
-
- *rqp = rq;
-
- /*
- * Complete skipped sequences. If whole sequence is complete,
- * return false to tell elevator that this request is gone.
- */
- return !blk_ordered_complete_seq(q, skip, 0);
+ default:
+ BUG();
+ }
+ return rq;
}

-bool blk_do_ordered(struct request_queue *q, struct request **rqp)
+struct request *blk_do_ordered(struct request_queue *q, struct request *rq)
{
- struct request *rq = *rqp;
- const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
+ unsigned skip = 0;

- if (!q->ordseq) {
- if (!is_barrier)
- return true;
-
- if (q->next_ordered != QUEUE_ORDERED_NONE)
- return start_ordered(q, rqp);
- else {
- /*
- * Queue ordering not supported. Terminate
- * with prejudice.
- */
- blk_dequeue_request(rq);
- __blk_end_request_all(rq, -EOPNOTSUPP);
- *rqp = NULL;
- return false;
- }
+ if (!blk_barrier_rq(rq))
+ return rq;
+
+ if (q->ordseq) {
+ /*
+ * Barrier is already in progress and they can't be
+ * processed in parallel. Queue for later processing.
+ */
+ list_move_tail(&rq->queuelist, &q->pending_barriers);
+ return NULL;
+ }
+
+ if (unlikely(q->next_ordered == QUEUE_ORDERED_NONE)) {
+ /*
+ * Queue ordering not supported. Terminate
+ * with prejudice.
+ */
+ blk_dequeue_request(rq);
+ __blk_end_request_all(rq, -EOPNOTSUPP);
+ return NULL;
}

/*
- * Ordered sequence in progress
+ * Start a new ordered sequence
*/
+ q->orderr = 0;
+ q->ordered = q->next_ordered;
+ q->ordseq |= QUEUE_ORDSEQ_STARTED;

- /* Special requests are not subject to ordering rules. */
- if (!blk_fs_request(rq) &&
- rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
- return true;
-
- if (q->ordered & QUEUE_ORDERED_BY_TAG) {
- /* Ordered by tag. Blocking the next barrier is enough. */
- if (is_barrier && rq != &q->bar_rq)
- *rqp = NULL;
- } else {
- /* Ordered by draining. Wait for turn. */
- WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
- if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
- *rqp = NULL;
- }
+ /*
+ * For an empty barrier, there's no actual BAR request, which
+ * in turn makes POSTFLUSH unnecessary. Mask them off.
+ */
+ if (!blk_rq_sectors(rq))
+ q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
+ QUEUE_ORDERED_DO_POSTFLUSH);
+
+ /* stash away the original request */
+ blk_dequeue_request(rq);
+ q->orig_bar_rq = rq;
+
+ if (!(q->ordered & QUEUE_ORDERED_DO_PREFLUSH))
+ skip |= QUEUE_ORDSEQ_PREFLUSH;
+
+ if (!(q->ordered & QUEUE_ORDERED_DO_BAR))
+ skip |= QUEUE_ORDSEQ_BAR;
+
+ if (!(q->ordered & QUEUE_ORDERED_DO_POSTFLUSH))
+ skip |= QUEUE_ORDSEQ_POSTFLUSH;

- return true;
+ /* complete skipped sequences and return the first sequence */
+ return blk_ordered_complete_seq(q, skip, 0);
}

static void bio_end_empty_barrier(struct bio *bio, int err)
Index: work/include/linux/blkdev.h
===================================================================
--- work.orig/include/linux/blkdev.h
+++ work/include/linux/blkdev.h
@@ -106,7 +106,6 @@ enum rq_flag_bits {
__REQ_FAILED, /* set if the request failed */
__REQ_QUIET, /* don't worry about errors */
__REQ_PREEMPT, /* set for "ide_preempt" requests */
- __REQ_ORDERED_COLOR, /* is before or after barrier */
__REQ_RW_SYNC, /* request is sync (sync write or read) */
__REQ_ALLOCED, /* request came from our alloc pool */
__REQ_RW_META, /* metadata io request */
@@ -135,7 +134,6 @@ enum rq_flag_bits {
#define REQ_FAILED (1 << __REQ_FAILED)
#define REQ_QUIET (1 << __REQ_QUIET)
#define REQ_PREEMPT (1 << __REQ_PREEMPT)
-#define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR)
#define REQ_RW_SYNC (1 << __REQ_RW_SYNC)
#define REQ_ALLOCED (1 << __REQ_ALLOCED)
#define REQ_RW_META (1 << __REQ_RW_META)
@@ -437,9 +435,10 @@ struct request_queue
* reserved for flush operations
*/
unsigned int ordered, next_ordered, ordseq;
- int orderr, ordcolor;
- struct request pre_flush_rq, bar_rq, post_flush_rq;
- struct request *orig_bar_rq;
+ int orderr;
+ struct request bar_rq;
+ struct request *orig_bar_rq;
+ struct list_head pending_barriers;

struct mutex sysfs_lock;

@@ -543,47 +542,33 @@ enum {
* Hardbarrier is supported with one of the following methods.
*
* NONE : hardbarrier unsupported
- * DRAIN : ordering by draining is enough
- * DRAIN_FLUSH : ordering by draining w/ pre and post flushes
- * DRAIN_FUA : ordering by draining w/ pre flush and FUA write
- * TAG : ordering by tag is enough
- * TAG_FLUSH : ordering by tag w/ pre and post flushes
- * TAG_FUA : ordering by tag w/ pre flush and FUA write
- */
- QUEUE_ORDERED_BY_TAG = 0x02,
- QUEUE_ORDERED_DO_PREFLUSH = 0x10,
- QUEUE_ORDERED_DO_BAR = 0x20,
- QUEUE_ORDERED_DO_POSTFLUSH = 0x40,
- QUEUE_ORDERED_DO_FUA = 0x80,
+ * BAR : writing out barrier is enough
+ * FLUSH : barrier and surrounding pre and post flushes
+ * FUA : FUA barrier w/ pre flush
+ */
+ QUEUE_ORDERED_DO_PREFLUSH = 1 << 0,
+ QUEUE_ORDERED_DO_BAR = 1 << 1,
+ QUEUE_ORDERED_DO_POSTFLUSH = 1 << 2,
+ QUEUE_ORDERED_DO_FUA = 1 << 3,

- QUEUE_ORDERED_NONE = 0x00,
+ QUEUE_ORDERED_NONE = 0,

- QUEUE_ORDERED_DRAIN = QUEUE_ORDERED_DO_BAR,
- QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN |
+ QUEUE_ORDERED_BAR = QUEUE_ORDERED_DO_BAR,
+ QUEUE_ORDERED_FLUSH = QUEUE_ORDERED_DO_BAR |
QUEUE_ORDERED_DO_PREFLUSH |
QUEUE_ORDERED_DO_POSTFLUSH,
- QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN |
- QUEUE_ORDERED_DO_PREFLUSH |
- QUEUE_ORDERED_DO_FUA,
-
- QUEUE_ORDERED_TAG = QUEUE_ORDERED_BY_TAG |
- QUEUE_ORDERED_DO_BAR,
- QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG |
- QUEUE_ORDERED_DO_PREFLUSH |
- QUEUE_ORDERED_DO_POSTFLUSH,
- QUEUE_ORDERED_TAG_FUA = QUEUE_ORDERED_TAG |
+ QUEUE_ORDERED_FUA = QUEUE_ORDERED_DO_BAR |
QUEUE_ORDERED_DO_PREFLUSH |
QUEUE_ORDERED_DO_FUA,

/*
* Ordered operation sequence
*/
- QUEUE_ORDSEQ_STARTED = 0x01, /* flushing in progress */
- QUEUE_ORDSEQ_DRAIN = 0x02, /* waiting for the queue to be drained */
- QUEUE_ORDSEQ_PREFLUSH = 0x04, /* pre-flushing in progress */
- QUEUE_ORDSEQ_BAR = 0x08, /* original barrier req in progress */
- QUEUE_ORDSEQ_POSTFLUSH = 0x10, /* post-flushing in progress */
- QUEUE_ORDSEQ_DONE = 0x20,
+ QUEUE_ORDSEQ_STARTED = (1 << 0), /* flushing in progress */
+ QUEUE_ORDSEQ_PREFLUSH = (1 << 1), /* pre-flushing in progress */
+ QUEUE_ORDSEQ_BAR = (1 << 2), /* barrier write in progress */
+ QUEUE_ORDSEQ_POSTFLUSH = (1 << 3), /* post-flushing in progress */
+ QUEUE_ORDSEQ_DONE = (1 << 4),
};

#define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
@@ -965,10 +950,8 @@ extern void blk_queue_rq_timed_out(struc
extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *);
-extern bool blk_do_ordered(struct request_queue *, struct request **);
extern unsigned blk_ordered_cur_seq(struct request_queue *);
extern unsigned blk_ordered_req_seq(struct request *);
-extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int);

extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
extern void blk_dump_rq_flags(struct request *, char *);
Index: work/drivers/block/brd.c
===================================================================
--- work.orig/drivers/block/brd.c
+++ work/drivers/block/brd.c
@@ -479,7 +479,7 @@ static struct brd_device *brd_alloc(int
if (!brd->brd_queue)
goto out_free_dev;
blk_queue_make_request(brd->brd_queue, brd_make_request);
- blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG, NULL);
+ blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_BAR, NULL);
blk_queue_max_hw_sectors(brd->brd_queue, 1024);
blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);

Index: work/drivers/block/virtio_blk.c
===================================================================
--- work.orig/drivers/block/virtio_blk.c
+++ work/drivers/block/virtio_blk.c
@@ -368,10 +368,10 @@ static int __devinit virtblk_probe(struc

/* If barriers are supported, tell block layer that queue is ordered */
if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
- blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH,
+ blk_queue_ordered(q, QUEUE_ORDERED_FLUSH,
virtblk_prepare_flush);
else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER))
- blk_queue_ordered(q, QUEUE_ORDERED_TAG, NULL);
+ blk_queue_ordered(q, QUEUE_ORDERED_BAR, NULL);

/* If disk is read-only in the host, the guest should obey */
if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
Index: work/drivers/scsi/sd.c
===================================================================
--- work.orig/drivers/scsi/sd.c
+++ work/drivers/scsi/sd.c
@@ -2103,15 +2103,13 @@ static int sd_revalidate_disk(struct gen

/*
* We now have all cache related info, determine how we deal
- * with ordered requests. Note that as the current SCSI
- * dispatch function can alter request order, we cannot use
- * QUEUE_ORDERED_TAG_* even when ordered tag is supported.
+ * with ordered requests.
*/
if (sdkp->WCE)
ordered = sdkp->DPOFUA
- ? QUEUE_ORDERED_DRAIN_FUA : QUEUE_ORDERED_DRAIN_FLUSH;
+ ? QUEUE_ORDERED_FUA : QUEUE_ORDERED_FLUSH;
else
- ordered = QUEUE_ORDERED_DRAIN;
+ ordered = QUEUE_ORDERED_BAR;

blk_queue_ordered(sdkp->disk->queue, ordered, sd_prepare_flush);

Index: work/block/blk-core.c
===================================================================
--- work.orig/block/blk-core.c
+++ work/block/blk-core.c
@@ -520,6 +520,7 @@ struct request_queue *blk_alloc_queue_no
init_timer(&q->unplug_timer);
setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
INIT_LIST_HEAD(&q->timeout_list);
+ INIT_LIST_HEAD(&q->pending_barriers);
INIT_WORK(&q->unplug_work, blk_unplug_work);

kobject_init(&q->kobj, &blk_queue_ktype);
@@ -1036,22 +1037,6 @@ void blk_insert_request(struct request_q
}
EXPORT_SYMBOL(blk_insert_request);

-/*
- * add-request adds a request to the linked list.
- * queue lock is held and interrupts disabled, as we muck with the
- * request queue list.
- */
-static inline void add_request(struct request_queue *q, struct request *req)
-{
- drive_stat_acct(req, 1);
-
- /*
- * elevator indicated where it wants this request to be
- * inserted at elevator_merge time
- */
- __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
-}
-
static void part_round_stats_single(int cpu, struct hd_struct *part,
unsigned long now)
{
@@ -1184,6 +1169,7 @@ static int __make_request(struct request
const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG);
const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
+ int where = ELEVATOR_INSERT_SORT;
int rw_flags;

if (bio_rw_flagged(bio, BIO_RW_BARRIER) &&
@@ -1191,6 +1177,7 @@ static int __make_request(struct request
bio_endio(bio, -EOPNOTSUPP);
return 0;
}
+
/*
* low level driver can indicate that it wants pages above a
* certain limit bounced to low memory (ie for highmem, or even
@@ -1200,7 +1187,12 @@ static int __make_request(struct request

spin_lock_irq(q->queue_lock);

- if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q))
+ if (bio_rw_flagged(bio, BIO_RW_BARRIER)) {
+ where = ELEVATOR_INSERT_ORDERED;
+ goto get_rq;
+ }
+
+ if (elv_queue_empty(q))
goto get_rq;

el_ret = elv_merge(q, &req, bio);
@@ -1297,7 +1289,10 @@ get_rq:
req->cpu = blk_cpu_to_group(smp_processor_id());
if (queue_should_plug(q) && elv_queue_empty(q))
blk_plug_device(q);
- add_request(q, req);
+
+ /* insert the request into the elevator */
+ drive_stat_acct(req, 1);
+ __elv_add_request(q, req, where, 0);
out:
if (unplug || !queue_should_plug(q))
__generic_unplug_device(q);
Index: work/block/elevator.c
===================================================================
--- work.orig/block/elevator.c
+++ work/block/elevator.c
@@ -564,7 +564,7 @@ void elv_requeue_request(struct request_

rq->cmd_flags &= ~REQ_STARTED;

- elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE);
+ elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
}

void elv_drain_elevator(struct request_queue *q)
@@ -611,8 +611,6 @@ void elv_quiesce_end(struct request_queu

void elv_insert(struct request_queue *q, struct request *rq, int where)
{
- struct list_head *pos;
- unsigned ordseq;
int unplug_it = 1;

trace_block_rq_insert(q, rq);
@@ -622,10 +620,14 @@ void elv_insert(struct request_queue *q,
switch (where) {
case ELEVATOR_INSERT_FRONT:
rq->cmd_flags |= REQ_SOFTBARRIER;
-
list_add(&rq->queuelist, &q->queue_head);
break;

+ case ELEVATOR_INSERT_ORDERED:
+ rq->cmd_flags |= REQ_SOFTBARRIER;
+ list_add_tail(&rq->queuelist, &q->queue_head);
+ break;
+
case ELEVATOR_INSERT_BACK:
rq->cmd_flags |= REQ_SOFTBARRIER;
elv_drain_elevator(q);
@@ -661,36 +663,6 @@ void elv_insert(struct request_queue *q,
q->elevator->ops->elevator_add_req_fn(q, rq);
break;

- case ELEVATOR_INSERT_REQUEUE:
- /*
- * If ordered flush isn't in progress, we do front
- * insertion; otherwise, requests should be requeued
- * in ordseq order.
- */
- rq->cmd_flags |= REQ_SOFTBARRIER;
-
- /*
- * Most requeues happen because of a busy condition,
- * don't force unplug of the queue for that case.
- */
- unplug_it = 0;
-
- if (q->ordseq == 0) {
- list_add(&rq->queuelist, &q->queue_head);
- break;
- }
-
- ordseq = blk_ordered_req_seq(rq);
-
- list_for_each(pos, &q->queue_head) {
- struct request *pos_rq = list_entry_rq(pos);
- if (ordseq <= blk_ordered_req_seq(pos_rq))
- break;
- }
-
- list_add_tail(&rq->queuelist, pos);
- break;
-
default:
printk(KERN_ERR "%s: bad insertion point %d\n",
__func__, where);
@@ -709,32 +681,14 @@ void elv_insert(struct request_queue *q,
void __elv_add_request(struct request_queue *q, struct request *rq, int where,
int plug)
{
- if (q->ordcolor)
- rq->cmd_flags |= REQ_ORDERED_COLOR;
-
if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
- /*
- * toggle ordered color
- */
- if (blk_barrier_rq(rq))
- q->ordcolor ^= 1;
-
- /*
- * barriers implicitly indicate back insertion
- */
- if (where == ELEVATOR_INSERT_SORT)
- where = ELEVATOR_INSERT_BACK;
-
- /*
- * this request is scheduling boundary, update
- * end_sector
- */
+ /* barriers are scheduling boundary, update end_sector */
if (blk_fs_request(rq) || blk_discard_rq(rq)) {
q->end_sector = rq_end_sector(rq);
q->boundary_rq = rq;
}
} else if (!(rq->cmd_flags & REQ_ELVPRIV) &&
- where == ELEVATOR_INSERT_SORT)
+ where == ELEVATOR_INSERT_SORT)
where = ELEVATOR_INSERT_BACK;

if (plug)
@@ -846,24 +800,6 @@ void elv_completed_request(struct reques
if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
e->ops->elevator_completed_req_fn(q, rq);
}
-
- /*
- * Check if the queue is waiting for fs requests to be
- * drained for flush sequence.
- */
- if (unlikely(q->ordseq)) {
- struct request *next = NULL;
-
- if (!list_empty(&q->queue_head))
- next = list_entry_rq(q->queue_head.next);
-
- if (!queue_in_flight(q) &&
- blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
- (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) {
- blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
- __blk_run_queue(q);
- }
- }
}

#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
Index: work/block/blk.h
===================================================================
--- work.orig/block/blk.h
+++ work/block/blk.h
@@ -51,6 +51,8 @@ static inline void blk_clear_rq_complete
*/
#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))

+struct request *blk_do_ordered(struct request_queue *q, struct request *rq);
+
static inline struct request *__elv_next_request(struct request_queue *q)
{
struct request *rq;
@@ -58,7 +60,8 @@ static inline struct request *__elv_next
while (1) {
while (!list_empty(&q->queue_head)) {
rq = list_entry_rq(q->queue_head.next);
- if (blk_do_ordered(q, &rq))
+ rq = blk_do_ordered(q, rq);
+ if (rq)
return rq;
}

Index: work/drivers/block/loop.c
================================================== =================
--- work.orig/drivers/block/loop.c
+++ work/drivers/block/loop.c
@@ -831,7 +831,7 @@ static int loop_set_fd(struct loop_devic
lo->lo_queue->unplug_fn = loop_unplug;

if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
- blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN, NULL);
+ blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_BAR, NULL);

set_capacity(lo->lo_disk, size);
bd_set_size(bdev, size << 9);
Index: work/drivers/block/osdblk.c
===================================================================
--- work.orig/drivers/block/osdblk.c
+++ work/drivers/block/osdblk.c
@@ -446,7 +446,7 @@ static int osdblk_init_disk(struct osdbl
blk_queue_stack_limits(q, osd_request_queue(osdev->osd));

blk_queue_prep_rq(q, blk_queue_start_tag);
- blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH, osdblk_prepare_flush);
+ blk_queue_ordered(q, QUEUE_ORDERED_FLUSH, osdblk_prepare_flush);

disk->queue = q;

Index: work/drivers/block/ps3disk.c
===================================================================
--- work.orig/drivers/block/ps3disk.c
+++ work/drivers/block/ps3disk.c
@@ -480,8 +480,7 @@ static int __devinit ps3disk_probe(struc
blk_queue_dma_alignment(queue, dev->blk_size-1);
blk_queue_logical_block_size(queue, dev->blk_size);

- blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH,
- ps3disk_prepare_flush);
+ blk_queue_ordered(queue, QUEUE_ORDERED_FLUSH, ps3disk_prepare_flush);

blk_queue_max_segments(queue, -1);
blk_queue_max_segment_size(queue, dev->bounce_size);
Index: work/drivers/block/xen-blkfront.c
===================================================================
--- work.orig/drivers/block/xen-blkfront.c
+++ work/drivers/block/xen-blkfront.c
@@ -373,7 +373,7 @@ static int xlvbd_barrier(struct blkfront
int err;

err = blk_queue_ordered(info->rq,
- info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE,
+ info->feature_barrier ? QUEUE_ORDERED_BAR : QUEUE_ORDERED_NONE,
NULL);

if (err)
Index: work/drivers/ide/ide-disk.c
===================================================================
--- work.orig/drivers/ide/ide-disk.c
+++ work/drivers/ide/ide-disk.c
@@ -537,11 +537,11 @@ static void update_ordered(ide_drive_t *
drive->name, barrier ? "" : "not ");

if (barrier) {
- ordered = QUEUE_ORDERED_DRAIN_FLUSH;
+ ordered = QUEUE_ORDERED_FLUSH;
prep_fn = idedisk_prepare_flush;
}
} else
- ordered = QUEUE_ORDERED_DRAIN;
+ ordered = QUEUE_ORDERED_BAR;

blk_queue_ordered(drive->queue, ordered, prep_fn);
}
Index: work/drivers/md/dm.c
===================================================================
--- work.orig/drivers/md/dm.c
+++ work/drivers/md/dm.c
@@ -1912,8 +1912,7 @@ static struct mapped_device *alloc_dev(i
blk_queue_softirq_done(md->queue, dm_softirq_done);
blk_queue_prep_rq(md->queue, dm_prep_fn);
blk_queue_lld_busy(md->queue, dm_lld_busy);
- blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH,
- dm_rq_prepare_flush);
+ blk_queue_ordered(md->queue, QUEUE_ORDERED_FLUSH, dm_rq_prepare_flush);

md->disk = alloc_disk(1);
if (!md->disk)
Index: work/drivers/mmc/card/queue.c
===================================================================
--- work.orig/drivers/mmc/card/queue.c
+++ work/drivers/mmc/card/queue.c
@@ -128,7 +128,7 @@ int mmc_init_queue(struct mmc_queue *mq,
mq->req = NULL;

blk_queue_prep_rq(mq->queue, mmc_prep_request);
- blk_queue_ordered(mq->queue, QUEUE_ORDERED_DRAIN, NULL);
+ blk_queue_ordered(mq->queue, QUEUE_ORDERED_BAR, NULL);
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue);

#ifdef CONFIG_MMC_BLOCK_BOUNCE
Index: work/drivers/s390/block/dasd.c
===================================================================
--- work.orig/drivers/s390/block/dasd.c
+++ work/drivers/s390/block/dasd.c
@@ -2196,7 +2196,7 @@ static void dasd_setup_queue(struct dasd
*/
blk_queue_max_segment_size(block->request_queue, PAGE_SIZE);
blk_queue_segment_boundary(block->request_queue, PAGE_SIZE - 1);
- blk_queue_ordered(block->request_queue, QUEUE_ORDERED_DRAIN, NULL);
+ blk_queue_ordered(block->request_queue, QUEUE_ORDERED_BAR, NULL);
}

/*
Index: work/include/linux/elevator.h
===================================================================
--- work.orig/include/linux/elevator.h
+++ work/include/linux/elevator.h
@@ -162,9 +162,9 @@ extern struct request *elv_rb_find(struc
* Insertion selection
*/
#define ELEVATOR_INSERT_FRONT 1
-#define ELEVATOR_INSERT_BACK 2
-#define ELEVATOR_INSERT_SORT 3
-#define ELEVATOR_INSERT_REQUEUE 4
+#define ELEVATOR_INSERT_ORDERED 2
+#define ELEVATOR_INSERT_BACK 3
+#define ELEVATOR_INSERT_SORT 4

/*
* return values from elevator_may_queue_fn
Index: work/drivers/block/pktcdvd.c
===================================================================
--- work.orig/drivers/block/pktcdvd.c
+++ work/drivers/block/pktcdvd.c
@@ -752,7 +752,6 @@ static int pkt_generic_packet(struct pkt

rq->timeout = 60*HZ;
rq->cmd_type = REQ_TYPE_BLOCK_PC;
- rq->cmd_flags |= REQ_HARDBARRIER;
if (cgc->quiet)
rq->cmd_flags |= REQ_QUIET;

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
 
08-06-2010, 11:34 PM
Christoph Hellwig

relaxed barriers

> Christoph, does this look like something the filesystems can use or
> have I misunderstood something?

This sounds very useful. I'll review and test it once I get a bit of time.

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
 
08-07-2010, 10:13 AM
Tejun Heo

relaxed barriers

The patch was on top of v2.6.35 but was generated against a dirty
tree and wouldn't apply cleanly. Here's the proper one.

Thanks.
---
block/blk-barrier.c | 255 +++++++++++++++----------------------------
block/blk-core.c | 31 ++---
block/blk.h | 5
block/elevator.c | 80 +------------
drivers/block/brd.c | 2
drivers/block/loop.c | 2
drivers/block/osdblk.c | 2
drivers/block/pktcdvd.c | 1
drivers/block/ps3disk.c | 3
drivers/block/virtio_blk.c | 4
drivers/block/xen-blkfront.c | 2
drivers/ide/ide-disk.c | 4
drivers/md/dm.c | 3
drivers/mmc/card/queue.c | 2
drivers/s390/block/dasd.c | 2
drivers/scsi/sd.c | 8 -
include/linux/blkdev.h | 63 +++-------
include/linux/elevator.h | 6 -
18 files changed, 155 insertions(+), 320 deletions(-)

Index: work/block/blk-barrier.c
===================================================================
--- work.orig/block/blk-barrier.c
+++ work/block/blk-barrier.c
@@ -9,6 +9,8 @@

#include "blk.h"

+static struct request *queue_next_ordseq(struct request_queue *q);
+
/**
* blk_queue_ordered - does this queue support ordered writes
* @q: the request queue
@@ -31,13 +33,8 @@ int blk_queue_ordered(struct request_que
return -EINVAL;
}

- if (ordered != QUEUE_ORDERED_NONE &&
- ordered != QUEUE_ORDERED_DRAIN &&
- ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
- ordered != QUEUE_ORDERED_DRAIN_FUA &&
- ordered != QUEUE_ORDERED_TAG &&
- ordered != QUEUE_ORDERED_TAG_FLUSH &&
- ordered != QUEUE_ORDERED_TAG_FUA) {
+ if (ordered != QUEUE_ORDERED_NONE && ordered != QUEUE_ORDERED_BAR &&
+ ordered != QUEUE_ORDERED_FLUSH && ordered != QUEUE_ORDERED_FUA) {
printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
return -EINVAL;
}
@@ -60,38 +57,10 @@ unsigned blk_ordered_cur_seq(struct requ
return 1 << ffz(q->ordseq);
}

-unsigned blk_ordered_req_seq(struct request *rq)
+static struct request *blk_ordered_complete_seq(struct request_queue *q,
+ unsigned seq, int error)
{
- struct request_queue *q = rq->q;
-
- BUG_ON(q->ordseq == 0);
-
- if (rq == &q->pre_flush_rq)
- return QUEUE_ORDSEQ_PREFLUSH;
- if (rq == &q->bar_rq)
- return QUEUE_ORDSEQ_BAR;
- if (rq == &q->post_flush_rq)
- return QUEUE_ORDSEQ_POSTFLUSH;
-
- /*
- * !fs requests don't need to follow barrier ordering. Always
- * put them at the front. This fixes the following deadlock.
- *
- * http://thread.gmane.org/gmane.linux.kernel/537473
- */
- if (!blk_fs_request(rq))
- return QUEUE_ORDSEQ_DRAIN;
-
- if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
- (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
- return QUEUE_ORDSEQ_DRAIN;
- else
- return QUEUE_ORDSEQ_DONE;
-}
-
-bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
-{
- struct request *rq;
+ struct request *rq = NULL;

if (error && !q->orderr)
q->orderr = error;
@@ -99,16 +68,22 @@ bool blk_ordered_complete_seq(struct req
BUG_ON(q->ordseq & seq);
q->ordseq |= seq;

- if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
- return false;
-
- /*
- * Okay, sequence complete.
- */
- q->ordseq = 0;
- rq = q->orig_bar_rq;
- __blk_end_request_all(rq, q->orderr);
- return true;
+ if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) {
+ /* not complete yet, queue the next ordered sequence */
+ rq = queue_next_ordseq(q);
+ } else {
+ /* complete this barrier request */
+ __blk_end_request_all(q->orig_bar_rq, q->orderr);
+ q->orig_bar_rq = NULL;
+ q->ordseq = 0;
+
+ /* dispatch the next barrier if there's one */
+ if (!list_empty(&q->pending_barriers)) {
+ rq = list_entry_rq(q->pending_barriers.next);
+ list_move(&rq->queuelist, &q->queue_head);
+ }
+ }
+ return rq;
}

static void pre_flush_end_io(struct request *rq, int error)
@@ -129,21 +104,10 @@ static void post_flush_end_io(struct req
blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
}

-static void queue_flush(struct request_queue *q, unsigned which)
+static void queue_flush(struct request_queue *q, struct request *rq,
+ rq_end_io_fn *end_io)
{
- struct request *rq;
- rq_end_io_fn *end_io;
-
- if (which == QUEUE_ORDERED_DO_PREFLUSH) {
- rq = &q->pre_flush_rq;
- end_io = pre_flush_end_io;
- } else {
- rq = &q->post_flush_rq;
- end_io = post_flush_end_io;
- }
-
blk_rq_init(q, rq);
- rq->cmd_flags = REQ_HARDBARRIER;
rq->rq_disk = q->bar_rq.rq_disk;
rq->end_io = end_io;
q->prepare_flush_fn(q, rq);
@@ -151,132 +115,93 @@ static void queue_flush(struct request_q
elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
}

-static inline bool start_ordered(struct request_queue *q, struct request **rqp)
+static struct request *queue_next_ordseq(struct request_queue *q)
{
- struct request *rq = *rqp;
- unsigned skip = 0;
+ struct request *rq = &q->bar_rq;

- q->orderr = 0;
- q->ordered = q->next_ordered;
- q->ordseq |= QUEUE_ORDSEQ_STARTED;
-
- /*
- * For an empty barrier, there's no actual BAR request, which
- * in turn makes POSTFLUSH unnecessary. Mask them off.
- */
- if (!blk_rq_sectors(rq)) {
- q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
- QUEUE_ORDERED_DO_POSTFLUSH);
- /*
- * Empty barrier on a write-through device w/ ordered
- * tag has no command to issue and without any command
- * to issue, ordering by tag can't be used. Drain
- * instead.
- */
- if ((q->ordered & QUEUE_ORDERED_BY_TAG) &&
- !(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) {
- q->ordered &= ~QUEUE_ORDERED_BY_TAG;
- q->ordered |= QUEUE_ORDERED_BY_DRAIN;
- }
- }
-
- /* stash away the original request */
- blk_dequeue_request(rq);
- q->orig_bar_rq = rq;
- rq = NULL;
-
- /*
- * Queue ordered sequence. As we stack them at the head, we
- * need to queue in reverse order. Note that we rely on that
- * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
- * request gets inbetween ordered sequence.
- */
- if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) {
- queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
- rq = &q->post_flush_rq;
- } else
- skip |= QUEUE_ORDSEQ_POSTFLUSH;
-
- if (q->ordered & QUEUE_ORDERED_DO_BAR) {
- rq = &q->bar_rq;
+ switch (blk_ordered_cur_seq(q)) {
+ case QUEUE_ORDSEQ_PREFLUSH:
+ queue_flush(q, rq, pre_flush_end_io);
+ break;

+ case QUEUE_ORDSEQ_BAR:
/* initialize proxy request and queue it */
blk_rq_init(q, rq);
- if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
- rq->cmd_flags |= REQ_RW;
+ init_request_from_bio(rq, q->orig_bar_rq->bio);
+ rq->cmd_flags &= ~REQ_HARDBARRIER;
if (q->ordered & QUEUE_ORDERED_DO_FUA)
rq->cmd_flags |= REQ_FUA;
- init_request_from_bio(rq, q->orig_bar_rq->bio);
rq->end_io = bar_end_io;

elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
- } else
- skip |= QUEUE_ORDSEQ_BAR;
+ break;

- if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) {
- queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH);
- rq = &q->pre_flush_rq;
- } else
- skip |= QUEUE_ORDSEQ_PREFLUSH;
+ case QUEUE_ORDSEQ_POSTFLUSH:
+ queue_flush(q, rq, post_flush_end_io);
+ break;

- if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q))
- rq = NULL;
- else
- skip |= QUEUE_ORDSEQ_DRAIN;
-
- *rqp = rq;
-
- /*
- * Complete skipped sequences. If whole sequence is complete,
- * return false to tell elevator that this request is gone.
- */
- return !blk_ordered_complete_seq(q, skip, 0);
+ default:
+ BUG();
+ }
+ return rq;
}

-bool blk_do_ordered(struct request_queue *q, struct request **rqp)
+struct request *blk_do_ordered(struct request_queue *q, struct request *rq)
{
- struct request *rq = *rqp;
- const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
+ unsigned skip = 0;

- if (!q->ordseq) {
- if (!is_barrier)
- return true;
-
- if (q->next_ordered != QUEUE_ORDERED_NONE)
- return start_ordered(q, rqp);
- else {
- /*
- * Queue ordering not supported. Terminate
- * with prejudice.
- */
- blk_dequeue_request(rq);
- __blk_end_request_all(rq, -EOPNOTSUPP);
- *rqp = NULL;
- return false;
- }
+ if (!blk_barrier_rq(rq))
+ return rq;
+
+ if (q->ordseq) {
+ /*
+ * Barrier is already in progress and they can't be
+ * processed in parallel. Queue for later processing.
+ */
+ list_move_tail(&rq->queuelist, &q->pending_barriers);
+ return NULL;
+ }
+
+ if (unlikely(q->next_ordered == QUEUE_ORDERED_NONE)) {
+ /*
+ * Queue ordering not supported. Terminate
+ * with prejudice.
+ */
+ blk_dequeue_request(rq);
+ __blk_end_request_all(rq, -EOPNOTSUPP);
+ return NULL;
}

/*
- * Ordered sequence in progress
+ * Start a new ordered sequence
*/
+ q->orderr = 0;
+ q->ordered = q->next_ordered;
+ q->ordseq |= QUEUE_ORDSEQ_STARTED;

- /* Special requests are not subject to ordering rules. */
- if (!blk_fs_request(rq) &&
- rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
- return true;
-
- if (q->ordered & QUEUE_ORDERED_BY_TAG) {
- /* Ordered by tag. Blocking the next barrier is enough. */
- if (is_barrier && rq != &q->bar_rq)
- *rqp = NULL;
- } else {
- /* Ordered by draining. Wait for turn. */
- WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
- if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
- *rqp = NULL;
- }
+ /*
+ * For an empty barrier, there's no actual BAR request, which
+ * in turn makes POSTFLUSH unnecessary. Mask them off.
+ */
+ if (!blk_rq_sectors(rq))
+ q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
+ QUEUE_ORDERED_DO_POSTFLUSH);
+
+ /* stash away the original request */
+ blk_dequeue_request(rq);
+ q->orig_bar_rq = rq;
+
+ if (!(q->ordered & QUEUE_ORDERED_DO_PREFLUSH))
+ skip |= QUEUE_ORDSEQ_PREFLUSH;
+
+ if (!(q->ordered & QUEUE_ORDERED_DO_BAR))
+ skip |= QUEUE_ORDSEQ_BAR;
+
+ if (!(q->ordered & QUEUE_ORDERED_DO_POSTFLUSH))
+ skip |= QUEUE_ORDSEQ_POSTFLUSH;

- return true;
+ /* complete skipped sequences and return the first sequence */
+ return blk_ordered_complete_seq(q, skip, 0);
}

static void bio_end_empty_barrier(struct bio *bio, int err)
Index: work/include/linux/blkdev.h
===================================================================
--- work.orig/include/linux/blkdev.h
+++ work/include/linux/blkdev.h
@@ -106,7 +106,6 @@ enum rq_flag_bits {
__REQ_FAILED, /* set if the request failed */
__REQ_QUIET, /* don't worry about errors */
__REQ_PREEMPT, /* set for "ide_preempt" requests */
- __REQ_ORDERED_COLOR, /* is before or after barrier */
__REQ_RW_SYNC, /* request is sync (sync write or read) */
__REQ_ALLOCED, /* request came from our alloc pool */
__REQ_RW_META, /* metadata io request */
@@ -135,7 +134,6 @@ enum rq_flag_bits {
#define REQ_FAILED (1 << __REQ_FAILED)
#define REQ_QUIET (1 << __REQ_QUIET)
#define REQ_PREEMPT (1 << __REQ_PREEMPT)
-#define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR)
#define REQ_RW_SYNC (1 << __REQ_RW_SYNC)
#define REQ_ALLOCED (1 << __REQ_ALLOCED)
#define REQ_RW_META (1 << __REQ_RW_META)
@@ -437,9 +435,10 @@ struct request_queue
* reserved for flush operations
*/
unsigned int ordered, next_ordered, ordseq;
- int orderr, ordcolor;
- struct request pre_flush_rq, bar_rq, post_flush_rq;
- struct request *orig_bar_rq;
+ int orderr;
+ struct request bar_rq;
+ struct request *orig_bar_rq;
+ struct list_head pending_barriers;

struct mutex sysfs_lock;

@@ -543,49 +542,33 @@ enum {
* Hardbarrier is supported with one of the following methods.
*
* NONE : hardbarrier unsupported
- * DRAIN : ordering by draining is enough
- * DRAIN_FLUSH : ordering by draining w/ pre and post flushes
- * DRAIN_FUA : ordering by draining w/ pre flush and FUA write
- * TAG : ordering by tag is enough
- * TAG_FLUSH : ordering by tag w/ pre and post flushes
- * TAG_FUA : ordering by tag w/ pre flush and FUA write
- */
- QUEUE_ORDERED_BY_DRAIN = 0x01,
- QUEUE_ORDERED_BY_TAG = 0x02,
- QUEUE_ORDERED_DO_PREFLUSH = 0x10,
- QUEUE_ORDERED_DO_BAR = 0x20,
- QUEUE_ORDERED_DO_POSTFLUSH = 0x40,
- QUEUE_ORDERED_DO_FUA = 0x80,
-
- QUEUE_ORDERED_NONE = 0x00,
-
- QUEUE_ORDERED_DRAIN = QUEUE_ORDERED_BY_DRAIN |
- QUEUE_ORDERED_DO_BAR,
- QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN |
- QUEUE_ORDERED_DO_PREFLUSH |
- QUEUE_ORDERED_DO_POSTFLUSH,
- QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN |
- QUEUE_ORDERED_DO_PREFLUSH |
- QUEUE_ORDERED_DO_FUA,
+ * BAR : writing out barrier is enough
+ * FLUSH : barrier and surrounding pre and post flushes
+ * FUA : FUA barrier w/ pre flush
+ */
+ QUEUE_ORDERED_DO_PREFLUSH = 1 << 0,
+ QUEUE_ORDERED_DO_BAR = 1 << 1,
+ QUEUE_ORDERED_DO_POSTFLUSH = 1 << 2,
+ QUEUE_ORDERED_DO_FUA = 1 << 3,
+
+ QUEUE_ORDERED_NONE = 0,

- QUEUE_ORDERED_TAG = QUEUE_ORDERED_BY_TAG |
- QUEUE_ORDERED_DO_BAR,
- QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG |
+ QUEUE_ORDERED_BAR = QUEUE_ORDERED_DO_BAR,
+ QUEUE_ORDERED_FLUSH = QUEUE_ORDERED_DO_BAR |
QUEUE_ORDERED_DO_PREFLUSH |
QUEUE_ORDERED_DO_POSTFLUSH,
- QUEUE_ORDERED_TAG_FUA = QUEUE_ORDERED_TAG |
+ QUEUE_ORDERED_FUA = QUEUE_ORDERED_DO_BAR |
QUEUE_ORDERED_DO_PREFLUSH |
QUEUE_ORDERED_DO_FUA,

/*
* Ordered operation sequence
*/
- QUEUE_ORDSEQ_STARTED = 0x01, /* flushing in progress */
- QUEUE_ORDSEQ_DRAIN = 0x02, /* waiting for the queue to be drained */
- QUEUE_ORDSEQ_PREFLUSH = 0x04, /* pre-flushing in progress */
- QUEUE_ORDSEQ_BAR = 0x08, /* original barrier req in progress */
- QUEUE_ORDSEQ_POSTFLUSH = 0x10, /* post-flushing in progress */
- QUEUE_ORDSEQ_DONE = 0x20,
+ QUEUE_ORDSEQ_STARTED = (1 << 0), /* flushing in progress */
+ QUEUE_ORDSEQ_PREFLUSH = (1 << 1), /* pre-flushing in progress */
+ QUEUE_ORDSEQ_BAR = (1 << 2), /* barrier write in progress */
+ QUEUE_ORDSEQ_POSTFLUSH = (1 << 3), /* post-flushing in progress */
+ QUEUE_ORDSEQ_DONE = (1 << 4),
};

#define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
@@ -967,10 +950,8 @@ extern void blk_queue_rq_timed_out(struc
extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *);
-extern bool blk_do_ordered(struct request_queue *, struct request **);
extern unsigned blk_ordered_cur_seq(struct request_queue *);
extern unsigned blk_ordered_req_seq(struct request *);
-extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int);

extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
extern void blk_dump_rq_flags(struct request *, char *);
Index: work/drivers/block/brd.c
===================================================================
--- work.orig/drivers/block/brd.c
+++ work/drivers/block/brd.c
@@ -479,7 +479,7 @@ static struct brd_device *brd_alloc(int
if (!brd->brd_queue)
goto out_free_dev;
blk_queue_make_request(brd->brd_queue, brd_make_request);
- blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG, NULL);
+ blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_BAR, NULL);
blk_queue_max_hw_sectors(brd->brd_queue, 1024);
blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);

Index: work/drivers/block/virtio_blk.c
===================================================================
--- work.orig/drivers/block/virtio_blk.c
+++ work/drivers/block/virtio_blk.c
@@ -368,10 +368,10 @@ static int __devinit virtblk_probe(struc

/* If barriers are supported, tell block layer that queue is ordered */
if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
- blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH,
+ blk_queue_ordered(q, QUEUE_ORDERED_FLUSH,
virtblk_prepare_flush);
else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER))
- blk_queue_ordered(q, QUEUE_ORDERED_TAG, NULL);
+ blk_queue_ordered(q, QUEUE_ORDERED_BAR, NULL);

/* If disk is read-only in the host, the guest should obey */
if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
Index: work/drivers/scsi/sd.c
===================================================================
--- work.orig/drivers/scsi/sd.c
+++ work/drivers/scsi/sd.c
@@ -2103,15 +2103,13 @@ static int sd_revalidate_disk(struct gen

/*
* We now have all cache related info, determine how we deal
- * with ordered requests. Note that as the current SCSI
- * dispatch function can alter request order, we cannot use
- * QUEUE_ORDERED_TAG_* even when ordered tag is supported.
+ * with ordered requests.
*/
if (sdkp->WCE)
ordered = sdkp->DPOFUA
- ? QUEUE_ORDERED_DRAIN_FUA : QUEUE_ORDERED_DRAIN_FLUSH;
+ ? QUEUE_ORDERED_FUA : QUEUE_ORDERED_FLUSH;
else
- ordered = QUEUE_ORDERED_DRAIN;
+ ordered = QUEUE_ORDERED_BAR;

blk_queue_ordered(sdkp->disk->queue, ordered, sd_prepare_flush);

Index: work/block/blk-core.c
===================================================================
--- work.orig/block/blk-core.c
+++ work/block/blk-core.c
@@ -520,6 +520,7 @@ struct request_queue *blk_alloc_queue_no
init_timer(&q->unplug_timer);
setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
INIT_LIST_HEAD(&q->timeout_list);
+ INIT_LIST_HEAD(&q->pending_barriers);
INIT_WORK(&q->unplug_work, blk_unplug_work);

kobject_init(&q->kobj, &blk_queue_ktype);
@@ -1036,22 +1037,6 @@ void blk_insert_request(struct request_q
}
EXPORT_SYMBOL(blk_insert_request);

-/*
- * add-request adds a request to the linked list.
- * queue lock is held and interrupts disabled, as we muck with the
- * request queue list.
- */
-static inline void add_request(struct request_queue *q, struct request *req)
-{
- drive_stat_acct(req, 1);
-
- /*
- * elevator indicated where it wants this request to be
- * inserted at elevator_merge time
- */
- __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
-}
-
static void part_round_stats_single(int cpu, struct hd_struct *part,
unsigned long now)
{
@@ -1184,6 +1169,7 @@ static int __make_request(struct request
const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG);
const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
+ int where = ELEVATOR_INSERT_SORT;
int rw_flags;

if (bio_rw_flagged(bio, BIO_RW_BARRIER) &&
@@ -1191,6 +1177,7 @@ static int __make_request(struct request
bio_endio(bio, -EOPNOTSUPP);
return 0;
}
+
/*
* low level driver can indicate that it wants pages above a
* certain limit bounced to low memory (ie for highmem, or even
@@ -1200,7 +1187,12 @@ static int __make_request(struct request

spin_lock_irq(q->queue_lock);

- if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q))
+ if (bio_rw_flagged(bio, BIO_RW_BARRIER)) {
+ where = ELEVATOR_INSERT_ORDERED;
+ goto get_rq;
+ }
+
+ if (elv_queue_empty(q))
goto get_rq;

el_ret = elv_merge(q, &req, bio);
@@ -1297,7 +1289,10 @@ get_rq:
req->cpu = blk_cpu_to_group(smp_processor_id());
if (queue_should_plug(q) && elv_queue_empty(q))
blk_plug_device(q);
- add_request(q, req);
+
+ /* insert the request into the elevator */
+ drive_stat_acct(req, 1);
+ __elv_add_request(q, req, where, 0);
out:
if (unplug || !queue_should_plug(q))
__generic_unplug_device(q);
Index: work/block/elevator.c
================================================== =================
--- work.orig/block/elevator.c
+++ work/block/elevator.c
@@ -564,7 +564,7 @@ void elv_requeue_request(struct request_

rq->cmd_flags &= ~REQ_STARTED;

- elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE);
+ elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
}

void elv_drain_elevator(struct request_queue *q)
@@ -611,8 +611,6 @@ void elv_quiesce_end(struct request_queu

void elv_insert(struct request_queue *q, struct request *rq, int where)
{
- struct list_head *pos;
- unsigned ordseq;
int unplug_it = 1;

trace_block_rq_insert(q, rq);
@@ -622,10 +620,14 @@ void elv_insert(struct request_queue *q,
switch (where) {
case ELEVATOR_INSERT_FRONT:
rq->cmd_flags |= REQ_SOFTBARRIER;
-
list_add(&rq->queuelist, &q->queue_head);
break;

+ case ELEVATOR_INSERT_ORDERED:
+ rq->cmd_flags |= REQ_SOFTBARRIER;
+ list_add_tail(&rq->queuelist, &q->queue_head);
+ break;
+
case ELEVATOR_INSERT_BACK:
rq->cmd_flags |= REQ_SOFTBARRIER;
elv_drain_elevator(q);
@@ -661,36 +663,6 @@ void elv_insert(struct request_queue *q,
q->elevator->ops->elevator_add_req_fn(q, rq);
break;

- case ELEVATOR_INSERT_REQUEUE:
- /*
- * If ordered flush isn't in progress, we do front
- * insertion; otherwise, requests should be requeued
- * in ordseq order.
- */
- rq->cmd_flags |= REQ_SOFTBARRIER;
-
- /*
- * Most requeues happen because of a busy condition,
- * don't force unplug of the queue for that case.
- */
- unplug_it = 0;
-
- if (q->ordseq == 0) {
- list_add(&rq->queuelist, &q->queue_head);
- break;
- }
-
- ordseq = blk_ordered_req_seq(rq);
-
- list_for_each(pos, &q->queue_head) {
- struct request *pos_rq = list_entry_rq(pos);
- if (ordseq <= blk_ordered_req_seq(pos_rq))
- break;
- }
-
- list_add_tail(&rq->queuelist, pos);
- break;
-
default:
printk(KERN_ERR "%s: bad insertion point %d\n",
__func__, where);
@@ -709,32 +681,14 @@ void elv_insert(struct request_queue *q,
void __elv_add_request(struct request_queue *q, struct request *rq, int where,
int plug)
{
- if (q->ordcolor)
- rq->cmd_flags |= REQ_ORDERED_COLOR;
-
if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
- /*
- * toggle ordered color
- */
- if (blk_barrier_rq(rq))
- q->ordcolor ^= 1;
-
- /*
- * barriers implicitly indicate back insertion
- */
- if (where == ELEVATOR_INSERT_SORT)
- where = ELEVATOR_INSERT_BACK;
-
- /*
- * this request is scheduling boundary, update
- * end_sector
- */
+ /* barriers are scheduling boundary, update end_sector */
if (blk_fs_request(rq) || blk_discard_rq(rq)) {
q->end_sector = rq_end_sector(rq);
q->boundary_rq = rq;
}
} else if (!(rq->cmd_flags & REQ_ELVPRIV) &&
- where == ELEVATOR_INSERT_SORT)
+ where == ELEVATOR_INSERT_SORT)
where = ELEVATOR_INSERT_BACK;

if (plug)
@@ -846,24 +800,6 @@ void elv_completed_request(struct reques
if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
e->ops->elevator_completed_req_fn(q, rq);
}
-
- /*
- * Check if the queue is waiting for fs requests to be
- * drained for flush sequence.
- */
- if (unlikely(q->ordseq)) {
- struct request *next = NULL;
-
- if (!list_empty(&q->queue_head))
- next = list_entry_rq(q->queue_head.next);
-
- if (!queue_in_flight(q) &&
- blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
- (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) {
- blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
- __blk_run_queue(q);
- }
- }
}

#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
Index: work/block/blk.h
===================================================================
--- work.orig/block/blk.h
+++ work/block/blk.h
@@ -51,6 +51,8 @@ static inline void blk_clear_rq_complete
*/
#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))

+struct request *blk_do_ordered(struct request_queue *q, struct request *rq);
+
static inline struct request *__elv_next_request(struct request_queue *q)
{
struct request *rq;
@@ -58,7 +60,8 @@ static inline struct request *__elv_next
while (1) {
while (!list_empty(&q->queue_head)) {
rq = list_entry_rq(q->queue_head.next);
- if (blk_do_ordered(q, &rq))
+ rq = blk_do_ordered(q, rq);
+ if (rq)
return rq;
}

Index: work/drivers/block/loop.c
===================================================================
--- work.orig/drivers/block/loop.c
+++ work/drivers/block/loop.c
@@ -831,7 +831,7 @@ static int loop_set_fd(struct loop_devic
lo->lo_queue->unplug_fn = loop_unplug;

if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
- blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN, NULL);
+ blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_BAR, NULL);

set_capacity(lo->lo_disk, size);
bd_set_size(bdev, size << 9);
Index: work/drivers/block/osdblk.c
===================================================================
--- work.orig/drivers/block/osdblk.c
+++ work/drivers/block/osdblk.c
@@ -446,7 +446,7 @@ static int osdblk_init_disk(struct osdbl
blk_queue_stack_limits(q, osd_request_queue(osdev->osd));

blk_queue_prep_rq(q, blk_queue_start_tag);
- blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH, osdblk_prepare_flush);
+ blk_queue_ordered(q, QUEUE_ORDERED_FLUSH, osdblk_prepare_flush);

disk->queue = q;

Index: work/drivers/block/ps3disk.c
===================================================================
--- work.orig/drivers/block/ps3disk.c
+++ work/drivers/block/ps3disk.c
@@ -480,8 +480,7 @@ static int __devinit ps3disk_probe(struc
blk_queue_dma_alignment(queue, dev->blk_size-1);
blk_queue_logical_block_size(queue, dev->blk_size);

- blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH,
- ps3disk_prepare_flush);
+ blk_queue_ordered(queue, QUEUE_ORDERED_FLUSH, ps3disk_prepare_flush);

blk_queue_max_segments(queue, -1);
blk_queue_max_segment_size(queue, dev->bounce_size);
Index: work/drivers/block/xen-blkfront.c
===================================================================
--- work.orig/drivers/block/xen-blkfront.c
+++ work/drivers/block/xen-blkfront.c
@@ -373,7 +373,7 @@ static int xlvbd_barrier(struct blkfront
int err;

err = blk_queue_ordered(info->rq,
- info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE,
+ info->feature_barrier ? QUEUE_ORDERED_BAR : QUEUE_ORDERED_NONE,
NULL);

if (err)
Index: work/drivers/ide/ide-disk.c
===================================================================
--- work.orig/drivers/ide/ide-disk.c
+++ work/drivers/ide/ide-disk.c
@@ -537,11 +537,11 @@ static void update_ordered(ide_drive_t *
drive->name, barrier ? "" : "not ");

if (barrier) {
- ordered = QUEUE_ORDERED_DRAIN_FLUSH;
+ ordered = QUEUE_ORDERED_FLUSH;
prep_fn = idedisk_prepare_flush;
}
} else
- ordered = QUEUE_ORDERED_DRAIN;
+ ordered = QUEUE_ORDERED_BAR;

blk_queue_ordered(drive->queue, ordered, prep_fn);
}
Index: work/drivers/md/dm.c
===================================================================
--- work.orig/drivers/md/dm.c
+++ work/drivers/md/dm.c
@@ -1912,8 +1912,7 @@ static struct mapped_device *alloc_dev(i
blk_queue_softirq_done(md->queue, dm_softirq_done);
blk_queue_prep_rq(md->queue, dm_prep_fn);
blk_queue_lld_busy(md->queue, dm_lld_busy);
- blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH,
- dm_rq_prepare_flush);
+ blk_queue_ordered(md->queue, QUEUE_ORDERED_FLUSH, dm_rq_prepare_flush);

md->disk = alloc_disk(1);
if (!md->disk)
Index: work/drivers/mmc/card/queue.c
================================================== =================
--- work.orig/drivers/mmc/card/queue.c
+++ work/drivers/mmc/card/queue.c
@@ -128,7 +128,7 @@ int mmc_init_queue(struct mmc_queue *mq,
mq->req = NULL;

blk_queue_prep_rq(mq->queue, mmc_prep_request);
- blk_queue_ordered(mq->queue, QUEUE_ORDERED_DRAIN, NULL);
+ blk_queue_ordered(mq->queue, QUEUE_ORDERED_BAR, NULL);
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue);

#ifdef CONFIG_MMC_BLOCK_BOUNCE
Index: work/drivers/s390/block/dasd.c
================================================== =================
--- work.orig/drivers/s390/block/dasd.c
+++ work/drivers/s390/block/dasd.c
@@ -2196,7 +2196,7 @@ static void dasd_setup_queue(struct dasd
*/
blk_queue_max_segment_size(block->request_queue, PAGE_SIZE);
blk_queue_segment_boundary(block->request_queue, PAGE_SIZE - 1);
- blk_queue_ordered(block->request_queue, QUEUE_ORDERED_DRAIN, NULL);
+ blk_queue_ordered(block->request_queue, QUEUE_ORDERED_BAR, NULL);
}

/*
Index: work/include/linux/elevator.h
================================================== =================
--- work.orig/include/linux/elevator.h
+++ work/include/linux/elevator.h
@@ -162,9 +162,9 @@ extern struct request *elv_rb_find(struc
* Insertion selection
*/
#define ELEVATOR_INSERT_FRONT 1
-#define ELEVATOR_INSERT_BACK 2
-#define ELEVATOR_INSERT_SORT 3
-#define ELEVATOR_INSERT_REQUEUE 4
+#define ELEVATOR_INSERT_ORDERED 2
+#define ELEVATOR_INSERT_BACK 3
+#define ELEVATOR_INSERT_SORT 4

/*
* return values from elevator_may_queue_fn
Index: work/drivers/block/pktcdvd.c
================================================== =================
--- work.orig/drivers/block/pktcdvd.c
+++ work/drivers/block/pktcdvd.c
@@ -752,7 +752,6 @@ static int pkt_generic_packet(struct pkt

rq->timeout = 60*HZ;
rq->cmd_type = REQ_TYPE_BLOCK_PC;
- rq->cmd_flags |= REQ_HARDBARRIER;
if (cgc->quiet)
rq->cmd_flags |= REQ_QUIET;

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
 
Old 08-08-2010, 02:31 PM
Christoph Hellwig
 
Default relaxed barriers

On Sat, Aug 07, 2010 at 12:13:06PM +0200, Tejun Heo wrote:
> The patch was on top of v2.6.35 but was generated against dirty tree
> and wouldn't apply cleanly. Here's the proper one.

Here's an updated version:

(a) ported to Jens' current block tree
(b) optimize barriers to be no-ops on devices not requiring flushes
(c) redo the blk_queue_ordered interface to just set QUEUE_HAS_FLUSH
and QUEUE_HAS_FUA flags.
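
As a rough illustration of (c) -- not part of the patch below, and the
helper name is made up -- a driver would now just declare the cache
control features its device supports, mirroring the sd.c and virtio_blk
hunks; a write-through device passes 0 and barriers become no-ops, which
is what enables (b):

#include <linux/blkdev.h>

/*
 * Hypothetical probe-time fragment: instead of picking one of the old
 * QUEUE_ORDERED_* modes, the driver only declares which cache control
 * features the device understands and lets the block layer build the
 * flush sequence.
 */
static int example_setup_cache_features(struct request_queue *q,
					bool volatile_wcache, bool has_fua)
{
	unsigned cache_features = 0;

	if (volatile_wcache) {
		/* device caches writes, so it must support REQ_FLUSH */
		cache_features |= QUEUE_HAS_FLUSH;
		if (has_fua)
			/* FUA writes let the block layer skip the post-flush */
			cache_features |= QUEUE_HAS_FUA;
	}

	/* no volatile cache: pass 0 and barrier requests cost nothing */
	return blk_queue_cache_features(q, cache_features);
}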

Index: linux-2.6/block/blk-barrier.c
================================================== =================
--- linux-2.6.orig/block/blk-barrier.c 2010-08-07 12:53:23.727479189 -0400
+++ linux-2.6/block/blk-barrier.c 2010-08-07 14:52:21.402479191 -0400
@@ -9,37 +9,36 @@

#include "blk.h"

+/*
+ * Ordered operation sequence.
+ */
+enum {
+ QUEUE_ORDSEQ_STARTED = (1 << 0), /* flushing in progress */
+ QUEUE_ORDSEQ_PREFLUSH = (1 << 1), /* pre-flushing in progress */
+ QUEUE_ORDSEQ_BAR = (1 << 2), /* barrier write in progress */
+ QUEUE_ORDSEQ_POSTFLUSH = (1 << 3), /* post-flushing in progress */
+ QUEUE_ORDSEQ_DONE = (1 << 4),
+};
+
+static struct request *queue_next_ordseq(struct request_queue *q);
+
/**
- * blk_queue_ordered - does this queue support ordered writes
- * @q: the request queue
- * @ordered: one of QUEUE_ORDERED_*
- *
- * Description:
- * For journalled file systems, doing ordered writes on a commit
- * block instead of explicitly doing wait_on_buffer (which is bad
- * for performance) can be a big win. Block drivers supporting this
- * feature should call this function and indicate so.
- *
+ * blk_queue_cache_features - set the supported cache control features
+ * @q: the request queue
+ * @cache_features: the support features
**/
-int blk_queue_ordered(struct request_queue *q, unsigned ordered)
+int blk_queue_cache_features(struct request_queue *q, unsigned cache_features)
{
- if (ordered != QUEUE_ORDERED_NONE &&
- ordered != QUEUE_ORDERED_DRAIN &&
- ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
- ordered != QUEUE_ORDERED_DRAIN_FUA &&
- ordered != QUEUE_ORDERED_TAG &&
- ordered != QUEUE_ORDERED_TAG_FLUSH &&
- ordered != QUEUE_ORDERED_TAG_FUA) {
- printk(KERN_ERR "blk_queue_ordered: bad value %d
", ordered);
+ if (cache_features & ~(QUEUE_HAS_FLUSH|QUEUE_HAS_FUA)) {
+ printk(KERN_ERR "blk_queue_cache_features: bad value %d
",
+ cache_features);
return -EINVAL;
}

- q->ordered = ordered;
- q->next_ordered = ordered;
-
+ q->cache_features = cache_features;
return 0;
}
-EXPORT_SYMBOL(blk_queue_ordered);
+EXPORT_SYMBOL(blk_queue_cache_features);

/*
* Cache flushing for ordered writes handling
@@ -51,38 +50,10 @@ unsigned blk_ordered_cur_seq(struct requ
return 1 << ffz(q->ordseq);
}

-unsigned blk_ordered_req_seq(struct request *rq)
-{
- struct request_queue *q = rq->q;
-
- BUG_ON(q->ordseq == 0);
-
- if (rq == &q->pre_flush_rq)
- return QUEUE_ORDSEQ_PREFLUSH;
- if (rq == &q->bar_rq)
- return QUEUE_ORDSEQ_BAR;
- if (rq == &q->post_flush_rq)
- return QUEUE_ORDSEQ_POSTFLUSH;
-
- /*
- * !fs requests don't need to follow barrier ordering. Always
- * put them at the front. This fixes the following deadlock.
- *
- * http://thread.gmane.org/gmane.linux.kernel/537473
- */
- if (rq->cmd_type != REQ_TYPE_FS)
- return QUEUE_ORDSEQ_DRAIN;
-
- if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
- (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
- return QUEUE_ORDSEQ_DRAIN;
- else
- return QUEUE_ORDSEQ_DONE;
-}
-
-bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
+static struct request *blk_ordered_complete_seq(struct request_queue *q,
+ unsigned seq, int error)
{
- struct request *rq;
+ struct request *rq = NULL;

if (error && !q->orderr)
q->orderr = error;
@@ -90,16 +61,22 @@ bool blk_ordered_complete_seq(struct req
BUG_ON(q->ordseq & seq);
q->ordseq |= seq;

- if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
- return false;
-
- /*
- * Okay, sequence complete.
- */
- q->ordseq = 0;
- rq = q->orig_bar_rq;
- __blk_end_request_all(rq, q->orderr);
- return true;
+ if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) {
+ /* not complete yet, queue the next ordered sequence */
+ rq = queue_next_ordseq(q);
+ } else {
+ /* complete this barrier request */
+ __blk_end_request_all(q->orig_bar_rq, q->orderr);
+ q->orig_bar_rq = NULL;
+ q->ordseq = 0;
+
+ /* dispatch the next barrier if there's one */
+ if (!list_empty(&q->pending_barriers)) {
+ rq = list_entry_rq(q->pending_barriers.next);
+ list_move(&rq->queuelist, &q->queue_head);
+ }
+ }
+ return rq;
}

static void pre_flush_end_io(struct request *rq, int error)
@@ -120,155 +97,100 @@ static void post_flush_end_io(struct req
blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
}

-static void queue_flush(struct request_queue *q, unsigned which)
+static void init_flush_request(struct request_queue *q, struct request *rq)
{
- struct request *rq;
- rq_end_io_fn *end_io;
+ rq->cmd_type = REQ_TYPE_FS;
+ rq->cmd_flags = REQ_FLUSH;
+ rq->rq_disk = q->orig_bar_rq->rq_disk;
+}

- if (which == QUEUE_ORDERED_DO_PREFLUSH) {
- rq = &q->pre_flush_rq;
- end_io = pre_flush_end_io;
- } else {
- rq = &q->post_flush_rq;
- end_io = post_flush_end_io;
- }
+/*
+ * Initialize proxy request and queue it.
+ */
+static struct request *queue_next_ordseq(struct request_queue *q)
+{
+ struct request *rq = &q->bar_rq;

blk_rq_init(q, rq);
- rq->cmd_type = REQ_TYPE_FS;
- rq->cmd_flags = REQ_HARDBARRIER | REQ_FLUSH;
- rq->rq_disk = q->orig_bar_rq->rq_disk;
- rq->end_io = end_io;
+
+ switch (blk_ordered_cur_seq(q)) {
+ case QUEUE_ORDSEQ_PREFLUSH:
+ init_flush_request(q, rq);
+ rq->end_io = pre_flush_end_io;
+ break;
+ case QUEUE_ORDSEQ_BAR:
+ init_request_from_bio(rq, q->orig_bar_rq->bio);
+ rq->cmd_flags &= ~REQ_HARDBARRIER;
+ if (q->cache_features & QUEUE_HAS_FUA)
+ rq->cmd_flags |= REQ_FUA;
+ rq->end_io = bar_end_io;
+ break;
+ case QUEUE_ORDSEQ_POSTFLUSH:
+ init_flush_request(q, rq);
+ rq->end_io = post_flush_end_io;
+ break;
+ default:
+ BUG();
+ }

elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
+ return rq;
}

-static inline bool start_ordered(struct request_queue *q, struct request **rqp)
+struct request *blk_do_ordered(struct request_queue *q, struct request *rq)
{
- struct request *rq = *rqp;
unsigned skip = 0;

- q->orderr = 0;
- q->ordered = q->next_ordered;
- q->ordseq |= QUEUE_ORDSEQ_STARTED;
+ if (rq->cmd_type != REQ_TYPE_FS)
+ return rq;
+ if (!(rq->cmd_flags & REQ_HARDBARRIER))
+ return rq;

- /*
- * For an empty barrier, there's no actual BAR request, which
- * in turn makes POSTFLUSH unnecessary. Mask them off.
- */
- if (!blk_rq_sectors(rq)) {
- q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
- QUEUE_ORDERED_DO_POSTFLUSH);
+ if (!(q->cache_features & QUEUE_HAS_FLUSH)) {
/*
- * Empty barrier on a write-through device w/ ordered
- * tag has no command to issue and without any command
- * to issue, ordering by tag can't be used. Drain
- * instead.
+ * No flush required. We can just send on write requests
+ * and complete cache flush requests ASAP.
*/
- if ((q->ordered & QUEUE_ORDERED_BY_TAG) &&
- !(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) {
- q->ordered &= ~QUEUE_ORDERED_BY_TAG;
- q->ordered |= QUEUE_ORDERED_BY_DRAIN;
+ if (blk_rq_sectors(rq)) {
+ rq->cmd_flags &= ~REQ_HARDBARRIER;
+ return rq;
}
+ blk_dequeue_request(rq);
+ __blk_end_request_all(rq, 0);
+ return NULL;
}

- /* stash away the original request */
- blk_dequeue_request(rq);
- q->orig_bar_rq = rq;
- rq = NULL;
-
- /*
- * Queue ordered sequence. As we stack them at the head, we
- * need to queue in reverse order. Note that we rely on that
- * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
- * request gets inbetween ordered sequence.
- */
- if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) {
- queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
- rq = &q->post_flush_rq;
- } else
- skip |= QUEUE_ORDSEQ_POSTFLUSH;
-
- if (q->ordered & QUEUE_ORDERED_DO_BAR) {
- rq = &q->bar_rq;
-
- /* initialize proxy request and queue it */
- blk_rq_init(q, rq);
- if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
- rq->cmd_flags |= REQ_WRITE;
- if (q->ordered & QUEUE_ORDERED_DO_FUA)
- rq->cmd_flags |= REQ_FUA;
- init_request_from_bio(rq, q->orig_bar_rq->bio);
- rq->end_io = bar_end_io;
-
- elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
- } else
- skip |= QUEUE_ORDSEQ_BAR;
-
- if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) {
- queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH);
- rq = &q->pre_flush_rq;
- } else
- skip |= QUEUE_ORDSEQ_PREFLUSH;
-
- if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q))
- rq = NULL;
- else
- skip |= QUEUE_ORDSEQ_DRAIN;
+ if (q->ordseq) {
+ /*
+ * Barrier is already in progress and they can't be
+ * processed in parallel. Queue for later processing.
+ */
+ list_move_tail(&rq->queuelist, &q->pending_barriers);
+ return NULL;
+ }

- *rqp = rq;

/*
- * Complete skipped sequences. If whole sequence is complete,
- * return false to tell elevator that this request is gone.
+ * Start a new ordered sequence
*/
- return !blk_ordered_complete_seq(q, skip, 0);
-}
-
-bool blk_do_ordered(struct request_queue *q, struct request **rqp)
-{
- struct request *rq = *rqp;
- const int is_barrier = rq->cmd_type == REQ_TYPE_FS &&
- (rq->cmd_flags & REQ_HARDBARRIER);
-
- if (!q->ordseq) {
- if (!is_barrier)
- return true;
-
- if (q->next_ordered != QUEUE_ORDERED_NONE)
- return start_ordered(q, rqp);
- else {
- /*
- * Queue ordering not supported. Terminate
- * with prejudice.
- */
- blk_dequeue_request(rq);
- __blk_end_request_all(rq, -EOPNOTSUPP);
- *rqp = NULL;
- return false;
- }
- }
+ q->orderr = 0;
+ q->ordseq |= QUEUE_ORDSEQ_STARTED;

/*
- * Ordered sequence in progress
+ * For an empty barrier, there's no actual BAR request, which
+ * in turn makes POSTFLUSH unnecessary. Mask them off.
*/
+ if (!blk_rq_sectors(rq))
+ skip |= (QUEUE_ORDSEQ_BAR|QUEUE_ORDSEQ_POSTFLUSH);
+ else if (q->cache_features & QUEUE_HAS_FUA)
+ skip |= QUEUE_ORDSEQ_POSTFLUSH;

- /* Special requests are not subject to ordering rules. */
- if (rq->cmd_type != REQ_TYPE_FS &&
- rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
- return true;
-
- if (q->ordered & QUEUE_ORDERED_BY_TAG) {
- /* Ordered by tag. Blocking the next barrier is enough. */
- if (is_barrier && rq != &q->bar_rq)
- *rqp = NULL;
- } else {
- /* Ordered by draining. Wait for turn. */
- WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
- if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
- *rqp = NULL;
- }
+ /* stash away the original request */
+ blk_dequeue_request(rq);
+ q->orig_bar_rq = rq;

- return true;
+ /* complete skipped sequences and return the first sequence */
+ return blk_ordered_complete_seq(q, skip, 0);
}

static void bio_end_empty_barrier(struct bio *bio, int err)
Index: linux-2.6/include/linux/blkdev.h
================================================== =================
--- linux-2.6.orig/include/linux/blkdev.h 2010-08-07 12:53:23.774479189 -0400
+++ linux-2.6/include/linux/blkdev.h 2010-08-07 14:51:42.751479190 -0400
@@ -354,13 +354,20 @@ struct request_queue
#ifdef CONFIG_BLK_DEV_IO_TRACE
struct blk_trace *blk_trace;
#endif
+
+ /*
+ * Features this queue understands.
+ */
+ unsigned int cache_features;
+
/*
* reserved for flush operations
*/
- unsigned int ordered, next_ordered, ordseq;
- int orderr, ordcolor;
- struct request pre_flush_rq, bar_rq, post_flush_rq;
- struct request *orig_bar_rq;
+ unsigned int ordseq;
+ int orderr;
+ struct request bar_rq;
+ struct request *orig_bar_rq;
+ struct list_head pending_barriers;

struct mutex sysfs_lock;

@@ -461,54 +468,12 @@ static inline void queue_flag_clear(unsi
__clear_bit(flag, &q->queue_flags);
}

+/*
+ * Possible features to control a volatile write cache.
+ */
enum {
- /*
- * Hardbarrier is supported with one of the following methods.
- *
- * NONE : hardbarrier unsupported
- * DRAIN : ordering by draining is enough
- * DRAIN_FLUSH : ordering by draining w/ pre and post flushes
- * DRAIN_FUA : ordering by draining w/ pre flush and FUA write
- * TAG : ordering by tag is enough
- * TAG_FLUSH : ordering by tag w/ pre and post flushes
- * TAG_FUA : ordering by tag w/ pre flush and FUA write
- */
- QUEUE_ORDERED_BY_DRAIN = 0x01,
- QUEUE_ORDERED_BY_TAG = 0x02,
- QUEUE_ORDERED_DO_PREFLUSH = 0x10,
- QUEUE_ORDERED_DO_BAR = 0x20,
- QUEUE_ORDERED_DO_POSTFLUSH = 0x40,
- QUEUE_ORDERED_DO_FUA = 0x80,
-
- QUEUE_ORDERED_NONE = 0x00,
-
- QUEUE_ORDERED_DRAIN = QUEUE_ORDERED_BY_DRAIN |
- QUEUE_ORDERED_DO_BAR,
- QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN |
- QUEUE_ORDERED_DO_PREFLUSH |
- QUEUE_ORDERED_DO_POSTFLUSH,
- QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN |
- QUEUE_ORDERED_DO_PREFLUSH |
- QUEUE_ORDERED_DO_FUA,
-
- QUEUE_ORDERED_TAG = QUEUE_ORDERED_BY_TAG |
- QUEUE_ORDERED_DO_BAR,
- QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG |
- QUEUE_ORDERED_DO_PREFLUSH |
- QUEUE_ORDERED_DO_POSTFLUSH,
- QUEUE_ORDERED_TAG_FUA = QUEUE_ORDERED_TAG |
- QUEUE_ORDERED_DO_PREFLUSH |
- QUEUE_ORDERED_DO_FUA,
-
- /*
- * Ordered operation sequence
- */
- QUEUE_ORDSEQ_STARTED = 0x01, /* flushing in progress */
- QUEUE_ORDSEQ_DRAIN = 0x02, /* waiting for the queue to be drained */
- QUEUE_ORDSEQ_PREFLUSH = 0x04, /* pre-flushing in progress */
- QUEUE_ORDSEQ_BAR = 0x08, /* original barrier req in progress */
- QUEUE_ORDSEQ_POSTFLUSH = 0x10, /* post-flushing in progress */
- QUEUE_ORDSEQ_DONE = 0x20,
+ QUEUE_HAS_FLUSH = 1 << 0, /* supports REQ_FLUSH */
+ QUEUE_HAS_FUA = 1 << 1, /* supports REQ_FUA */
};

#define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
@@ -879,11 +844,9 @@ extern void blk_queue_softirq_done(struc
extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
-extern int blk_queue_ordered(struct request_queue *, unsigned);
-extern bool blk_do_ordered(struct request_queue *, struct request **);
+extern int blk_queue_cache_features(struct request_queue *, unsigned);
extern unsigned blk_ordered_cur_seq(struct request_queue *);
extern unsigned blk_ordered_req_seq(struct request *);
-extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int);

extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
extern void blk_dump_rq_flags(struct request *, char *);
Index: linux-2.6/drivers/block/virtio_blk.c
================================================== =================
--- linux-2.6.orig/drivers/block/virtio_blk.c 2010-08-07 12:53:23.800479189 -0400
+++ linux-2.6/drivers/block/virtio_blk.c 2010-08-07 14:51:34.198479189 -0400
@@ -388,31 +388,8 @@ static int __devinit virtblk_probe(struc
vblk->disk->driverfs_dev = &vdev->dev;
index++;

- if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) {
- /*
- * If the FLUSH feature is supported we do have support for
- * flushing a volatile write cache on the host. Use that
- * to implement write barrier support.
- */
- blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
- } else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER)) {
- /*
- * If the BARRIER feature is supported the host expects us
- * to order request by tags. This implies there is not
- * volatile write cache on the host, and that the host
- * never re-orders outstanding I/O. This feature is not
- * useful for real life scenarious and deprecated.
- */
- blk_queue_ordered(q, QUEUE_ORDERED_TAG);
- } else {
- /*
- * If the FLUSH feature is not supported we must assume that
- * the host does not perform any kind of volatile write
- * caching. We still need to drain the queue to provider
- * proper barrier semantics.
- */
- blk_queue_ordered(q, QUEUE_ORDERED_DRAIN);
- }
+ if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
+ blk_queue_cache_features(q, QUEUE_HAS_FLUSH);

/* If disk is read-only in the host, the guest should obey */
if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
Index: linux-2.6/drivers/scsi/sd.c
================================================== =================
--- linux-2.6.orig/drivers/scsi/sd.c 2010-08-07 12:53:23.872479189 -0400
+++ linux-2.6/drivers/scsi/sd.c 2010-08-07 14:54:47.812479189 -0400
@@ -2109,7 +2109,7 @@ static int sd_revalidate_disk(struct gen
struct scsi_disk *sdkp = scsi_disk(disk);
struct scsi_device *sdp = sdkp->device;
unsigned char *buffer;
- unsigned ordered;
+ unsigned ordered = 0;

SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp,
"sd_revalidate_disk
"));
@@ -2151,17 +2151,14 @@ static int sd_revalidate_disk(struct gen

/*
* We now have all cache related info, determine how we deal
- * with ordered requests. Note that as the current SCSI
- * dispatch function can alter request order, we cannot use
- * QUEUE_ORDERED_TAG_* even when ordered tag is supported.
+ * with barriers.
*/
- if (sdkp->WCE)
- ordered = sdkp->DPOFUA
- ? QUEUE_ORDERED_DRAIN_FUA : QUEUE_ORDERED_DRAIN_FLUSH;
- else
- ordered = QUEUE_ORDERED_DRAIN;
-
- blk_queue_ordered(sdkp->disk->queue, ordered);
+ if (sdkp->WCE) {
+ ordered |= QUEUE_HAS_FLUSH;
+ if (sdkp->DPOFUA)
+ ordered |= QUEUE_HAS_FUA;
+ }
+ blk_queue_cache_features(sdkp->disk->queue, ordered);

set_capacity(disk, sdkp->capacity);
kfree(buffer);
Index: linux-2.6/block/blk-core.c
================================================== =================
--- linux-2.6.orig/block/blk-core.c 2010-08-07 12:53:23.744479189 -0400
+++ linux-2.6/block/blk-core.c 2010-08-07 14:56:35.087479189 -0400
@@ -520,6 +520,7 @@ struct request_queue *blk_alloc_queue_no
init_timer(&q->unplug_timer);
setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
INIT_LIST_HEAD(&q->timeout_list);
+ INIT_LIST_HEAD(&q->pending_barriers);
INIT_WORK(&q->unplug_work, blk_unplug_work);

kobject_init(&q->kobj, &blk_queue_ktype);
@@ -1037,22 +1038,6 @@ void blk_insert_request(struct request_q
}
EXPORT_SYMBOL(blk_insert_request);

-/*
- * add-request adds a request to the linked list.
- * queue lock is held and interrupts disabled, as we muck with the
- * request queue list.
- */
-static inline void add_request(struct request_queue *q, struct request *req)
-{
- drive_stat_acct(req, 1);
-
- /*
- * elevator indicated where it wants this request to be
- * inserted at elevator_merge time
- */
- __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
-}
-
static void part_round_stats_single(int cpu, struct hd_struct *part,
unsigned long now)
{
@@ -1201,13 +1186,9 @@ static int __make_request(struct request
const bool sync = (bio->bi_rw & REQ_SYNC);
const bool unplug = (bio->bi_rw & REQ_UNPLUG);
const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
+ int where = ELEVATOR_INSERT_SORT;
int rw_flags;

- if ((bio->bi_rw & REQ_HARDBARRIER) &&
- (q->next_ordered == QUEUE_ORDERED_NONE)) {
- bio_endio(bio, -EOPNOTSUPP);
- return 0;
- }
/*
* low level driver can indicate that it wants pages above a
* certain limit bounced to low memory (ie for highmem, or even
@@ -1217,7 +1198,12 @@ static int __make_request(struct request

spin_lock_irq(q->queue_lock);

- if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q))
+ if (bio->bi_rw & REQ_HARDBARRIER) {
+ where = ELEVATOR_INSERT_ORDERED;
+ goto get_rq;
+ }
+
+ if (elv_queue_empty(q))
goto get_rq;

el_ret = elv_merge(q, &req, bio);
@@ -1314,7 +1300,10 @@ get_rq:
req->cpu = blk_cpu_to_group(smp_processor_id());
if (queue_should_plug(q) && elv_queue_empty(q))
blk_plug_device(q);
- add_request(q, req);
+
+ /* insert the request into the elevator */
+ drive_stat_acct(req, 1);
+ __elv_add_request(q, req, where, 0);
out:
if (unplug || !queue_should_plug(q))
__generic_unplug_device(q);
Index: linux-2.6/block/elevator.c
================================================== =================
--- linux-2.6.orig/block/elevator.c 2010-08-07 12:53:23.752479189 -0400
+++ linux-2.6/block/elevator.c 2010-08-07 12:53:53.162479190 -0400
@@ -564,7 +564,7 @@ void elv_requeue_request(struct request_

rq->cmd_flags &= ~REQ_STARTED;

- elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE);
+ elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
}

void elv_drain_elevator(struct request_queue *q)
@@ -611,8 +611,6 @@ void elv_quiesce_end(struct request_queu

void elv_insert(struct request_queue *q, struct request *rq, int where)
{
- struct list_head *pos;
- unsigned ordseq;
int unplug_it = 1;

trace_block_rq_insert(q, rq);
@@ -622,10 +620,14 @@ void elv_insert(struct request_queue *q,
switch (where) {
case ELEVATOR_INSERT_FRONT:
rq->cmd_flags |= REQ_SOFTBARRIER;
-
list_add(&rq->queuelist, &q->queue_head);
break;

+ case ELEVATOR_INSERT_ORDERED:
+ rq->cmd_flags |= REQ_SOFTBARRIER;
+ list_add_tail(&rq->queuelist, &q->queue_head);
+ break;
+
case ELEVATOR_INSERT_BACK:
rq->cmd_flags |= REQ_SOFTBARRIER;
elv_drain_elevator(q);
@@ -662,36 +664,6 @@ void elv_insert(struct request_queue *q,
q->elevator->ops->elevator_add_req_fn(q, rq);
break;

- case ELEVATOR_INSERT_REQUEUE:
- /*
- * If ordered flush isn't in progress, we do front
- * insertion; otherwise, requests should be requeued
- * in ordseq order.
- */
- rq->cmd_flags |= REQ_SOFTBARRIER;
-
- /*
- * Most requeues happen because of a busy condition,
- * don't force unplug of the queue for that case.
- */
- unplug_it = 0;
-
- if (q->ordseq == 0) {
- list_add(&rq->queuelist, &q->queue_head);
- break;
- }
-
- ordseq = blk_ordered_req_seq(rq);
-
- list_for_each(pos, &q->queue_head) {
- struct request *pos_rq = list_entry_rq(pos);
- if (ordseq <= blk_ordered_req_seq(pos_rq))
- break;
- }
-
- list_add_tail(&rq->queuelist, pos);
- break;
-
default:
printk(KERN_ERR "%s: bad insertion point %d
",
__func__, where);
@@ -710,33 +682,15 @@ void elv_insert(struct request_queue *q,
void __elv_add_request(struct request_queue *q, struct request *rq, int where,
int plug)
{
- if (q->ordcolor)
- rq->cmd_flags |= REQ_ORDERED_COLOR;
-
if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
- /*
- * toggle ordered color
- */
- if (rq->cmd_flags & REQ_HARDBARRIER)
- q->ordcolor ^= 1;
-
- /*
- * barriers implicitly indicate back insertion
- */
- if (where == ELEVATOR_INSERT_SORT)
- where = ELEVATOR_INSERT_BACK;
-
- /*
- * this request is scheduling boundary, update
- * end_sector
- */
+ /* barriers are scheduling boundary, update end_sector */
if (rq->cmd_type == REQ_TYPE_FS ||
(rq->cmd_flags & REQ_DISCARD)) {
q->end_sector = rq_end_sector(rq);
q->boundary_rq = rq;
}
} else if (!(rq->cmd_flags & REQ_ELVPRIV) &&
- where == ELEVATOR_INSERT_SORT)
+ where == ELEVATOR_INSERT_SORT)
where = ELEVATOR_INSERT_BACK;

if (plug)
@@ -849,24 +803,6 @@ void elv_completed_request(struct reques
e->ops->elevator_completed_req_fn)
e->ops->elevator_completed_req_fn(q, rq);
}
-
- /*
- * Check if the queue is waiting for fs requests to be
- * drained for flush sequence.
- */
- if (unlikely(q->ordseq)) {
- struct request *next = NULL;
-
- if (!list_empty(&q->queue_head))
- next = list_entry_rq(q->queue_head.next);
-
- if (!queue_in_flight(q) &&
- blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
- (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) {
- blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
- __blk_run_queue(q);
- }
- }
}

#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
Index: linux-2.6/block/blk.h
================================================== =================
--- linux-2.6.orig/block/blk.h 2010-08-07 12:53:23.762479189 -0400
+++ linux-2.6/block/blk.h 2010-08-07 12:53:53.171479190 -0400
@@ -51,6 +51,8 @@ static inline void blk_clear_rq_complete
*/
#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))

+struct request *blk_do_ordered(struct request_queue *q, struct request *rq);
+
static inline struct request *__elv_next_request(struct request_queue *q)
{
struct request *rq;
@@ -58,7 +60,8 @@ static inline struct request *__elv_next
while (1) {
while (!list_empty(&q->queue_head)) {
rq = list_entry_rq(q->queue_head.next);
- if (blk_do_ordered(q, &rq))
+ rq = blk_do_ordered(q, rq);
+ if (rq)
return rq;
}

Index: linux-2.6/drivers/block/xen-blkfront.c
================================================== =================
--- linux-2.6.orig/drivers/block/xen-blkfront.c 2010-08-07 12:53:23.807479189 -0400
+++ linux-2.6/drivers/block/xen-blkfront.c 2010-08-07 14:44:39.564479189 -0400
@@ -417,30 +417,6 @@ static int xlvbd_init_blk_queue(struct g
return 0;
}

-
-static int xlvbd_barrier(struct blkfront_info *info)
-{
- int err;
- const char *barrier;
-
- switch (info->feature_barrier) {
- case QUEUE_ORDERED_DRAIN: barrier = "enabled (drain)"; break;
- case QUEUE_ORDERED_TAG: barrier = "enabled (tag)"; break;
- case QUEUE_ORDERED_NONE: barrier = "disabled"; break;
- default: return -EINVAL;
- }
-
- err = blk_queue_ordered(info->rq, info->feature_barrier);
-
- if (err)
- return err;
-
- printk(KERN_INFO "blkfront: %s: barriers %s
",
- info->gd->disk_name, barrier);
- return 0;
-}
-
-
static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
struct blkfront_info *info,
u16 vdisk_info, u16 sector_size)
@@ -516,8 +492,6 @@ static int xlvbd_alloc_gendisk(blkif_sec
info->rq = gd->queue;
info->gd = gd;

- xlvbd_barrier(info);
-
if (vdisk_info & VDISK_READONLY)
set_disk_ro(gd, 1);

@@ -662,8 +636,6 @@ static irqreturn_t blkif_interrupt(int i
printk(KERN_WARNING "blkfront: %s: write barrier op failed
",
info->gd->disk_name);
error = -EOPNOTSUPP;
- info->feature_barrier = QUEUE_ORDERED_NONE;
- xlvbd_barrier(info);
}
/* fall through */
case BLKIF_OP_READ:
@@ -1073,24 +1045,6 @@ static void blkfront_connect(struct blkf
"feature-barrier", "%lu", &barrier,
NULL);

- /*
- * If there's no "feature-barrier" defined, then it means
- * we're dealing with a very old backend which writes
- * synchronously; draining will do what needs to get done.
- *
- * If there are barriers, then we can do full queued writes
- * with tagged barriers.
- *
- * If barriers are not supported, then there's no much we can
- * do, so just set ordering to NONE.
- */
- if (err)
- info->feature_barrier = QUEUE_ORDERED_DRAIN;
- else if (barrier)
- info->feature_barrier = QUEUE_ORDERED_TAG;
- else
- info->feature_barrier = QUEUE_ORDERED_NONE;
-
err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
if (err) {
xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
Index: linux-2.6/drivers/ide/ide-disk.c
================================================== =================
--- linux-2.6.orig/drivers/ide/ide-disk.c 2010-08-07 12:53:23.889479189 -0400
+++ linux-2.6/drivers/ide/ide-disk.c 2010-08-07 15:00:30.215479189 -0400
@@ -518,12 +518,13 @@ static int ide_do_setfeature(ide_drive_t

static void update_ordered(ide_drive_t *drive)
{
- u16 *id = drive->id;
- unsigned ordered = QUEUE_ORDERED_NONE;
+ unsigned ordered = 0;

if (drive->dev_flags & IDE_DFLAG_WCACHE) {
+ u16 *id = drive->id;
unsigned long long capacity;
int barrier;
+
/*
* We must avoid issuing commands a drive does not
* understand or we may crash it. We check flush cache
@@ -543,13 +544,18 @@ static void update_ordered(ide_drive_t *
drive->name, barrier ? "" : "not ");

if (barrier) {
- ordered = QUEUE_ORDERED_DRAIN_FLUSH;
+ printk(KERN_INFO "%s: cache flushes supported
",
+ drive->name);
blk_queue_prep_rq(drive->queue, idedisk_prep_fn);
+ ordered |= QUEUE_HAS_FLUSH;
+ } else {
+ printk(KERN_INFO
+ "%s: WARNING: cache flushes not supported
",
+ drive->name);
}
- } else
- ordered = QUEUE_ORDERED_DRAIN;
+ }

- blk_queue_ordered(drive->queue, ordered);
+ blk_queue_cache_features(drive->queue, ordered);
}

ide_devset_get_flag(wcache, IDE_DFLAG_WCACHE);
Index: linux-2.6/drivers/md/dm.c
================================================== =================
--- linux-2.6.orig/drivers/md/dm.c 2010-08-07 12:53:23.905479189 -0400
+++ linux-2.6/drivers/md/dm.c 2010-08-07 14:51:38.240479189 -0400
@@ -1908,7 +1908,7 @@ static struct mapped_device *alloc_dev(i
blk_queue_softirq_done(md->queue, dm_softirq_done);
blk_queue_prep_rq(md->queue, dm_prep_fn);
blk_queue_lld_busy(md->queue, dm_lld_busy);
- blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH);
+ blk_queue_cache_features(md->queue, QUEUE_HAS_FLUSH);

md->disk = alloc_disk(1);
if (!md->disk)
Index: linux-2.6/drivers/mmc/card/queue.c
================================================== =================
--- linux-2.6.orig/drivers/mmc/card/queue.c 2010-08-07 12:53:23.927479189 -0400
+++ linux-2.6/drivers/mmc/card/queue.c 2010-08-07 14:30:09.666479189 -0400
@@ -128,7 +128,6 @@ int mmc_init_queue(struct mmc_queue *mq,
mq->req = NULL;

blk_queue_prep_rq(mq->queue, mmc_prep_request);
- blk_queue_ordered(mq->queue, QUEUE_ORDERED_DRAIN);
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue);

#ifdef CONFIG_MMC_BLOCK_BOUNCE
Index: linux-2.6/drivers/s390/block/dasd.c
================================================== =================
--- linux-2.6.orig/drivers/s390/block/dasd.c 2010-08-07 12:53:23.939479189 -0400
+++ linux-2.6/drivers/s390/block/dasd.c 2010-08-07 14:30:13.307479189 -0400
@@ -2197,7 +2197,6 @@ static void dasd_setup_queue(struct dasd
*/
blk_queue_max_segment_size(block->request_queue, PAGE_SIZE);
blk_queue_segment_boundary(block->request_queue, PAGE_SIZE - 1);
- blk_queue_ordered(block->request_queue, QUEUE_ORDERED_DRAIN);
}

/*
Index: linux-2.6/include/linux/elevator.h
================================================== =================
--- linux-2.6.orig/include/linux/elevator.h 2010-08-07 12:53:23.781479189 -0400
+++ linux-2.6/include/linux/elevator.h 2010-08-07 12:53:53.208479190 -0400
@@ -162,9 +162,9 @@ extern struct request *elv_rb_find(struc
* Insertion selection
*/
#define ELEVATOR_INSERT_FRONT 1
-#define ELEVATOR_INSERT_BACK 2
-#define ELEVATOR_INSERT_SORT 3
-#define ELEVATOR_INSERT_REQUEUE 4
+#define ELEVATOR_INSERT_ORDERED 2
+#define ELEVATOR_INSERT_BACK 3
+#define ELEVATOR_INSERT_SORT 4

/*
* return values from elevator_may_queue_fn
Index: linux-2.6/drivers/block/pktcdvd.c
================================================== =================
--- linux-2.6.orig/drivers/block/pktcdvd.c 2010-08-07 12:53:23.815479189 -0400
+++ linux-2.6/drivers/block/pktcdvd.c 2010-08-07 12:53:53.211479190 -0400
@@ -753,7 +753,6 @@ static int pkt_generic_packet(struct pkt

rq->timeout = 60*HZ;
rq->cmd_type = REQ_TYPE_BLOCK_PC;
- rq->cmd_flags |= REQ_HARDBARRIER;
if (cgc->quiet)
rq->cmd_flags |= REQ_QUIET;

Index: linux-2.6/drivers/block/brd.c
================================================== =================
--- linux-2.6.orig/drivers/block/brd.c 2010-08-07 12:53:23.825479189 -0400
+++ linux-2.6/drivers/block/brd.c 2010-08-07 14:26:12.293479191 -0400
@@ -482,7 +482,6 @@ static struct brd_device *brd_alloc(int
if (!brd->brd_queue)
goto out_free_dev;
blk_queue_make_request(brd->brd_queue, brd_make_request);
- blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG);
blk_queue_max_hw_sectors(brd->brd_queue, 1024);
blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);

Index: linux-2.6/drivers/block/loop.c
================================================== =================
--- linux-2.6.orig/drivers/block/loop.c 2010-08-07 12:53:23.836479189 -0400
+++ linux-2.6/drivers/block/loop.c 2010-08-07 14:51:27.937479189 -0400
@@ -831,8 +831,8 @@ static int loop_set_fd(struct loop_devic
lo->lo_queue->queuedata = lo;
lo->lo_queue->unplug_fn = loop_unplug;

- if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
- blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN);
+ /* XXX(hch): loop can't properly deal with flush requests currently */
+// blk_queue_cache_features(lo->lo_queue, QUEUE_HAS_FLUSH);

set_capacity(lo->lo_disk, size);
bd_set_size(bdev, size << 9);
Index: linux-2.6/drivers/block/osdblk.c
================================================== =================
--- linux-2.6.orig/drivers/block/osdblk.c 2010-08-07 12:53:23.843479189 -0400
+++ linux-2.6/drivers/block/osdblk.c 2010-08-07 14:51:30.091479189 -0400
@@ -439,7 +439,7 @@ static int osdblk_init_disk(struct osdbl
blk_queue_stack_limits(q, osd_request_queue(osdev->osd));

blk_queue_prep_rq(q, blk_queue_start_tag);
- blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
+ blk_queue_cache_features(q, QUEUE_HAS_FLUSH);

disk->queue = q;

Index: linux-2.6/drivers/block/ps3disk.c
================================================== =================
--- linux-2.6.orig/drivers/block/ps3disk.c 2010-08-07 12:53:23.859479189 -0400
+++ linux-2.6/drivers/block/ps3disk.c 2010-08-07 14:51:32.204479189 -0400
@@ -468,7 +468,7 @@ static int __devinit ps3disk_probe(struc
blk_queue_dma_alignment(queue, dev->blk_size-1);
blk_queue_logical_block_size(queue, dev->blk_size);

- blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH);
+ blk_queue_cache_features(queue, QUEUE_HAS_FLUSH);

blk_queue_max_segments(queue, -1);
blk_queue_max_segment_size(queue, dev->bounce_size);
Index: linux-2.6/include/linux/blk_types.h
================================================== =================
--- linux-2.6.orig/include/linux/blk_types.h 2010-08-07 12:53:23.793479189 -0400
+++ linux-2.6/include/linux/blk_types.h 2010-08-07 12:53:53.243479190 -0400
@@ -141,7 +141,6 @@ enum rq_flag_bits {
__REQ_FAILED, /* set if the request failed */
__REQ_QUIET, /* don't worry about errors */
__REQ_PREEMPT, /* set for "ide_preempt" requests */
- __REQ_ORDERED_COLOR, /* is before or after barrier */
__REQ_ALLOCED, /* request came from our alloc pool */
__REQ_COPY_USER, /* contains copies of user pages */
__REQ_INTEGRITY, /* integrity metadata has been remapped */
@@ -181,7 +180,6 @@ enum rq_flag_bits {
#define REQ_FAILED (1 << __REQ_FAILED)
#define REQ_QUIET (1 << __REQ_QUIET)
#define REQ_PREEMPT (1 << __REQ_PREEMPT)
-#define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR)
#define REQ_ALLOCED (1 << __REQ_ALLOCED)
#define REQ_COPY_USER (1 << __REQ_COPY_USER)
#define REQ_INTEGRITY (1 << __REQ_INTEGRITY)

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
 
Old 08-09-2010, 02:50 PM
Tejun Heo
 
Default relaxed barriers

On 08/08/2010 04:31 PM, Christoph Hellwig wrote:
> On Sat, Aug 07, 2010 at 12:13:06PM +0200, Tejun Heo wrote:
>> The patch was on top of v2.6.35 but was generated against dirty tree
>> and wouldn't apply cleanly. Here's the proper one.
>
> Here's an updated version:
>
> (a) ported to Jens' current block tree
> (b) optimize barriers to be no-ops on devices not requiring flushes
> (c) redo the blk_queue_ordered interface to just set QUEUE_HAS_FLUSH
> and QUEUE_HAS_FUA flags.

Nice. I'm working on a properly split patchset implementing a
REQ_FLUSH/FUA based interface, which replaces REQ_HARDBARRIER. An empty
request w/ REQ_FLUSH just flushes the cache and has no other ordering
restrictions. REQ_FLUSH + data means preflush + data write. REQ_FUA
+ data means the data will be committed to NV media on completion.
REQ_FLUSH + FUA + data means preflush + NV data write. All FLUSH/FUA
requests w/ data are ordered only against each other. I think I'll be
able to post in several days.
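
As a rough sketch of what that looks like from the filesystem side
(purely illustrative, not part of the patchset; the helper name is an
assumption, only the flag semantics come from the description above), a
journal commit provides its own ordering by waiting for the data writes
and then tagging the commit record with FLUSH/FUA:

#include <linux/bio.h>
#include <linux/fs.h>

/*
 * Illustrative commit path under the proposed semantics: the filesystem,
 * not the block layer, provides the ordering by waiting for its own
 * writes before issuing the commit record.
 */
static void example_commit_record(struct bio *commit_bio)
{
	/* 1. submit all data/metadata writes and wait for them (not shown) */

	/*
	 * 2. REQ_FLUSH forces the already-completed writes out of the
	 *    volatile cache before the commit record is written, and
	 *    REQ_FUA makes the commit record itself reach NV media before
	 *    the bio completes.  No draining of unrelated requests needed.
	 */
	commit_bio->bi_rw |= REQ_FLUSH | REQ_FUA;
	submit_bio(WRITE, commit_bio);
}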

Thanks.

--
tejun

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
 
