diff options
| -rw-r--r-- | Documentation/block/row-iosched.txt | 117 | ||||
| -rw-r--r-- | block/blk-core.c | 36 | ||||
| -rw-r--r-- | block/elevator.c | 4 | ||||
| -rw-r--r-- | block/row-iosched.c | 12 |
4 files changed, 154 insertions, 15 deletions
diff --git a/Documentation/block/row-iosched.txt b/Documentation/block/row-iosched.txt new file mode 100644 index 000000000..987bd8834 --- /dev/null +++ b/Documentation/block/row-iosched.txt @@ -0,0 +1,117 @@ +Introduction +============ + +The ROW scheduling algorithm will be used in mobile devices as default +block layer IO scheduling algorithm. ROW stands for "READ Over WRITE" +which is the main requests dispatch policy of this algorithm. + +The ROW IO scheduler was developed with the mobile devices' needs in +mind. In mobile devices we favor user experience above everything else, +thus we want to give READ IO requests as much priority as possible. +The main idea of the ROW scheduling policy is: +If there are READ requests in the pipe - dispatch them but don't starve +the WRITE requests too much. + +Software description +==================== +The requests are kept in queues according to their priority. The +dispatching of requests is done in a Round Robin manner with a +different slice for each queue. The dispatch quantum for a specific +queue is defined according to the queue's priority. READ queues are +given a bigger dispatch quantum than the WRITE queues, within a dispatch +cycle. + +At the moment there are 6 types of queues the requests are +distributed to: +- High priority READ queue +- High priority Synchronous WRITE queue +- Regular priority READ queue +- Regular priority Synchronous WRITE queue +- Regular priority WRITE queue +- Low priority READ queue + +If in a certain dispatch cycle one of the queues was empty and didn't +use its quantum that queue will be marked as "un-served". If we're in the +middle of a dispatch cycle dispatching from queue Y and a request +arrives for queue X that was un-served in the previous cycle, if X's +priority is higher than Y's, queue Y will be preempted in favor of +queue X. This won't mean that the cycle is restarted. The "dispatched" +counter of queue Y will remain unchanged. 
Once queue X uses up its quantum +(or there are no more requests left on it) we'll switch back to queue Y +and allow it to finish its quantum. + +For READ request queues we allow idling within a dispatch quantum in +order to give the application a chance to insert more requests. Idling +means adding some extra time for serving a certain queue even if the +queue is empty. The idling is enabled if we identify that the application is +inserting requests at a high frequency. + +For idling on READ queues we use a timer mechanism. When the timer expires, +if there are requests in the scheduler we will signal the underlying driver +(for example the MMC driver) to fetch another request for dispatch. + +The ROW algorithm takes the scheduling policy one step further, making +it a bit more "user-needs oriented", by allowing the application to +hint on the urgency of its requests. For example: even among the READ +requests several requests may be more urgent for completion than others. +The former will go to the High priority READ queue, which is given a +bigger dispatch quantum than any other queue. + +The ROW scheduler will support special services for block devices that +support High Priority Requests. That is, the scheduler may inform the +device upon urgent requests using the new callback make_urgent_request. +In addition it will support rescheduling of requests that were +interrupted. For example, if the device issues a long write request and +a sudden high priority read interrupt pops in, the scheduler will +inform the device about the urgent request, so the device can stop the +current write request and serve the high priority read request. In such +a case the device may also send back to the scheduler the remainder of +the interrupted write request, such that the scheduler may continue +sending high priority requests without the need to interrupt the +ongoing write again and again. The write remainder will be sent later on +according to the scheduler policy. 
+ +Design +====== +Existing algorithms (cfq, deadline) sort the IO requests according to LBA. +When deciding on the next request to dispatch they choose the closest +request to the current disk head position (from handling the last +dispatched request). This is done in order to reduce the disk head +movement to a minimum. +We feel that this functionality isn't really needed in mobile devices. +Usually applications that write/read large chunks of data insert the +requests in already sorted LBA order. Thus dealing with sort trees adds +unnecessary complexity. + +We're planning to try this enhancement in the future to check if the +performance is influenced by it. + +SMP/multi-core +============== +At the moment the code is accessed from 2 contexts: +- Application context (from block/elevator layer): adding the requests. +- Underlying driver context (for example the mmc driver thread): dispatching + the requests and notifying on completion. + +One lock is used to synchronize between the two. This lock is provided +by the underlying driver along with the dispatch queue. + +Config options +============== +1. hp_read_quantum: dispatch quantum for the high priority READ queue +2. rp_read_quantum: dispatch quantum for the regular priority READ queue +3. hp_swrite_quantum: dispatch quantum for the high priority Synchronous + WRITE queue +4. rp_swrite_quantum: dispatch quantum for the regular priority + Synchronous WRITE queue +5. rp_write_quantum: dispatch quantum for the regular priority WRITE + queue +6. lp_read_quantum: dispatch quantum for the low priority READ queue +7. lp_swrite_quantum: dispatch quantum for the low priority Synchronous + WRITE queue +8. read_idle: how long to idle on read queue in Msec (in case idling + is enabled on that queue). +9. read_idle_freq: frequency of inserting READ requests that will + trigger idling. 
This is the time in Msec between inserting two READ + requests + diff --git a/block/blk-core.c b/block/blk-core.c index 14a419c50..40cb3916c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -307,16 +307,20 @@ inline void __blk_run_queue_uncond(struct request_queue *q) * number of active request_fn invocations such that blk_drain_queue() * can wait until all these request_fn calls have finished. */ - q->request_fn_active++; + if (!q->notified_urgent && q->elevator->type->ops.elevator_is_urgent_fn && q->urgent_request_fn && q->elevator->type->ops.elevator_is_urgent_fn(q)) { q->notified_urgent = true; + q->request_fn_active++; q->urgent_request_fn(q); - } else + q->request_fn_active--; + } else { + q->request_fn_active++; q->request_fn(q); - q->request_fn_active--; + q->request_fn_active--; + } } /** @@ -1226,6 +1230,16 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) BUG_ON(blk_queued_rq(rq)); + if (rq->cmd_flags & REQ_URGENT) { + /* + * It's not compliant with the design to re-insert + * urgent requests. We want to be able to track this + * down. + */ + pr_err("%s(): requeueing an URGENT request", __func__); + WARN_ON(!q->dispatched_urgent); + q->dispatched_urgent = false; + } elv_requeue_request(q, rq); } EXPORT_SYMBOL(blk_requeue_request); @@ -1249,10 +1263,20 @@ int blk_reinsert_request(struct request_queue *q, struct request *rq) blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); - if (blk_rq_tagged(rq)) + if (rq->cmd_flags & REQ_QUEUED) blk_queue_end_tag(q, rq); BUG_ON(blk_queued_rq(rq)); + if (rq->cmd_flags & REQ_URGENT) { + /* + * It's not compliant with the design to re-insert + * urgent requests. We want to be able to track this + * down. 
+ */ + pr_err("%s(): reinserting an URGENT request", __func__); + WARN_ON(!q->dispatched_urgent); + q->dispatched_urgent = false; + } return elv_reinsert_request(q, rq); } @@ -2226,6 +2250,10 @@ struct request *blk_peek_request(struct request_queue *q) * not be passed by new incoming requests */ rq->cmd_flags |= REQ_STARTED; + if (rq->cmd_flags & REQ_URGENT) { + WARN_ON(q->dispatched_urgent); + q->dispatched_urgent = true; + } trace_block_rq_issue(q, rq); } diff --git a/block/elevator.c b/block/elevator.c index 445718cc6..a4d6e54fe 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -786,10 +786,10 @@ void elv_completed_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (test_bit(REQ_ATOM_URGENT, &rq->atomic_flags)) { + if (rq->cmd_flags & REQ_URGENT) { q->notified_urgent = false; + WARN_ON(!q->dispatched_urgent); q->dispatched_urgent = false; - blk_clear_rq_urgent(rq); } /* * request is released from the driver, io must be done diff --git a/block/row-iosched.c b/block/row-iosched.c index 666f4db7f..9d4ee917e 100644 --- a/block/row-iosched.c +++ b/block/row-iosched.c @@ -1,7 +1,7 @@ /* * ROW (Read Over Write) I/O scheduler. * - * Copyright (c) 2012-2013, The Linux Foundation. All rights reserved. + * Copyright (c) 2012-2014, The Linux Foundation. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -331,10 +331,6 @@ static void row_add_request(struct request_queue *q, struct row_queue *rqueue = RQ_ROWQ(rq); s64 diff_ms; bool queue_was_empty = list_empty(&rqueue->fifo); - unsigned long bv_page_flags = 0; - - if (rq->bio && rq->bio->bi_io_vec && rq->bio->bi_io_vec->bv_page) - bv_page_flags = rq->bio->bi_io_vec->bv_page->flags; list_add_tail(&rq->queuelist, &rqueue->fifo); rd->nr_reqs[rq_data_dir(rq)]++; @@ -367,9 +363,7 @@ static void row_add_request(struct request_queue *q, rqueue->idle_data.begin_idling = false; return; } - - if ((bv_page_flags & (1L << PG_readahead)) || - (diff_ms < rd->rd_idle_data.freq_ms)) { + if (diff_ms < rd->rd_idle_data.freq_ms) { rqueue->idle_data.begin_idling = true; row_log_rowq(rd, rqueue->prio, "Enable idling"); } else { @@ -806,7 +800,6 @@ static int row_init_queue(struct request_queue *q, struct elevator_type *e) return -ENOMEM; } eq->elevator_data = rdata; - memset(rdata, 0, sizeof(*rdata)); for (i = 0; i < ROWQ_MAX_PRIO; i++) { INIT_LIST_HEAD(&rdata->row_queues[i].fifo); @@ -837,6 +830,7 @@ static int row_init_queue(struct request_queue *q, struct elevator_type *e) rdata->last_served_ioprio_class = IOPRIO_CLASS_NONE; rdata->rd_idle_data.idling_queue_idx = ROWQ_MAX_PRIO; rdata->dispatch_queue = q; + spin_lock_irq(q->queue_lock); q->elevator = eq; spin_unlock_irq(q->queue_lock); |
