diff options
| -rw-r--r-- | Documentation/block/row-iosched.txt | 117 | ||||
| -rw-r--r-- | block/blk-core.c | 36 | ||||
| -rw-r--r-- | block/elevator.c | 4 | ||||
| -rw-r--r-- | block/row-iosched.c | 12 |
4 files changed, 154 insertions, 15 deletions
diff --git a/Documentation/block/row-iosched.txt b/Documentation/block/row-iosched.txt new file mode 100644 index 000000000..987bd8834 --- /dev/null +++ b/Documentation/block/row-iosched.txt @@ -0,0 +1,117 @@ +Introduction +============ + +The ROW scheduling algorithm will be used in mobile devices as default +block layer IO scheduling algorithm. ROW stands for "READ Over WRITE" +which is the main requests dispatch policy of this algorithm. + +The ROW IO scheduler was developed with the mobile devices' needs in +mind. In mobile devices we favor user experience above everything else, +thus we want to give READ IO requests as much priority as possible. +The main idea of the ROW scheduling policy is: +If there are READ requests in the pipe - dispatch them but don't starve +the WRITE requests too much. + +Software description +==================== +The requests are kept in queues according to their priority. The +dispatching of requests is done in a Round Robin manner with a +different slice for each queue. The dispatch quantum for a specific +queue is defined according to the queue's priority. READ queues are +given a bigger dispatch quantum than the WRITE queues, within a dispatch +cycle. + +At the moment there are 6 types of queues the requests are +distributed to: +- High priority READ queue +- High priority Synchronous WRITE queue +- Regular priority READ queue +- Regular priority Synchronous WRITE queue +- Regular priority WRITE queue +- Low priority READ queue + +If in a certain dispatch cycle one of the queues was empty and didn't +use its quantum that queue will be marked as "un-served". If we're in the +middle of a dispatch cycle dispatching from queue Y and a request +arrives for queue X that was un-served in the previous cycle, if X's +priority is higher than Y's, queue Y will be preempted in favor of +queue X. This won't mean that the cycle is restarted. The "dispatched" +counter of queue Y will remain unchanged. 
Once queue X uses up its quantum +(or there are no more requests left on it) we'll switch back to queue Y +and allow it to finish its quantum. + +For READ request queues we allow idling within a dispatch quantum in +order to give the application a chance to insert more requests. Idling +means adding some extra time for serving a certain queue even if the +queue is empty. The idling is enabled if we identify that the application is +inserting requests at a high frequency. + +For idling on READ queues we use a timer mechanism. When the timer expires, +if there are requests in the scheduler we will signal the underlying driver +(for example the MMC driver) to fetch another request for dispatch. + +The ROW algorithm takes the scheduling policy one step further, making +it a bit more "user-needs oriented", by allowing the application to +hint on the urgency of its requests. For example: even among the READ +requests several requests may be more urgent for completion than others. +The former will go to the High priority READ queue, which is given a +bigger dispatch quantum than any other queue. + +The ROW scheduler will support special services for block devices that +support High Priority Requests. That is, the scheduler may inform the +device upon urgent requests using the new callback make_urgent_request. +In addition it will support rescheduling of requests that were +interrupted. For example, if the device issues a long write request and +a sudden high priority read interrupt pops in, the scheduler will +inform the device about the urgent request, so the device can stop the +current write request and serve the high priority read request. In such +a case the device may also send back to the scheduler the remainder of +the interrupted write request, such that the scheduler may continue +sending high priority requests without the need to interrupt the +ongoing write again and again. The write remainder will be sent later on +according to the scheduler policy. 
+ +Design +====== +Existing algorithms (cfq, deadline) sort the IO requests according to LBA. +When deciding on the next request to dispatch they choose the closest +request to the current disk head position (from handling the last +dispatched request). This is done in order to reduce the disk head +movement to a minimum. +We feel that this functionality isn't really needed in mobile devices. +Usually applications that write/read large chunks of data insert the +requests in already sorted LBA order. Thus dealing with sort trees adds +unnecessary complexity. + +We're planning to try this enhancement in the future to check if the +performance is influenced by it. + +SMP/multi-core +============== +At the moment the code is accessed from 2 contexts: +- Application context (from block/elevator layer): adding the requests. +- Underlying driver context (for example the mmc driver thread): dispatching + the requests and notifying on completion. + +One lock is used to synchronize between the two. This lock is provided +by the underlying driver along with the dispatch queue. + +Config options +============== +1. hp_read_quantum: dispatch quantum for the high priority READ queue +2. rp_read_quantum: dispatch quantum for the regular priority READ queue +3. hp_swrite_quantum: dispatch quantum for the high priority Synchronous + WRITE queue +4. rp_swrite_quantum: dispatch quantum for the regular priority + Synchronous WRITE queue +5. rp_write_quantum: dispatch quantum for the regular priority WRITE + queue +6. lp_read_quantum: dispatch quantum for the low priority READ queue +7. lp_swrite_quantum: dispatch quantum for the low priority Synchronous + WRITE queue +8. read_idle: how long to idle on read queue in Msec (in case idling + is enabled on that queue). +9. read_idle_freq: frequency of inserting READ requests that will + trigger idling. 
This is the time in Msec between inserting two READ + requests + diff --git a/block/blk-core.c b/block/blk-core.c index 14a419c50..40cb3916c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -307,16 +307,20 @@ inline void __blk_run_queue_uncond(struct request_queue *q) * number of active request_fn invocations such that blk_drain_queue() * can wait until all these request_fn calls have finished. */ - q->request_fn_active++; + if (!q->notified_urgent && q->elevator->type->ops.elevator_is_urgent_fn && q->urgent_request_fn && q->elevator->type->ops.elevator_is_urgent_fn(q)) { q->notified_urgent = true; + q->request_fn_active++; q->urgent_request_fn(q); - } else + q->request_fn_active--; + } else { + q->request_fn_active++; q->request_fn(q); - q->request_fn_active--; + q->request_fn_active--; + } } /** @@ -1226,6 +1230,16 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) BUG_ON(blk_queued_rq(rq)); + if (rq->cmd_flags & REQ_URGENT) { + /* + * It's not compliant with the design to re-insert + * urgent requests. We want to be able to track this + * down. + */ + pr_err("%s(): requeueing an URGENT request", __func__); + WARN_ON(!q->dispatched_urgent); + q->dispatched_urgent = false; + } elv_requeue_request(q, rq); } EXPORT_SYMBOL(blk_requeue_request); @@ -1249,10 +1263,20 @@ int blk_reinsert_request(struct request_queue *q, struct request *rq) blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); - if (blk_rq_tagged(rq)) + if (rq->cmd_flags & REQ_QUEUED) blk_queue_end_tag(q, rq); BUG_ON(blk_queued_rq(rq)); + if (rq->cmd_flags & REQ_URGENT) { + /* + * It's not compliant with the design to re-insert + * urgent requests. We want to be able to track this + * down. 
+ */ + pr_err("%s(): reinserting an URGENT request", __func__); + WARN_ON(!q->dispatched_urgent); + q->dispatched_urgent = false; + } return elv_reinsert_request(q, rq); } @@ -2226,6 +2250,10 @@ struct request *blk_peek_request(struct request_queue *q) * not be passed by new incoming requests */ rq->cmd_flags |= REQ_STARTED; + if (rq->cmd_flags & REQ_URGENT) { + WARN_ON(q->dispatched_urgent); + q->dispatched_urgent = true; + } trace_block_rq_issue(q, rq); } diff --git a/block/elevator.c b/block/elevator.c index 445718cc6..a4d6e54fe 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -786,10 +786,10 @@ void elv_completed_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (test_bit(REQ_ATOM_URGENT, &rq->atomic_flags)) { + if (rq->cmd_flags & REQ_URGENT) { q->notified_urgent = false; + WARN_ON(!q->dispatched_urgent); q->dispatched_urgent = false; - blk_clear_rq_urgent(rq); } /* * request is released from the driver, io must be done diff --git a/block/row-iosched.c b/block/row-iosched.c index 666f4db7f..9d4ee917e 100644 --- a/block/row-iosched.c +++ b/block/row-iosched.c @@ -1,7 +1,7 @@ /* * ROW (Read Over Write) I/O scheduler. * - * Copyright (c) 2012-2013, The Linux Foundation. All rights reserved. + * Copyright (c) 2012-2014, The Linux Foundation. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -331,10 +331,6 @@ static void row_add_request(struct request_queue *q, struct row_queue *rqueue = RQ_ROWQ(rq); s64 diff_ms; bool queue_was_empty = list_empty(&rqueue->fifo); - unsigned long bv_page_flags = 0; - - if (rq->bio && rq->bio->bi_io_vec && rq->bio->bi_io_vec->bv_page) - bv_page_flags = rq->bio->bi_io_vec->bv_page->flags; list_add_tail(&rq->queuelist, &rqueue->fifo); rd->nr_reqs[rq_data_dir(rq)]++; @@ -367,9 +363,7 @@ static void row_add_request(struct request_queue *q, rqueue->idle_data.begin_idling = false; return; } - - if ((bv_page_flags & (1L << PG_readahead)) || - (diff_ms < rd->rd_idle_data.freq_ms)) { + if (diff_ms < rd->rd_idle_data.freq_ms) { rqueue->idle_data.begin_idling = true; row_log_rowq(rd, rqueue->prio, "Enable idling"); } else { @@ -806,7 +800,6 @@ static int row_init_queue(struct request_queue *q, struct elevator_type *e) return -ENOMEM; } eq->elevator_data = rdata; - memset(rdata, 0, sizeof(*rdata)); for (i = 0; i < ROWQ_MAX_PRIO; i++) { INIT_LIST_HEAD(&rdata->row_queues[i].fifo); @@ -837,6 +830,7 @@ static int row_init_queue(struct request_queue *q, struct elevator_type *e) rdata->last_served_ioprio_class = IOPRIO_CLASS_NONE; rdata->rd_idle_data.idling_queue_idx = ROWQ_MAX_PRIO; rdata->dispatch_queue = q; + spin_lock_irq(q->queue_lock); q->elevator = eq; spin_unlock_irq(q->queue_lock); |
