This adds a set of hooks that intercept the blk-mq path of allocating/inserting/issuing/completing requests, allowing us to develop a scheduler within that framework. We reuse the existing elevator scheduler API on the registration side, but augment it with scheduler flagging support for the blk-mq interface, and with a separate set of ops hooks for MQ devices. We split driver and scheduler tags, so we can run scheduling independently of device queue depth.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
parent 2af8cbe305
commit bd166ef183
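The per-request and per-queue hooks referenced throughout the diff below are reached through `e->type->ops.mq.*`, so an MQ-capable scheduler supplies an ops table at registration time and is flagged as a blk-mq scheduler. The following is a hypothetical sketch only, not part of this patch: the callback names and signatures are taken from the call sites in the diff, while the `example_*` functions and the `uses_mq` flag are assumptions for illustration.

```c
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/elevator.h>

/* Report to blk_mq_sched_has_work() whether a dispatch would find requests. */
static bool example_has_work(struct blk_mq_hw_ctx *hctx)
{
        return false;
}

/* Take ownership of requests; blk_mq_sched_request_inserted() traces them. */
static void example_insert_requests(struct blk_mq_hw_ctx *hctx,
                                    struct list_head *list, bool at_head)
{
}

/* Hand requests back for blk_mq_dispatch_rq_list() to issue. */
static void example_dispatch_requests(struct blk_mq_hw_ctx *hctx,
                                      struct list_head *rq_list)
{
}

static struct elevator_type example_mq_sched = {
        .ops.mq = {
                .insert_requests        = example_insert_requests,
                .dispatch_requests      = example_dispatch_requests,
                .has_work               = example_has_work,
        },
        .uses_mq        = true,         /* assumed: the blk-mq scheduler flag */
        .elevator_name  = "example",
        .elevator_owner = THIS_MODULE,
};
```

Registration would then go through the existing elevator API (elv_register()), per the commit message; only the ops table and the MQ flag differ from a legacy scheduler.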
block/blk-mq-sched.c
@@ -0,0 +1,368 @@
/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"

void blk_mq_sched_free_hctx_data(struct request_queue *q,
                                 void (*exit)(struct blk_mq_hw_ctx *))
{
        struct blk_mq_hw_ctx *hctx;
        int i;

        queue_for_each_hw_ctx(q, hctx, i) {
                if (exit && hctx->sched_data)
                        exit(hctx);
                kfree(hctx->sched_data);
                hctx->sched_data = NULL;
        }
}
EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);

int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
                                int (*init)(struct blk_mq_hw_ctx *),
                                void (*exit)(struct blk_mq_hw_ctx *))
{
        struct blk_mq_hw_ctx *hctx;
        int ret;
        int i;

        queue_for_each_hw_ctx(q, hctx, i) {
                hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node);
                if (!hctx->sched_data) {
                        ret = -ENOMEM;
                        goto error;
                }

                if (init) {
                        ret = init(hctx);
                        if (ret) {
                                /*
                                 * We don't want to give exit() a partially
                                 * initialized sched_data. init() must clean up
                                 * if it fails.
                                 */
                                kfree(hctx->sched_data);
                                hctx->sched_data = NULL;
                                goto error;
                        }
                }
        }

        return 0;
error:
        blk_mq_sched_free_hctx_data(q, exit);
        return ret;
}
EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data);

static void __blk_mq_sched_assign_ioc(struct request_queue *q,
                                      struct request *rq, struct io_context *ioc)
{
        struct io_cq *icq;

        spin_lock_irq(q->queue_lock);
        icq = ioc_lookup_icq(ioc, q);
        spin_unlock_irq(q->queue_lock);

        if (!icq) {
                icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
                if (!icq)
                        return;
        }

        rq->elv.icq = icq;
        if (!blk_mq_sched_get_rq_priv(q, rq)) {
                rq->rq_flags |= RQF_ELVPRIV;
                get_io_context(icq->ioc);
                return;
        }

        rq->elv.icq = NULL;
}

static void blk_mq_sched_assign_ioc(struct request_queue *q,
                                    struct request *rq, struct bio *bio)
{
        struct io_context *ioc;

        ioc = rq_ioc(bio);
        if (ioc)
                __blk_mq_sched_assign_ioc(q, rq, ioc);
}

struct request *blk_mq_sched_get_request(struct request_queue *q,
                                         struct bio *bio,
                                         unsigned int op,
                                         struct blk_mq_alloc_data *data)
{
        struct elevator_queue *e = q->elevator;
        struct blk_mq_hw_ctx *hctx;
        struct blk_mq_ctx *ctx;
        struct request *rq;
        const bool is_flush = op & (REQ_PREFLUSH | REQ_FUA);

        blk_queue_enter_live(q);
        ctx = blk_mq_get_ctx(q);
        hctx = blk_mq_map_queue(q, ctx->cpu);

        blk_mq_set_alloc_data(data, q, 0, ctx, hctx);

        if (e) {
                data->flags |= BLK_MQ_REQ_INTERNAL;

                /*
                 * Flush requests are special and go directly to the
                 * dispatch list.
                 */
                if (!is_flush && e->type->ops.mq.get_request) {
                        rq = e->type->ops.mq.get_request(q, op, data);
                        if (rq)
                                rq->rq_flags |= RQF_QUEUED;
                } else
                        rq = __blk_mq_alloc_request(data, op);
        } else {
                rq = __blk_mq_alloc_request(data, op);
                data->hctx->tags->rqs[rq->tag] = rq;
        }

        if (rq) {
                if (!is_flush) {
                        rq->elv.icq = NULL;
                        if (e && e->type->icq_cache)
                                blk_mq_sched_assign_ioc(q, rq, bio);
                }
                data->hctx->queued++;
                return rq;
        }

        blk_queue_exit(q);
        return NULL;
}

void blk_mq_sched_put_request(struct request *rq)
{
        struct request_queue *q = rq->q;
        struct elevator_queue *e = q->elevator;

        if (rq->rq_flags & RQF_ELVPRIV) {
                blk_mq_sched_put_rq_priv(rq->q, rq);
                if (rq->elv.icq) {
                        put_io_context(rq->elv.icq->ioc);
                        rq->elv.icq = NULL;
                }
        }

        if ((rq->rq_flags & RQF_QUEUED) && e && e->type->ops.mq.put_request)
                e->type->ops.mq.put_request(rq);
        else
                blk_mq_finish_request(rq);
}

void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
        struct elevator_queue *e = hctx->queue->elevator;
        LIST_HEAD(rq_list);

        if (unlikely(blk_mq_hctx_stopped(hctx)))
                return;

        hctx->run++;

        /*
         * If we have previous entries on our dispatch list, grab them first for
         * more fair dispatch.
         */
        if (!list_empty_careful(&hctx->dispatch)) {
                spin_lock(&hctx->lock);
                if (!list_empty(&hctx->dispatch))
                        list_splice_init(&hctx->dispatch, &rq_list);
                spin_unlock(&hctx->lock);
        }

        /*
         * Only ask the scheduler for requests, if we didn't have residual
         * requests from the dispatch list. This is to avoid the case where
         * we only ever dispatch a fraction of the requests available because
         * of low device queue depth. Once we pull requests out of the IO
         * scheduler, we can no longer merge or sort them. So it's best to
         * leave them there for as long as we can. Mark the hw queue as
         * needing a restart in that case.
         */
        if (list_empty(&rq_list)) {
                if (e && e->type->ops.mq.dispatch_requests)
                        e->type->ops.mq.dispatch_requests(hctx, &rq_list);
                else
                        blk_mq_flush_busy_ctxs(hctx, &rq_list);
        } else
                blk_mq_sched_mark_restart(hctx);

        blk_mq_dispatch_rq_list(hctx, &rq_list);
}

void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
                                   struct list_head *rq_list,
                                   struct request *(*get_rq)(struct blk_mq_hw_ctx *))
{
        do {
                struct request *rq;

                rq = get_rq(hctx);
                if (!rq)
                        break;

                list_add_tail(&rq->queuelist, rq_list);
        } while (1);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_move_to_dispatch);

bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio)
{
        struct request *rq;
        int ret;

        ret = elv_merge(q, &rq, bio);
        if (ret == ELEVATOR_BACK_MERGE) {
                if (!blk_mq_sched_allow_merge(q, rq, bio))
                        return false;
                if (bio_attempt_back_merge(q, rq, bio)) {
                        if (!attempt_back_merge(q, rq))
                                elv_merged_request(q, rq, ret);
                        return true;
                }
        } else if (ret == ELEVATOR_FRONT_MERGE) {
                if (!blk_mq_sched_allow_merge(q, rq, bio))
                        return false;
                if (bio_attempt_front_merge(q, rq, bio)) {
                        if (!attempt_front_merge(q, rq))
                                elv_merged_request(q, rq, ret);
                        return true;
                }
        }

        return false;
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);

bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
        struct elevator_queue *e = q->elevator;

        if (e->type->ops.mq.bio_merge) {
                struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
                struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);

                blk_mq_put_ctx(ctx);
                return e->type->ops.mq.bio_merge(hctx, bio);
        }

        return false;
}

bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
{
        return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

void blk_mq_sched_request_inserted(struct request *rq)
{
        trace_block_rq_insert(rq->q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);

bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
        if (rq->tag == -1) {
                rq->rq_flags |= RQF_SORTED;
                return false;
        }

        /*
         * If we already have a real request tag, send directly to
         * the dispatch list.
         */
        spin_lock(&hctx->lock);
        list_add(&rq->queuelist, &hctx->dispatch);
        spin_unlock(&hctx->lock);
        return true;
}
EXPORT_SYMBOL_GPL(blk_mq_sched_bypass_insert);

static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
                                   struct blk_mq_hw_ctx *hctx,
                                   unsigned int hctx_idx)
{
        if (hctx->sched_tags) {
                blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
                blk_mq_free_rq_map(hctx->sched_tags);
                hctx->sched_tags = NULL;
        }
}

int blk_mq_sched_setup(struct request_queue *q)
{
        struct blk_mq_tag_set *set = q->tag_set;
        struct blk_mq_hw_ctx *hctx;
        int ret, i;

        /*
         * Default to 256, since we don't split into sync/async like the
         * old code did. Additionally, this is a per-hw queue depth.
         */
        q->nr_requests = 2 * BLKDEV_MAX_RQ;

        /*
         * We're switching to using an IO scheduler, so setup the hctx
         * scheduler tags and switch the request map from the regular
         * tags to scheduler tags. First allocate what we need, so we
         * can safely fail and fallback, if needed.
         */
        ret = 0;
        queue_for_each_hw_ctx(q, hctx, i) {
                hctx->sched_tags = blk_mq_alloc_rq_map(set, i, q->nr_requests, 0);
                if (!hctx->sched_tags) {
                        ret = -ENOMEM;
                        break;
                }
                ret = blk_mq_alloc_rqs(set, hctx->sched_tags, i, q->nr_requests);
                if (ret)
                        break;
        }

        /*
         * If we failed, free what we did allocate
         */
        if (ret) {
                queue_for_each_hw_ctx(q, hctx, i) {
                        if (!hctx->sched_tags)
                                continue;
                        blk_mq_sched_free_tags(set, hctx, i);
                }

                return ret;
        }

        return 0;
}

void blk_mq_sched_teardown(struct request_queue *q)
{
        struct blk_mq_tag_set *set = q->tag_set;
        struct blk_mq_hw_ctx *hctx;
        int i;

        queue_for_each_hw_ctx(q, hctx, i)
                blk_mq_sched_free_tags(set, hctx, i);
}
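The blk_mq_sched_init_hctx_data()/blk_mq_sched_free_hctx_data() helpers above are exported so a scheduler can keep per-hardware-queue state in hctx->sched_data. A minimal hypothetical usage sketch, not part of this patch (all example_* names are made up; only the helper signatures come from this file):

```c
#include <linux/spinlock.h>
#include <linux/list.h>
#include "blk-mq-sched.h"

/* Per-hctx state: a simple FIFO of requests owned by the scheduler. */
struct example_hctx_data {
        spinlock_t lock;
        struct list_head rqs;
};

static int example_init_hctx(struct blk_mq_hw_ctx *hctx)
{
        /* sched_data was already allocated by blk_mq_sched_init_hctx_data() */
        struct example_hctx_data *d = hctx->sched_data;

        spin_lock_init(&d->lock);
        INIT_LIST_HEAD(&d->rqs);
        return 0;       /* on failure we would undo our own work; the helper frees sched_data */
}

static void example_exit_hctx(struct blk_mq_hw_ctx *hctx)
{
        /* nothing extra to undo; the helper kfree()s hctx->sched_data afterwards */
}

/* Called from the scheduler's own queue init/exit paths (hypothetical). */
static int example_init_queue_data(struct request_queue *q)
{
        return blk_mq_sched_init_hctx_data(q, sizeof(struct example_hctx_data),
                                           example_init_hctx, example_exit_hctx);
}

static void example_exit_queue_data(struct request_queue *q)
{
        blk_mq_sched_free_hctx_data(q, example_exit_hctx);
}
```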
block/blk-mq-sched.h
@@ -0,0 +1,170 @@
#ifndef BLK_MQ_SCHED_H
#define BLK_MQ_SCHED_H

#include "blk-mq.h"
#include "blk-mq-tag.h"

int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
                                int (*init)(struct blk_mq_hw_ctx *),
                                void (*exit)(struct blk_mq_hw_ctx *));

void blk_mq_sched_free_hctx_data(struct request_queue *q,
                                 void (*exit)(struct blk_mq_hw_ctx *));

struct request *blk_mq_sched_get_request(struct request_queue *q, struct bio *bio, unsigned int op, struct blk_mq_alloc_data *data);
void blk_mq_sched_put_request(struct request *rq);

void blk_mq_sched_request_inserted(struct request *rq);
bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq);
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio);
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);

void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
                                   struct list_head *rq_list,
                                   struct request *(*get_rq)(struct blk_mq_hw_ctx *));

int blk_mq_sched_setup(struct request_queue *q);
void blk_mq_sched_teardown(struct request_queue *q);

static inline bool
blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
        struct elevator_queue *e = q->elevator;

        if (!e || blk_queue_nomerges(q) || !bio_mergeable(bio))
                return false;

        return __blk_mq_sched_bio_merge(q, bio);
}

static inline int blk_mq_sched_get_rq_priv(struct request_queue *q,
                                           struct request *rq)
{
        struct elevator_queue *e = q->elevator;

        if (e && e->type->ops.mq.get_rq_priv)
                return e->type->ops.mq.get_rq_priv(q, rq);

        return 0;
}

static inline void blk_mq_sched_put_rq_priv(struct request_queue *q,
                                            struct request *rq)
{
        struct elevator_queue *e = q->elevator;

        if (e && e->type->ops.mq.put_rq_priv)
                e->type->ops.mq.put_rq_priv(q, rq);
}

static inline void
blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue,
                            bool async)
{
        struct request_queue *q = rq->q;
        struct elevator_queue *e = q->elevator;
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);

        if (e && e->type->ops.mq.insert_requests) {
                LIST_HEAD(list);

                list_add(&rq->queuelist, &list);
                e->type->ops.mq.insert_requests(hctx, &list, at_head);
        } else {
                spin_lock(&ctx->lock);
                __blk_mq_insert_request(hctx, rq, at_head);
                spin_unlock(&ctx->lock);
        }

        if (run_queue)
                blk_mq_run_hw_queue(hctx, async);
}

static inline void
blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx,
                             struct list_head *list, bool run_queue_async)
{
        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
        struct elevator_queue *e = hctx->queue->elevator;

        if (e && e->type->ops.mq.insert_requests)
                e->type->ops.mq.insert_requests(hctx, list, false);
        else
                blk_mq_insert_requests(hctx, ctx, list);

        blk_mq_run_hw_queue(hctx, run_queue_async);
}

static inline bool
blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
                         struct bio *bio)
{
        struct elevator_queue *e = q->elevator;

        if (e && e->type->ops.mq.allow_merge)
                return e->type->ops.mq.allow_merge(q, rq, bio);

        return true;
}

static inline void
blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
        struct elevator_queue *e = hctx->queue->elevator;

        if (e && e->type->ops.mq.completed_request)
                e->type->ops.mq.completed_request(hctx, rq);

        BUG_ON(rq->internal_tag == -1);

        blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag);

        if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
                clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
                blk_mq_run_hw_queue(hctx, true);
        }
}

static inline void blk_mq_sched_started_request(struct request *rq)
{
        struct request_queue *q = rq->q;
        struct elevator_queue *e = q->elevator;

        if (e && e->type->ops.mq.started_request)
                e->type->ops.mq.started_request(rq);
}

static inline void blk_mq_sched_requeue_request(struct request *rq)
{
        struct request_queue *q = rq->q;
        struct elevator_queue *e = q->elevator;

        if (e && e->type->ops.mq.requeue_request)
                e->type->ops.mq.requeue_request(rq);
}

static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
{
        struct elevator_queue *e = hctx->queue->elevator;

        if (e && e->type->ops.mq.has_work)
                return e->type->ops.mq.has_work(hctx);

        return false;
}

static inline void blk_mq_sched_mark_restart(struct blk_mq_hw_ctx *hctx)
{
        if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
                set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}

static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
{
        return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}

#endif