// SPDX-License-Identifier: GPL-2.0
/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/list_sort.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"

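/*
 * Look up (or create) the io_cq of the current task on this queue and
 * attach it to @rq for use by the I/O scheduler.
 */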
void blk_mq_sched_assign_ioc(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct io_context *ioc;
	struct io_cq *icq;

	/*
	 * May not have an IO context if it's a passthrough request
	 */
	ioc = current->io_context;
	if (!ioc)
		return;

	spin_lock_irq(&q->queue_lock);
	icq = ioc_lookup_icq(ioc, q);
	spin_unlock_irq(&q->queue_lock);

	if (!icq) {
		icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
		if (!icq)
			return;
	}
	get_io_context(icq->ioc);
	rq->elv.icq = icq;
}

/*
 * Mark a hardware queue as needing a restart.  blk_mq_sched_restart() will
 * re-run the queue once it observes the flag set and clears it.
 */
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
		return;

	set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);

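/*
 * If BLK_MQ_S_SCHED_RESTART is set on @hctx, clear it and run the hardware
 * queue again (asynchronously).
 */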
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
		return;
	clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);

	/*
	 * Order clearing SCHED_RESTART and list_empty_careful(&hctx->dispatch)
	 * in blk_mq_run_hw_queue(). Its pair is the barrier in
	 * blk_mq_dispatch_rq_list(). Without the barrier, the dispatch side
	 * could miss SCHED_RESTART while a request newly added to
	 * hctx->dispatch escapes the list_empty check in
	 * blk_mq_run_hw_queue().
	 */
	smp_mb();

	blk_mq_run_hw_queue(hctx, true);
}

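/*
 * list_sort() comparator: order requests by their hardware queue pointer so
 * that requests belonging to the same hctx end up adjacent in the list.
 */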
static int sched_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct request *rqa = container_of(a, struct request, queuelist);
	struct request *rqb = container_of(b, struct request, queuelist);

	return rqa->mq_hctx > rqb->mq_hctx;
}

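/*
 * Cut the leading run of requests that share the first entry's hctx off
 * @rq_list and hand them to blk_mq_dispatch_rq_list() as one batch.
 * Returns the result of that dispatch.
 */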
static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
{
	struct blk_mq_hw_ctx *hctx =
		list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
	struct request *rq;
	LIST_HEAD(hctx_list);
	unsigned int count = 0;

	list_for_each_entry(rq, rq_list, queuelist) {
		if (rq->mq_hctx != hctx) {
			list_cut_before(&hctx_list, rq_list, &rq->queuelist);
			goto dispatch;
		}
		count++;
	}
	list_splice_tail_init(rq_list, &hctx_list);

dispatch:
	return blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
}

#define BLK_MQ_BUDGET_DELAY	3		/* ms units */

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart the queue if .get_budget() returns BLK_STS_NO_RESOURCE.
 *
 * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
 * be run again.  This is necessary to avoid starving flushes.
 */
static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	bool multi_hctxs = false, run_queue = false;
	bool dispatched = false, busy = false;
	unsigned int max_dispatch;
	LIST_HEAD(rq_list);
	int count = 0;

	if (hctx->dispatch_busy)
		max_dispatch = 1;
	else
		max_dispatch = hctx->queue->nr_requests;

	do {
		struct request *rq;

		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
			break;

		if (!list_empty_careful(&hctx->dispatch)) {
			busy = true;
			break;
		}

		if (!blk_mq_get_dispatch_budget(q))
			break;

		rq = e->type->ops.dispatch_request(hctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(q);
			/*
			 * We're releasing without dispatching. Holding the
			 * budget could have blocked any "hctx"s with the
			 * same queue and if we didn't dispatch then there's
			 * no guarantee anyone will kick the queue.  Kick it
			 * ourselves.
			 */
			run_queue = true;
			break;
		}

		/*
		 * Now this rq owns the budget which has to be released
		 * if this rq won't be queued to driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
		 */
		list_add_tail(&rq->queuelist, &rq_list);
		if (rq->mq_hctx != hctx)
			multi_hctxs = true;
	} while (++count < max_dispatch);

	if (!count) {
		if (run_queue)
			blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
	} else if (multi_hctxs) {
		/*
		 * Some schedulers, such as bfq and deadline, may hand back
		 * requests that belong to different hctxs.
		 *
		 * Sort the requests in the list by hctx and dispatch them in
		 * per-hctx batches.
		 */
		list_sort(NULL, &rq_list, sched_rq_cmp);
		do {
			dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
		} while (!list_empty(&rq_list));
	} else {
		dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count);
	}

	if (busy)
		return -EAGAIN;
	return !!dispatched;
}

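/*
 * Keep calling __blk_mq_do_dispatch_sched() for as long as it keeps
 * dispatching (returns 1).  If that goes on for about a second or the task
 * needs to reschedule, punt the remaining work to an async queue run so we
 * don't monopolize the CPU.
 */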
static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
	unsigned long end = jiffies + HZ;
	int ret;

	do {
		ret = __blk_mq_do_dispatch_sched(hctx);
		if (ret != 1)
			break;
		if (need_resched() || time_is_before_jiffies(end)) {
			blk_mq_delay_run_hw_queue(hctx, 0);
			break;
		}
	} while (1);

	return ret;
}

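/*
 * Return the software queue that follows @ctx in this hctx's ctx array,
 * wrapping around at the end.  Used for round-robin dispatch.
 */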
static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
					  struct blk_mq_ctx *ctx)
{
	unsigned short idx = ctx->index_hw[hctx->type];

	if (++idx == hctx->nr_ctx)
		idx = 0;

	return hctx->ctxs[idx];
}

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart the queue if .get_budget() returns BLK_STS_NO_RESOURCE.
 *
 * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
 * be run again.  This is necessary to avoid starving flushes.
 */
static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	LIST_HEAD(rq_list);
	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
	int ret = 0;
	struct request *rq;

	do {
		if (!list_empty_careful(&hctx->dispatch)) {
			ret = -EAGAIN;
			break;
		}

		if (!sbitmap_any_bit_set(&hctx->ctx_map))
			break;

		if (!blk_mq_get_dispatch_budget(q))
			break;

		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(q);
			/*
			 * We're releasing without dispatching. Holding the
			 * budget could have blocked any "hctx"s with the
			 * same queue and if we didn't dispatch then there's
			 * no guarantee anyone will kick the queue.  Kick it
			 * ourselves.
			 */
			blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
			break;
		}

		/*
		 * Now this rq owns the budget which has to be released
		 * if this rq won't be queued to driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
		 */
		list_add(&rq->queuelist, &rq_list);

		/* round robin for fair dispatch */
		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);

	} while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1));

	WRITE_ONCE(hctx->dispatch_from, ctx);
	return ret;
}

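/*
 * Core of the dispatch path: drain any leftovers on hctx->dispatch first,
 * then pull new requests from the elevator or from the software queues.
 * Returns -EAGAIN if hctx->dispatch was found non-empty and the queue has
 * to be run again.
 */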
static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
	int ret = 0;
	LIST_HEAD(rq_list);

	/*
	 * If we have previous entries on our dispatch list, grab them first for
	 * more fair dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Only ask the scheduler for requests if we didn't have residual
	 * requests from the dispatch list.  This is to avoid the case where
	 * we only ever dispatch a fraction of the requests available because
	 * of low device queue depth.  Once we pull requests out of the IO
	 * scheduler, we can no longer merge or sort them.  So it's best to
	 * leave them there for as long as we can.  Mark the hw queue as
	 * needing a restart in that case.
	 *
	 * We want to dispatch from the scheduler if there was nothing
	 * on the dispatch list or we were able to dispatch from the
	 * dispatch list.
	 */
	if (!list_empty(&rq_list)) {
		blk_mq_sched_mark_restart_hctx(hctx);
		if (blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) {
			if (has_sched_dispatch)
				ret = blk_mq_do_dispatch_sched(hctx);
			else
				ret = blk_mq_do_dispatch_ctx(hctx);
		}
	} else if (has_sched_dispatch) {
		ret = blk_mq_do_dispatch_sched(hctx);
	} else if (hctx->dispatch_busy) {
		/* dequeue requests one by one from sw queues if the hw queue is busy */
		ret = blk_mq_do_dispatch_ctx(hctx);
	} else {
		blk_mq_flush_busy_ctxs(hctx, &rq_list);
		blk_mq_dispatch_rq_list(hctx, &rq_list, 0);
	}

	return ret;
}

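/*
 * Run the dispatch logic for @hctx unless the hardware queue is stopped or
 * the request queue is quiesced.  A first -EAGAIN from
 * __blk_mq_sched_dispatch_requests() is retried synchronously; a second one
 * punts to an async queue run.
 */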
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;

	/* RCU or SRCU read lock is needed before checking quiesced flag */
	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
		return;

	hctx->run++;

	/*
	 * A return of -EAGAIN is an indication that hctx->dispatch is not
	 * empty and we must run again in order to avoid starving flushes.
	 */
	if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) {
		if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN)
			blk_mq_run_hw_queue(hctx, true);
	}
}

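/*
 * Try to merge @bio into an already-queued request.  If an elevator with a
 * ->bio_merge() hook is attached, delegate to it; otherwise attempt a merge
 * against the per-CPU software queue.  Returns true if the bio was merged.
 */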
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
		unsigned int nr_segs)
{
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx;
	struct blk_mq_hw_ctx *hctx;
	bool ret = false;
	enum hctx_type type;

	if (e && e->type->ops.bio_merge)
		return e->type->ops.bio_merge(q, bio, nr_segs);

	ctx = blk_mq_get_ctx(q);
	hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
	type = hctx->type;
	if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
	    list_empty_careful(&ctx->rq_lists[type]))
		return false;

	/* default per sw-queue merge */
	spin_lock(&ctx->lock);
	/*
	 * Reverse check our software queue for entries that we could
	 * potentially merge with. Currently includes a hand-wavy stop
	 * count of 8, to not spend too much time checking for merges.
	 */
	if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
		ctx->rq_merged++;
		ret = true;
	}

	spin_unlock(&ctx->lock);

	return ret;
}

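/*
 * Attempt to merge @rq with a request that is already queued, instead of
 * inserting it as a new request.
 */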
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
{
	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

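/* Trace hook: record that @rq has been inserted into a scheduler/sw queue. */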
void blk_mq_sched_request_inserted(struct request *rq)
{
	trace_block_rq_insert(rq->q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);

static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
				       bool has_sched,
				       struct request *rq)
{
	/*
	 * Dispatch flush and passthrough requests directly, bypassing the
	 * scheduler.
	 *
	 * A passthrough request has to be added to hctx->dispatch directly:
	 * the device may get into a state in which it cannot handle FS
	 * requests, so BLK_STS_RESOURCE keeps being returned and the FS
	 * requests pile up on hctx->dispatch, while a passthrough request
	 * may be exactly what is needed to recover the device.  If the
	 * passthrough request went to the scheduler queue it would never get
	 * a chance to be dispatched, because requests on hctx->dispatch are
	 * prioritized.
	 */
	if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq))
		return true;

	if (has_sched)
		rq->rq_flags |= RQF_SORTED;

	return false;
}

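/*
 * Insert @rq for later dispatch.  Flush and passthrough requests bypass the
 * scheduler and go straight onto hctx->dispatch; other requests are handed
 * to the elevator's ->insert_requests() hook or added to the software queue.
 * The hardware queue is then optionally run.
 */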
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
				 bool run_queue, bool async)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

	WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG));

	if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) {
		/*
		 * Normal IO requests are inserted into the scheduler queue or
		 * the sw queue, while flush requests are added directly to
		 * the dispatch list (hctx->dispatch).  Since there is at most
		 * one in-flight flush request per hw queue, it does not
		 * matter for correctness whether the flush goes to the head
		 * or the tail of the dispatch list.
		 *
		 * On NCQ hardware a flush is a non-NCQ command and cannot be
		 * queued while any normal IO (NCQ command) is in flight.
		 * Adding the flush to the front of hctx->dispatch tends to
		 * add some latency to that flush because of S_SCHED_RESTART,
		 * compared with adding it to the tail, but the extra delay
		 * increases the chance of flush merging, so fewer flush
		 * requests are issued to the controller.  It is observed that
		 * roughly 10% of the run time of blktests block/004 is saved
		 * on a disk attached to an AHCI/NCQ controller when the flush
		 * rq is added to the front of hctx->dispatch.
		 *
		 * So simply queue the flush rq at the front of hctx->dispatch;
		 * flush-intensive workloads benefit on NCQ hardware.
		 */
		at_head = (rq->rq_flags & RQF_FLUSH_SEQ) ? true : at_head;
		blk_mq_request_bypass_insert(rq, at_head, false);
		goto run;
	}

	if (e && e->type->ops.insert_requests) {
		LIST_HEAD(list);

		list_add(&rq->queuelist, &list);
		e->type->ops.insert_requests(hctx, &list, at_head);
	} else {
		spin_lock(&ctx->lock);
		__blk_mq_insert_request(hctx, rq, at_head);
		spin_unlock(&ctx->lock);
	}

run:
	if (run_queue)
		blk_mq_run_hw_queue(hctx, async);
}

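/*
 * Batch-insert the requests on @list (coming from plug flushing).  With an
 * elevator attached the whole list is handed to ->insert_requests();
 * otherwise try direct issue first and fall back to the software queue.
 */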
void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
				  struct blk_mq_ctx *ctx,
				  struct list_head *list, bool run_queue_async)
{
	struct elevator_queue *e;
	struct request_queue *q = hctx->queue;

	/*
	 * blk_mq_sched_insert_requests() is called from flush plug
	 * context only, so grab a q_usage_counter reference to prevent
	 * the queue from being released while we are working on it.
	 */
	percpu_ref_get(&q->q_usage_counter);

	e = hctx->queue->elevator;
	if (e && e->type->ops.insert_requests)
		e->type->ops.insert_requests(hctx, list, false);
	else {
		/*
		 * With no scheduler ('none') attached, try to issue the
		 * requests directly if the hw queue is not busy; this saves
		 * an extra enqueue and dequeue through the sw queue.
		 */
		if (!hctx->dispatch_busy && !e && !run_queue_async) {
			blk_mq_try_issue_list_directly(hctx, list);
			if (list_empty(list))
				goto out;
		}
		blk_mq_insert_requests(hctx, ctx, list);
	}

	blk_mq_run_hw_queue(hctx, run_queue_async);
 out:
	percpu_ref_put(&q->q_usage_counter);
}

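/* Free the scheduler requests and sched tag map of one hardware queue. */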
static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;

	if (hctx->sched_tags) {
		blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
		blk_mq_free_rq_map(hctx->sched_tags, flags);
		hctx->sched_tags = NULL;
	}
}

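/*
 * Allocate a scheduler tag map and the corresponding requests, sized to
 * q->nr_requests, for one hardware queue.
 */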
static int blk_mq_sched_alloc_tags(struct request_queue *q,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	struct blk_mq_tag_set *set = q->tag_set;
	/* Clear HCTX_SHARED so tags are init'ed */
	unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
	int ret;

	hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
					       set->reserved_tags, flags);
	if (!hctx->sched_tags)
		return -ENOMEM;

	ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
	if (ret)
		blk_mq_sched_free_tags(set, hctx, hctx_idx);

	return ret;
}

/* called in queue's release handler, tagset has gone away */
static void blk_mq_sched_tags_teardown(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		/* Clear HCTX_SHARED so tags are freed */
		unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;

		if (hctx->sched_tags) {
			blk_mq_free_rq_map(hctx->sched_tags, flags);
			hctx->sched_tags = NULL;
		}
	}
}

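/*
 * Attach elevator @e to @q: size q->nr_requests, allocate per-hctx scheduler
 * tags and requests, then run the elevator's init_sched() and init_hctx()
 * hooks.  Everything is torn down again on failure.
 */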
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct blk_mq_hw_ctx *hctx;
	struct elevator_queue *eq;
	unsigned int i;
	int ret;

	if (!e) {
		q->elevator = NULL;
		q->nr_requests = q->tag_set->queue_depth;
		return 0;
	}

	/*
	 * Default to twice the smaller of the hardware queue depth and
	 * 128 (BLKDEV_MAX_RQ), since we no longer split into separate
	 * sync/async pools like the legacy code did.  Note that this is a
	 * per-hw-queue depth.
	 */
	q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
				   BLKDEV_MAX_RQ);

	queue_for_each_hw_ctx(q, hctx, i) {
		ret = blk_mq_sched_alloc_tags(q, hctx, i);
		if (ret)
			goto err;
	}

	ret = e->ops.init_sched(q, e);
	if (ret)
		goto err;

	blk_mq_debugfs_register_sched(q);

	queue_for_each_hw_ctx(q, hctx, i) {
		if (e->ops.init_hctx) {
			ret = e->ops.init_hctx(hctx, i);
			if (ret) {
				eq = q->elevator;
				blk_mq_sched_free_requests(q);
				blk_mq_exit_sched(q, eq);
				kobject_put(&eq->kobj);
				return ret;
			}
		}
		blk_mq_debugfs_register_sched_hctx(q, hctx);
	}

	return 0;

err:
	blk_mq_sched_free_requests(q);
	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
	return ret;
}

/*
 * Called from either blk_queue_cleanup or elevator_switch; the tag set is
 * still required for freeing the requests.
 */
void blk_mq_sched_free_requests(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (hctx->sched_tags)
			blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
	}
}

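/*
 * Detach the elevator from @q: run the exit_hctx() and exit_sched() hooks,
 * unregister the debugfs entries and free the scheduler tag maps.
 */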
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		blk_mq_debugfs_unregister_sched_hctx(hctx);
		if (e->type->ops.exit_hctx && hctx->sched_data) {
			e->type->ops.exit_hctx(hctx, i);
			hctx->sched_data = NULL;
		}
	}
	blk_mq_debugfs_unregister_sched(q);
	if (e->type->ops.exit_sched)
		e->type->ops.exit_sched(e);
	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
}