Merge tag 'for-5.9/block-20200802' of git://git.kernel.dk/linux-block

Pull core block updates from Jens Axboe:
 "Good amount of cleanups and tech debt removals in here, and as a
  result, the diffstat shows a nice net reduction in code.

   - Softirq completion cleanups (Christoph)

   - Stop using ->queuedata (Christoph)

   - Cleanup bd claiming (Christoph)

   - Use check_events, moving away from the legacy media change
     (Christoph)

   - Use inode i_blkbits consistently (Christoph)

   - Remove old unused writeback congestion bits (Christoph)

   - Cleanup/unify submission path (Christoph)

   - Use bio_uninit consistently, instead of bio_disassociate_blkg
     (Christoph)

   - sbitmap cleared bits handling (John)

   - Request merging blktrace event addition (Jan)

   - sysfs add/remove race fixes (Luis)

   - blk-mq tag fixes/optimizations (Ming)

   - Duplicate words in comments (Randy)

   - Flush deferral cleanup (Yufen)

   - IO context locking/retry fixes (John)

   - struct_size() usage (Gustavo)

   - blk-iocost fixes (Chengming)

   - blk-cgroup IO stats fixes (Boris)

   - Various little fixes"

* tag 'for-5.9/block-20200802' of git://git.kernel.dk/linux-block: (135 commits)
  block: blk-timeout: delete duplicated word
  block: blk-mq-sched: delete duplicated word
  block: blk-mq: delete duplicated word
  block: genhd: delete duplicated words
  block: elevator: delete duplicated word and fix typos
  block: bio: delete duplicated words
  block: bfq-iosched: fix duplicated word
  iocost_monitor: start from the oldest usage index
  iocost: Fix check condition of iocg abs_vdebt
  block: Remove callback typedefs for blk_mq_ops
  block: Use non _rcu version of list functions for tag_set_list
  blk-cgroup: show global disk stats in root cgroup io.stat
  blk-cgroup: make iostat functions visible to stat printing
  block: improve discard bio alignment in __blkdev_issue_discard()
  block: change REQ_OP_ZONE_RESET and REQ_OP_ZONE_RESET_ALL to be odd numbers
  block: defer flush request no matter whether we have elevator
  block: make blk_timeout_init() static
  block: remove retry loop in ioc_release_fn()
  block: remove unnecessary ioc nested locking
  block: integrate bd_start_claiming into __blkdev_get
  ...
This commit is contained in:
Linus Torvalds
2020-08-03 11:57:03 -07:00
157 changed files with 1839 additions and 2648 deletions

View File

@@ -41,6 +41,8 @@
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
@@ -275,26 +277,20 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct request *rq = tags->static_rqs[tag];
req_flags_t rq_flags = 0;
if (data->flags & BLK_MQ_REQ_INTERNAL) {
if (data->q->elevator) {
rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = tag;
} else {
if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
rq_flags = RQF_MQ_INFLIGHT;
atomic_inc(&data->hctx->nr_active);
}
rq->tag = tag;
rq->internal_tag = BLK_MQ_NO_TAG;
data->hctx->tags->rqs[rq->tag] = rq;
}
/* csd/requeue_work/fifo_time is initialized before use */
rq->q = data->q;
rq->mq_ctx = data->ctx;
rq->mq_hctx = data->hctx;
rq->rq_flags = rq_flags;
rq->rq_flags = 0;
rq->cmd_flags = data->cmd_flags;
if (data->flags & BLK_MQ_REQ_PREEMPT)
rq->rq_flags |= RQF_PREEMPT;
@@ -362,8 +358,6 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
data->flags |= BLK_MQ_REQ_NOWAIT;
if (e) {
data->flags |= BLK_MQ_REQ_INTERNAL;
/*
* Flush requests are special and go directly to the
* dispatch list. Don't include reserved tags in the
@@ -378,7 +372,7 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
retry:
data->ctx = blk_mq_get_ctx(q);
data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
if (!(data->flags & BLK_MQ_REQ_INTERNAL))
if (!e)
blk_mq_tag_busy(data->hctx);
/*
@@ -394,7 +388,7 @@ retry:
/*
* Give up the CPU and sleep for a random short time to ensure
* that thread using a realtime scheduling class are migrated
* off the the CPU, and thus off the hctx that is going away.
* off the CPU, and thus off the hctx that is going away.
*/
msleep(3);
goto retry;
@@ -474,9 +468,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
data.ctx = __blk_mq_get_ctx(q, cpu);
if (q->elevator)
data.flags |= BLK_MQ_REQ_INTERNAL;
else
if (!q->elevator)
blk_mq_tag_busy(data.hctx);
ret = -EWOULDBLOCK;
@@ -552,8 +544,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
blk_stat_add(rq, now);
}
if (rq->internal_tag != BLK_MQ_NO_TAG)
blk_mq_sched_completed_request(rq, now);
blk_mq_sched_completed_request(rq, now);
blk_account_io_done(rq, now);
@@ -574,71 +565,139 @@ void blk_mq_end_request(struct request *rq, blk_status_t error)
}
EXPORT_SYMBOL(blk_mq_end_request);
/*
* Softirq action handler - move entries to local list and loop over them
* while passing them to the queue registered handler.
*/
static __latent_entropy void blk_done_softirq(struct softirq_action *h)
{
struct list_head *cpu_list, local_list;
local_irq_disable();
cpu_list = this_cpu_ptr(&blk_cpu_done);
list_replace_init(cpu_list, &local_list);
local_irq_enable();
while (!list_empty(&local_list)) {
struct request *rq;
rq = list_entry(local_list.next, struct request, ipi_list);
list_del_init(&rq->ipi_list);
rq->q->mq_ops->complete(rq);
}
}
static void blk_mq_trigger_softirq(struct request *rq)
{
struct list_head *list;
unsigned long flags;
local_irq_save(flags);
list = this_cpu_ptr(&blk_cpu_done);
list_add_tail(&rq->ipi_list, list);
/*
* If the list only contains our just added request, signal a raise of
* the softirq. If there are already entries there, someone already
* raised the irq but it hasn't run yet.
*/
if (list->next == &rq->ipi_list)
raise_softirq_irqoff(BLOCK_SOFTIRQ);
local_irq_restore(flags);
}
static int blk_softirq_cpu_dead(unsigned int cpu)
{
/*
* If a CPU goes away, splice its entries to the current CPU
* and trigger a run of the softirq
*/
local_irq_disable();
list_splice_init(&per_cpu(blk_cpu_done, cpu),
this_cpu_ptr(&blk_cpu_done));
raise_softirq_irqoff(BLOCK_SOFTIRQ);
local_irq_enable();
return 0;
}
static void __blk_mq_complete_request_remote(void *data)
{
struct request *rq = data;
struct request_queue *q = rq->q;
q->mq_ops->complete(rq);
/*
* For most of single queue controllers, there is only one irq vector
* for handling I/O completion, and the only irq's affinity is set
* to all possible CPUs. On most of ARCHs, this affinity means the irq
* is handled on one specific CPU.
*
* So complete I/O requests in softirq context in case of single queue
* devices to avoid degrading I/O performance due to irqsoff latency.
*/
if (rq->q->nr_hw_queues == 1)
blk_mq_trigger_softirq(rq);
else
rq->q->mq_ops->complete(rq);
}
/**
* blk_mq_force_complete_rq() - Force complete the request, bypassing any error
* injection that could drop the completion.
* @rq: Request to be force completed
*
* Drivers should use blk_mq_complete_request() to complete requests in their
* normal IO path. For timeout error recovery, drivers may call this forced
* completion routine after they've reclaimed timed out requests to bypass
* potentially subsequent fake timeouts.
*/
void blk_mq_force_complete_rq(struct request *rq)
static inline bool blk_mq_complete_need_ipi(struct request *rq)
{
struct blk_mq_ctx *ctx = rq->mq_ctx;
struct request_queue *q = rq->q;
bool shared = false;
int cpu;
int cpu = raw_smp_processor_id();
if (!IS_ENABLED(CONFIG_SMP) ||
!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
return false;
/* same CPU or cache domain? Complete locally */
if (cpu == rq->mq_ctx->cpu ||
(!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
cpus_share_cache(cpu, rq->mq_ctx->cpu)))
return false;
/* don't try to IPI to an offline CPU */
return cpu_online(rq->mq_ctx->cpu);
}
bool blk_mq_complete_request_remote(struct request *rq)
{
WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
/*
* Most of single queue controllers, there is only one irq vector
* for handling IO completion, and the only irq's affinity is set
* as all possible CPUs. On most of ARCHs, this affinity means the
* irq is handled on one specific CPU.
*
* So complete IO reqeust in softirq context in case of single queue
* for not degrading IO performance by irqsoff latency.
*/
if (q->nr_hw_queues == 1) {
__blk_complete_request(rq);
return;
}
/*
* For a polled request, always complete locallly, it's pointless
* to redirect the completion.
*/
if ((rq->cmd_flags & REQ_HIPRI) ||
!test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) {
q->mq_ops->complete(rq);
return;
}
if (rq->cmd_flags & REQ_HIPRI)
return false;
cpu = get_cpu();
if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
shared = cpus_share_cache(cpu, ctx->cpu);
if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
if (blk_mq_complete_need_ipi(rq)) {
rq->csd.func = __blk_mq_complete_request_remote;
rq->csd.info = rq;
rq->csd.flags = 0;
smp_call_function_single_async(ctx->cpu, &rq->csd);
smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
} else {
q->mq_ops->complete(rq);
if (rq->q->nr_hw_queues > 1)
return false;
blk_mq_trigger_softirq(rq);
}
put_cpu();
return true;
}
EXPORT_SYMBOL_GPL(blk_mq_force_complete_rq);
EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
/**
* blk_mq_complete_request - end I/O on a request
* @rq: the request being processed
*
* Description:
* Complete a request by scheduling the ->complete_rq operation.
**/
void blk_mq_complete_request(struct request *rq)
{
if (!blk_mq_complete_request_remote(rq))
rq->q->mq_ops->complete(rq);
}
EXPORT_SYMBOL(blk_mq_complete_request);
static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
__releases(hctx->srcu)
@@ -660,23 +719,6 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
*srcu_idx = srcu_read_lock(hctx->srcu);
}
/**
* blk_mq_complete_request - end I/O on a request
* @rq: the request being processed
*
* Description:
* Ends all I/O on a request. It does not handle partial completions.
* The actual completion happens out-of-order, through a IPI handler.
**/
bool blk_mq_complete_request(struct request *rq)
{
if (unlikely(blk_should_fake_timeout(rq->q)))
return false;
blk_mq_force_complete_rq(rq);
return true;
}
EXPORT_SYMBOL(blk_mq_complete_request);
/**
* blk_mq_start_request - Start processing a request
* @rq: Pointer to request to be started
@@ -1052,6 +1094,45 @@ static inline unsigned int queued_to_index(unsigned int queued)
return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
}
static bool __blk_mq_get_driver_tag(struct request *rq)
{
struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
int tag;
blk_mq_tag_busy(rq->mq_hctx);
if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
bt = &rq->mq_hctx->tags->breserved_tags;
tag_offset = 0;
}
if (!hctx_may_queue(rq->mq_hctx, bt))
return false;
tag = __sbitmap_queue_get(bt);
if (tag == BLK_MQ_NO_TAG)
return false;
rq->tag = tag + tag_offset;
return true;
}
static bool blk_mq_get_driver_tag(struct request *rq)
{
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
return false;
if ((hctx->flags & BLK_MQ_F_TAG_SHARED) &&
!(rq->rq_flags & RQF_MQ_INFLIGHT)) {
rq->rq_flags |= RQF_MQ_INFLIGHT;
atomic_inc(&hctx->nr_active);
}
hctx->tags->rqs[rq->tag] = rq;
return true;
}
static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
int flags, void *key)
{
@@ -1204,25 +1285,70 @@ static void blk_mq_handle_zone_resource(struct request *rq,
__blk_mq_requeue_request(rq);
}
enum prep_dispatch {
PREP_DISPATCH_OK,
PREP_DISPATCH_NO_TAG,
PREP_DISPATCH_NO_BUDGET,
};
static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
bool need_budget)
{
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) {
blk_mq_put_driver_tag(rq);
return PREP_DISPATCH_NO_BUDGET;
}
if (!blk_mq_get_driver_tag(rq)) {
/*
* The initial allocation attempt failed, so we need to
* rerun the hardware queue when a tag is freed. The
* waitqueue takes care of that. If the queue is run
* before we add this entry back on the dispatch list,
* we'll re-run it below.
*/
if (!blk_mq_mark_tag_wait(hctx, rq)) {
/*
* All budgets not got from this function will be put
* together during handling partial dispatch
*/
if (need_budget)
blk_mq_put_dispatch_budget(rq->q);
return PREP_DISPATCH_NO_TAG;
}
}
return PREP_DISPATCH_OK;
}
/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
static void blk_mq_release_budgets(struct request_queue *q,
unsigned int nr_budgets)
{
int i;
for (i = 0; i < nr_budgets; i++)
blk_mq_put_dispatch_budget(q);
}
/*
* Returns true if we did some work AND can potentially do more.
*/
bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
bool got_budget)
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
unsigned int nr_budgets)
{
struct blk_mq_hw_ctx *hctx;
enum prep_dispatch prep;
struct request_queue *q = hctx->queue;
struct request *rq, *nxt;
bool no_tag = false;
int errors, queued;
blk_status_t ret = BLK_STS_OK;
bool no_budget_avail = false;
LIST_HEAD(zone_list);
if (list_empty(list))
return false;
WARN_ON(!list_is_singular(list) && got_budget);
/*
* Now process all the entries, sending them to the driver.
*/
@@ -1232,32 +1358,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
rq = list_first_entry(list, struct request, queuelist);
hctx = rq->mq_hctx;
if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
blk_mq_put_driver_tag(rq);
no_budget_avail = true;
WARN_ON_ONCE(hctx != rq->mq_hctx);
prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
if (prep != PREP_DISPATCH_OK)
break;
}
if (!blk_mq_get_driver_tag(rq)) {
/*
* The initial allocation attempt failed, so we need to
* rerun the hardware queue when a tag is freed. The
* waitqueue takes care of that. If the queue is run
* before we add this entry back on the dispatch list,
* we'll re-run it below.
*/
if (!blk_mq_mark_tag_wait(hctx, rq)) {
blk_mq_put_dispatch_budget(hctx);
/*
* For non-shared tags, the RESTART check
* will suffice.
*/
if (hctx->flags & BLK_MQ_F_TAG_SHARED)
no_tag = true;
break;
}
}
list_del_init(&rq->queuelist);
@@ -1274,31 +1378,35 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
bd.last = !blk_mq_get_driver_tag(nxt);
}
/*
* once the request is queued to lld, no need to cover the
* budget any more
*/
if (nr_budgets)
nr_budgets--;
ret = q->mq_ops->queue_rq(hctx, &bd);
if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
blk_mq_handle_dev_resource(rq, list);
switch (ret) {
case BLK_STS_OK:
queued++;
break;
} else if (ret == BLK_STS_ZONE_RESOURCE) {
case BLK_STS_RESOURCE:
case BLK_STS_DEV_RESOURCE:
blk_mq_handle_dev_resource(rq, list);
goto out;
case BLK_STS_ZONE_RESOURCE:
/*
* Move the request to zone_list and keep going through
* the dispatch list to find more requests the drive can
* accept.
*/
blk_mq_handle_zone_resource(rq, &zone_list);
if (list_empty(list))
break;
continue;
}
if (unlikely(ret != BLK_STS_OK)) {
break;
default:
errors++;
blk_mq_end_request(rq, BLK_STS_IOERR);
continue;
}
queued++;
} while (!list_empty(list));
out:
if (!list_empty(&zone_list))
list_splice_tail_init(&zone_list, list);
@@ -1310,6 +1418,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
*/
if (!list_empty(list)) {
bool needs_restart;
/* For non-shared tags, the RESTART check will suffice */
bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
(hctx->flags & BLK_MQ_F_TAG_SHARED);
bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET;
blk_mq_release_budgets(q, nr_budgets);
/*
* If we didn't flush the entire list, we could have told
@@ -1361,13 +1475,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
} else
blk_mq_update_dispatch_busy(hctx, false);
/*
* If the host/device is unable to accept more work, inform the
* caller of that.
*/
if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
return false;
return (queued + errors) != 0;
}
@@ -1896,11 +2003,11 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
if (q->elevator && !bypass_insert)
goto insert;
if (!blk_mq_get_dispatch_budget(hctx))
if (!blk_mq_get_dispatch_budget(q))
goto insert;
if (!blk_mq_get_driver_tag(rq)) {
blk_mq_put_dispatch_budget(hctx);
blk_mq_put_dispatch_budget(q);
goto insert;
}
@@ -2005,8 +2112,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
}
/**
* blk_mq_make_request - Create and send a request to block device.
* @q: Request queue pointer.
* blk_mq_submit_bio - Create and send a request to block device.
* @bio: Bio pointer.
*
* Builds up a request structure from @q and @bio and send to the device. The
@@ -2020,8 +2126,9 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
*
* Returns: Request queue cookie.
*/
blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_qc_t blk_mq_submit_bio(struct bio *bio)
{
struct request_queue *q = bio->bi_disk->queue;
const int is_sync = op_is_sync(bio->bi_opf);
const int is_flush_fua = op_is_flush(bio->bi_opf);
struct blk_mq_alloc_data data = {
@@ -2035,7 +2142,7 @@ blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_status_t ret;
blk_queue_bounce(q, &bio);
__blk_queue_split(q, &bio, &nr_segs);
__blk_queue_split(&bio, &nr_segs);
if (!bio_integrity_prep(bio))
goto queue_exit;
@@ -2146,7 +2253,7 @@ queue_exit:
blk_queue_exit(q);
return BLK_QC_T_NONE;
}
EXPORT_SYMBOL_GPL(blk_mq_make_request); /* only for request based dm */
EXPORT_SYMBOL_GPL(blk_mq_submit_bio); /* only for request based dm */
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx)
@@ -2792,7 +2899,7 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
struct blk_mq_tag_set *set = q->tag_set;
mutex_lock(&set->tag_list_lock);
list_del_rcu(&q->tag_set_list);
list_del(&q->tag_set_list);
if (list_is_singular(&set->tag_list)) {
/* just transitioned to unshared */
set->flags &= ~BLK_MQ_F_TAG_SHARED;
@@ -2819,7 +2926,7 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
}
if (set->flags & BLK_MQ_F_TAG_SHARED)
queue_set_hctx_shared(q, true);
list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
list_add_tail(&q->tag_set_list, &set->tag_list);
mutex_unlock(&set->tag_list_lock);
}
@@ -2886,7 +2993,7 @@ struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
{
struct request_queue *uninit_q, *q;
uninit_q = __blk_alloc_queue(set->numa_node);
uninit_q = blk_alloc_queue(set->numa_node);
if (!uninit_q)
return ERR_PTR(-ENOMEM);
uninit_q->queuedata = queuedata;
@@ -3760,6 +3867,15 @@ EXPORT_SYMBOL(blk_mq_rq_cpu);
static int __init blk_mq_init(void)
{
int i;
for_each_possible_cpu(i)
INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
"block/softirq:dead", NULL,
blk_softirq_cpu_dead);
cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
blk_mq_hctx_notify_dead);
cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",