Merge tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The changes in this cycle are:

   - Optimize the task wakeup CPU selection logic, to improve
     scalability and reduce wakeup latency spikes

   - PELT enhancements

   - CFS bandwidth handling fixes

   - Optimize the wakeup path by remove rq->wake_list and replacing it
     with ->ttwu_pending

   - Optimize IPI cross-calls by making flush_smp_call_function_queue()
     process sync callbacks first.

   - Misc fixes and enhancements"

* tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits)
  irq_work: Define irq_work_single() on !CONFIG_IRQ_WORK too
  sched/headers: Split out open-coded prototypes into kernel/sched/smp.h
  sched: Replace rq::wake_list
  sched: Add rq::ttwu_pending
  irq_work, smp: Allow irq_work on call_single_queue
  smp: Optimize send_call_function_single_ipi()
  smp: Move irq_work_run() out of flush_smp_call_function_queue()
  smp: Optimize flush_smp_call_function_queue()
  sched: Fix smp_call_function_single_async() usage for ILB
  sched/core: Offload wakee task activation if it the wakee is descheduling
  sched/core: Optimize ttwu() spinning on p->on_cpu
  sched: Defend cfs and rt bandwidth quota against overflow
  sched/cpuacct: Fix charge cpuacct.usage_sys
  sched/fair: Replace zero-length array with flexible-array
  sched/pelt: Sync util/runnable_sum with PELT window when propagating
  sched/cpuacct: Use __this_cpu_add() instead of this_cpu_ptr()
  sched/fair: Optimize enqueue_task_fair()
  sched: Make scheduler_ipi inline
  sched: Clean up scheduler_ipi()
  sched/core: Simplify sched_init()
  ...
This commit is contained in:
Linus Torvalds
2020-06-03 13:06:42 -07:00
21 changed files with 609 additions and 414 deletions

View File

@@ -22,11 +22,9 @@
#include <linux/hypervisor.h>
#include "smpboot.h"
#include "sched/smp.h"
enum {
CSD_FLAG_LOCK = 0x01,
CSD_FLAG_SYNCHRONOUS = 0x02,
};
#define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK)
struct call_function_data {
call_single_data_t __percpu *csd;
@@ -84,6 +82,7 @@ int smpcfd_dying_cpu(unsigned int cpu)
* still pending.
*/
flush_smp_call_function_queue(false);
irq_work_run();
return 0;
}
@@ -134,15 +133,33 @@ static __always_inline void csd_unlock(call_single_data_t *csd)
static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
void __smp_call_single_queue(int cpu, struct llist_node *node)
{
/*
* The list addition should be visible before sending the IPI
* handler locks the list to pull the entry off it because of
* normal cache coherency rules implied by spinlocks.
*
* If IPIs can go out of order to the cache coherency protocol
* in an architecture, sufficient synchronisation should be added
* to arch code to make it appear to obey cache coherency WRT
* locking and barrier primitives. Generic code isn't really
* equipped to do the right thing...
*/
if (llist_add(node, &per_cpu(call_single_queue, cpu)))
send_call_function_single_ipi(cpu);
}
/*
* Insert a previously allocated call_single_data_t element
* for execution on the given CPU. data must already have
* ->func, ->info, and ->flags set.
*/
static int generic_exec_single(int cpu, call_single_data_t *csd,
smp_call_func_t func, void *info)
static int generic_exec_single(int cpu, call_single_data_t *csd)
{
if (cpu == smp_processor_id()) {
smp_call_func_t func = csd->func;
void *info = csd->info;
unsigned long flags;
/*
@@ -156,28 +173,12 @@ static int generic_exec_single(int cpu, call_single_data_t *csd,
return 0;
}
if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
csd_unlock(csd);
return -ENXIO;
}
csd->func = func;
csd->info = info;
/*
* The list addition should be visible before sending the IPI
* handler locks the list to pull the entry off it because of
* normal cache coherency rules implied by spinlocks.
*
* If IPIs can go out of order to the cache coherency protocol
* in an architecture, sufficient synchronisation should be added
* to arch code to make it appear to obey cache coherency WRT
* locking and barrier primitives. Generic code isn't really
* equipped to do the right thing...
*/
if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
arch_send_call_function_single_ipi(cpu);
__smp_call_single_queue(cpu, &csd->llist);
return 0;
}
@@ -209,9 +210,9 @@ void generic_smp_call_function_single_interrupt(void)
*/
static void flush_smp_call_function_queue(bool warn_cpu_offline)
{
struct llist_head *head;
struct llist_node *entry;
call_single_data_t *csd, *csd_next;
struct llist_node *entry, *prev;
struct llist_head *head;
static bool warned;
lockdep_assert_irqs_disabled();
@@ -230,32 +231,99 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
* We don't have to use the _safe() variant here
* because we are not invoking the IPI handlers yet.
*/
llist_for_each_entry(csd, entry, llist)
pr_warn("IPI callback %pS sent to offline CPU\n",
csd->func);
}
llist_for_each_entry(csd, entry, llist) {
switch (CSD_TYPE(csd)) {
case CSD_TYPE_ASYNC:
case CSD_TYPE_SYNC:
case CSD_TYPE_IRQ_WORK:
pr_warn("IPI callback %pS sent to offline CPU\n",
csd->func);
break;
llist_for_each_entry_safe(csd, csd_next, entry, llist) {
smp_call_func_t func = csd->func;
void *info = csd->info;
case CSD_TYPE_TTWU:
pr_warn("IPI task-wakeup sent to offline CPU\n");
break;
/* Do we wait until *after* callback? */
if (csd->flags & CSD_FLAG_SYNCHRONOUS) {
func(info);
csd_unlock(csd);
} else {
csd_unlock(csd);
func(info);
default:
pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
CSD_TYPE(csd));
break;
}
}
}
/*
* Handle irq works queued remotely by irq_work_queue_on().
* Smp functions above are typically synchronous so they
* better run first since some other CPUs may be busy waiting
* for them.
* First; run all SYNC callbacks, people are waiting for us.
*/
irq_work_run();
prev = NULL;
llist_for_each_entry_safe(csd, csd_next, entry, llist) {
/* Do we wait until *after* callback? */
if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
smp_call_func_t func = csd->func;
void *info = csd->info;
if (prev) {
prev->next = &csd_next->llist;
} else {
entry = &csd_next->llist;
}
func(info);
csd_unlock(csd);
} else {
prev = &csd->llist;
}
}
if (!entry)
return;
/*
* Second; run all !SYNC callbacks.
*/
prev = NULL;
llist_for_each_entry_safe(csd, csd_next, entry, llist) {
int type = CSD_TYPE(csd);
if (type != CSD_TYPE_TTWU) {
if (prev) {
prev->next = &csd_next->llist;
} else {
entry = &csd_next->llist;
}
if (type == CSD_TYPE_ASYNC) {
smp_call_func_t func = csd->func;
void *info = csd->info;
csd_unlock(csd);
func(info);
} else if (type == CSD_TYPE_IRQ_WORK) {
irq_work_single(csd);
}
} else {
prev = &csd->llist;
}
}
/*
* Third; only CSD_TYPE_TTWU is left, issue those.
*/
if (entry)
sched_ttwu_pending(entry);
}
void flush_smp_call_function_from_idle(void)
{
unsigned long flags;
if (llist_empty(this_cpu_ptr(&call_single_queue)))
return;
local_irq_save(flags);
flush_smp_call_function_queue(true);
local_irq_restore(flags);
}
/*
@@ -271,7 +339,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
{
call_single_data_t *csd;
call_single_data_t csd_stack = {
.flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS,
.flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC,
};
int this_cpu;
int err;
@@ -305,7 +373,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
csd_lock(csd);
}
err = generic_exec_single(cpu, csd, func, info);
csd->func = func;
csd->info = info;
err = generic_exec_single(cpu, csd);
if (wait)
csd_lock_wait(csd);
@@ -351,7 +422,7 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
csd->flags = CSD_FLAG_LOCK;
smp_wmb();
err = generic_exec_single(cpu, csd, csd->func, csd->info);
err = generic_exec_single(cpu, csd);
out:
preempt_enable();
@@ -466,7 +537,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
csd_lock(csd);
if (wait)
csd->flags |= CSD_FLAG_SYNCHRONOUS;
csd->flags |= CSD_TYPE_SYNC;
csd->func = func;
csd->info = info;
if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
@@ -598,6 +669,24 @@ void __init smp_init(void)
{
int num_nodes, num_cpus;
/*
* Ensure struct irq_work layout matches so that
* flush_smp_call_function_queue() can do horrible things.
*/
BUILD_BUG_ON(offsetof(struct irq_work, llnode) !=
offsetof(struct __call_single_data, llist));
BUILD_BUG_ON(offsetof(struct irq_work, func) !=
offsetof(struct __call_single_data, func));
BUILD_BUG_ON(offsetof(struct irq_work, flags) !=
offsetof(struct __call_single_data, flags));
/*
* Assert the CSD_TYPE_TTWU layout is similar enough
* for task_struct to be on the @call_single_queue.
*/
BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) !=
offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist));
idle_threads_init();
cpuhp_threads_init();