mirror of
https://github.com/tbsdtv/linux_media.git
synced 2025-07-23 04:33:26 +02:00
In commit f689054aac
("percpu_counter: add percpu_counter_sum_all
interface") a race condition between a cpu dying and
percpu_counter_sum() iterating online CPUs was identified. The
solution was to iterate all possible CPUs for summation via
percpu_counter_sum_all().
We recently had a percpu_counter_sum() call in XFS trip over this
same race condition and it fired a debug assert because the
filesystem was unmounting and the counter *should* be zero just
before we destroy it. That was reported here:
https://lore.kernel.org/linux-kernel/20230314090649.326642-1-yebin@huaweicloud.com/
likely as a result of running generic/648 which exercises
filesystems in the presence of CPU online/offline events.
The solution to use percpu_counter_sum_all() is an awful one. We
use percpu counters and percpu_counter_sum() for accurate and
reliable threshold detection for space management, so a summation
race condition during these operations can result in overcommit of
available space and that may result in filesystem shutdowns.
As percpu_counter_sum_all() iterates all possible CPUs rather than
just those online or even those present, the mask can include CPUs
that aren't even installed in the machine or, in the case of
machines that can hot-plug CPU-capable nodes, CPUs that don't even
have physical sockets present in the machine.
Fundamentally, this race condition is caused by the CPU being
offlined being removed from the cpu_online_mask before the notifier
that cleans up per-cpu state is run. Hence percpu_counter_sum() will
not sum the count for a cpu currently being taken offline,
regardless of whether the notifier has run or not. This is
the root cause of the bug.
The percpu counter notifier iterates all the registered counters,
locks the counter and moves the percpu count to the global sum.
This is serialised against other operations that move the percpu
counter to the global sum as well as percpu_counter_sum() operations
that sum the percpu counts while holding the counter lock.
Hence the notifier is safe to run concurrently with sum operations,
and the only thing we actually need to care about is that
percpu_counter_sum() iterates dying CPUs. That's trivial to do,
and when there are no CPUs dying, it has no additional overhead except
for a cpumask_or() operation.
This change makes percpu_counter_sum() always do the right thing in
the presence of CPU hot unplug events and makes
percpu_counter_sum_all() unnecessary. This, in turn, means that
filesystems like XFS, ext4, and btrfs don't have to work out when
they should use percpu_counter_sum() vs percpu_counter_sum_all() in
their space accounting algorithms.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
289 lines
7.7 KiB
C
289 lines
7.7 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* Fast batching percpu counters.
|
|
*/
|
|
|
|
#include <linux/percpu_counter.h>
|
|
#include <linux/mutex.h>
|
|
#include <linux/init.h>
|
|
#include <linux/cpu.h>
|
|
#include <linux/module.h>
|
|
#include <linux/debugobjects.h>
|
|
|
|
#ifdef CONFIG_HOTPLUG_CPU
|
|
/*
 * List of initialised counters. Entries are added by __percpu_counter_init(),
 * removed by percpu_counter_destroy() and walked by percpu_counter_cpu_dead().
 */
static LIST_HEAD(percpu_counters);
|
|
/* Held around every addition to, removal from and walk of percpu_counters. */
static DEFINE_SPINLOCK(percpu_counters_lock);
|
|
#endif
|
|
|
|
#ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER
|
|
|
|
/*
 * Forward declaration: percpu_counter_fixup_free() refers to the descriptor
 * before its definition further down.
 */
static const struct debug_obj_descr percpu_counter_debug_descr;
|
|
|
|
static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
|
|
{
|
|
struct percpu_counter *fbc = addr;
|
|
|
|
switch (state) {
|
|
case ODEBUG_STATE_ACTIVE:
|
|
percpu_counter_destroy(fbc);
|
|
debug_object_free(fbc, &percpu_counter_debug_descr);
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/*
 * debugobjects descriptor for percpu counters, passed to every debug_object_*()
 * call in this file.
 */
static const struct debug_obj_descr percpu_counter_debug_descr = {
|
|
.name = "percpu_counter",
|
|
.fixup_free = percpu_counter_fixup_free,
|
|
};
|
|
|
|
static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
|
|
{
|
|
debug_object_init(fbc, &percpu_counter_debug_descr);
|
|
debug_object_activate(fbc, &percpu_counter_debug_descr);
|
|
}
|
|
|
|
static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
|
|
{
|
|
debug_object_deactivate(fbc, &percpu_counter_debug_descr);
|
|
debug_object_free(fbc, &percpu_counter_debug_descr);
|
|
}
|
|
|
|
#else /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */
|
|
/* No-op variant used when CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER is not set. */
static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
|
|
{ }
|
|
/* No-op variant used when CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER is not set. */
static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
|
|
{ }
|
|
#endif /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */
|
|
|
|
/*
 * Reset the counter to @amount.
 *
 * With fbc->lock held and interrupts disabled, the per-cpu delta of every
 * possible CPU is zeroed and the global count is set to @amount.
 */
void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
	unsigned long irq_flags;
	int i;

	raw_spin_lock_irqsave(&fbc->lock, irq_flags);
	for_each_possible_cpu(i)
		*per_cpu_ptr(fbc->counters, i) = 0;
	fbc->count = amount;
	raw_spin_unlock_irqrestore(&fbc->lock, irq_flags);
}
EXPORT_SYMBOL(percpu_counter_set);
|
|
|
|
/*
|
|
* local_irq_save() is needed to make the function irq safe:
|
|
* - The slow path would be ok as protected by an irq-safe spinlock.
|
|
* - this_cpu_add would be ok as it is irq-safe by definition.
|
|
* But:
|
|
* The decision slow path/fast path and the actual update must be atomic, too.
|
|
* Otherwise a call in process context could check the current values and
|
|
* decide that the fast path can be used. If now an interrupt occurs before
|
|
* the this_cpu_add(), and the interrupt updates this_cpu(*fbc->counters),
|
|
* then the this_cpu_add() that is executed after the interrupt has completed
|
|
* can produce values larger than "batch" or even overflows.
|
|
*/
|
|
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
|
|
{
|
|
s64 count;
|
|
unsigned long flags;
|
|
|
|
local_irq_save(flags);
|
|
count = __this_cpu_read(*fbc->counters) + amount;
|
|
if (abs(count) >= batch) {
|
|
raw_spin_lock(&fbc->lock);
|
|
fbc->count += count;
|
|
__this_cpu_sub(*fbc->counters, count - amount);
|
|
raw_spin_unlock(&fbc->lock);
|
|
} else {
|
|
this_cpu_add(*fbc->counters, amount);
|
|
}
|
|
local_irq_restore(flags);
|
|
}
|
|
EXPORT_SYMBOL(percpu_counter_add_batch);
|
|
|
|
/*
|
|
* For percpu_counter with a big batch, the devication of its count could
|
|
* be big, and there is requirement to reduce the deviation, like when the
|
|
* counter's batch could be runtime decreased to get a better accuracy,
|
|
* which can be achieved by running this sync function on each CPU.
|
|
*/
|
|
void percpu_counter_sync(struct percpu_counter *fbc)
|
|
{
|
|
unsigned long flags;
|
|
s64 count;
|
|
|
|
raw_spin_lock_irqsave(&fbc->lock, flags);
|
|
count = __this_cpu_read(*fbc->counters);
|
|
fbc->count += count;
|
|
__this_cpu_sub(*fbc->counters, count);
|
|
raw_spin_unlock_irqrestore(&fbc->lock, flags);
|
|
}
|
|
EXPORT_SYMBOL(percpu_counter_sync);
|
|
|
|
static s64 __percpu_counter_sum_mask(struct percpu_counter *fbc,
|
|
const struct cpumask *cpu_mask)
|
|
{
|
|
s64 ret;
|
|
int cpu;
|
|
unsigned long flags;
|
|
|
|
raw_spin_lock_irqsave(&fbc->lock, flags);
|
|
ret = fbc->count;
|
|
for_each_cpu_or(cpu, cpu_online_mask, cpu_mask) {
|
|
s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
|
|
ret += *pcount;
|
|
}
|
|
raw_spin_unlock_irqrestore(&fbc->lock, flags);
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Add up all the per-cpu counts, return the result. This is a more accurate
|
|
* but much slower version of percpu_counter_read_positive().
|
|
*
|
|
* We use the cpu mask of (cpu_online_mask | cpu_dying_mask) to capture sums
|
|
* from CPUs that are in the process of being taken offline. Dying cpus have
|
|
* been removed from the online mask, but may not have had the hotplug dead
|
|
* notifier called to fold the percpu count back into the global counter sum.
|
|
* By including dying CPUs in the iteration mask, we avoid this race condition
|
|
* so __percpu_counter_sum() just does the right thing when CPUs are being taken
|
|
* offline.
|
|
*/
|
|
s64 __percpu_counter_sum(struct percpu_counter *fbc)
|
|
{
|
|
|
|
return __percpu_counter_sum_mask(fbc, cpu_dying_mask);
|
|
}
|
|
EXPORT_SYMBOL(__percpu_counter_sum);
|
|
|
|
/*
|
|
* This is slower version of percpu_counter_sum as it traverses all possible
|
|
* cpus. Use this only in the cases where accurate data is needed in the
|
|
* presense of CPUs getting offlined.
|
|
*/
|
|
s64 percpu_counter_sum_all(struct percpu_counter *fbc)
|
|
{
|
|
return __percpu_counter_sum_mask(fbc, cpu_possible_mask);
|
|
}
|
|
EXPORT_SYMBOL(percpu_counter_sum_all);
|
|
|
|
/*
 * Initialise @fbc with a starting value of @amount.
 *
 * @gfp is passed to the per-cpu allocation and @key is set as the lockdep
 * class of the counter's lock. With CONFIG_HOTPLUG_CPU the counter is also
 * linked into the percpu_counters list under percpu_counters_lock.
 *
 * Returns 0 on success, or -ENOMEM when the per-cpu storage cannot be
 * allocated (the lock and the global count have been set up by then, but the
 * counter is neither debug-activated nor put on the list).
 */
int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
			  struct lock_class_key *key)
{
	unsigned long irq_flags __maybe_unused;

	raw_spin_lock_init(&fbc->lock);
	lockdep_set_class(&fbc->lock, key);
	fbc->count = amount;

	fbc->counters = alloc_percpu_gfp(s32, gfp);
	if (!fbc->counters)
		return -ENOMEM;

	debug_percpu_counter_activate(fbc);

#ifdef CONFIG_HOTPLUG_CPU
	/* Make the counter visible to percpu_counter_cpu_dead(). */
	INIT_LIST_HEAD(&fbc->list);
	spin_lock_irqsave(&percpu_counters_lock, irq_flags);
	list_add(&fbc->list, &percpu_counters);
	spin_unlock_irqrestore(&percpu_counters_lock, irq_flags);
#endif
	return 0;
}
EXPORT_SYMBOL(__percpu_counter_init);
|
|
|
|
void percpu_counter_destroy(struct percpu_counter *fbc)
|
|
{
|
|
unsigned long flags __maybe_unused;
|
|
|
|
if (!fbc->counters)
|
|
return;
|
|
|
|
debug_percpu_counter_deactivate(fbc);
|
|
|
|
#ifdef CONFIG_HOTPLUG_CPU
|
|
spin_lock_irqsave(&percpu_counters_lock, flags);
|
|
list_del(&fbc->list);
|
|
spin_unlock_irqrestore(&percpu_counters_lock, flags);
|
|
#endif
|
|
free_percpu(fbc->counters);
|
|
fbc->counters = NULL;
|
|
}
|
|
EXPORT_SYMBOL(percpu_counter_destroy);
|
|
|
|
/*
 * Exported batch size. Starts at 32 and is recomputed by
 * compute_batch_value() as max(32, 2 * number of online CPUs).
 */
int percpu_counter_batch __read_mostly = 32;
|
|
EXPORT_SYMBOL(percpu_counter_batch);
|
|
|
|
static int compute_batch_value(unsigned int cpu)
|
|
{
|
|
int nr = num_online_cpus();
|
|
|
|
percpu_counter_batch = max(32, nr*2);
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Hotplug callback registered for CPUHP_PERCPU_CNT_DEAD.
 *
 * Recomputes the batch size, then walks every counter on the percpu_counters
 * list and, under that counter's lock, moves the per-cpu delta of @cpu into
 * the global count and resets the delta to zero. Without CONFIG_HOTPLUG_CPU
 * the function body is empty.
 *
 * Always returns 0.
 */
static int percpu_counter_cpu_dead(unsigned int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
	struct percpu_counter *counter;

	compute_batch_value(cpu);

	spin_lock_irq(&percpu_counters_lock);
	list_for_each_entry(counter, &percpu_counters, list) {
		s32 *dead_delta;

		raw_spin_lock(&counter->lock);
		dead_delta = per_cpu_ptr(counter->counters, cpu);
		counter->count += *dead_delta;
		*dead_delta = 0;
		raw_spin_unlock(&counter->lock);
	}
	spin_unlock_irq(&percpu_counters_lock);
#endif
	return 0;
}
|
|
|
|
/*
|
|
* Compare counter against given value.
|
|
* Return 1 if greater, 0 if equal and -1 if less
|
|
*/
|
|
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
|
|
{
|
|
s64 count;
|
|
|
|
count = percpu_counter_read(fbc);
|
|
/* Check to see if rough count will be sufficient for comparison */
|
|
if (abs(count - rhs) > (batch * num_online_cpus())) {
|
|
if (count > rhs)
|
|
return 1;
|
|
else
|
|
return -1;
|
|
}
|
|
/* Need to use precise count */
|
|
count = percpu_counter_sum(fbc);
|
|
if (count > rhs)
|
|
return 1;
|
|
else if (count < rhs)
|
|
return -1;
|
|
else
|
|
return 0;
|
|
}
|
|
EXPORT_SYMBOL(__percpu_counter_compare);
|
|
|
|
/*
 * Register the CPU hotplug callbacks of this file: compute_batch_value() for
 * a dynamically allocated online state, and percpu_counter_cpu_dead() for
 * CPUHP_PERCPU_CNT_DEAD. A failed registration only triggers WARN_ON().
 *
 * Always returns 0.
 */
static int __init percpu_counter_startup(void)
{
	int err;

	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online",
				compute_batch_value, NULL);
	WARN_ON(err < 0);

	err = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD,
					"lib/percpu_cnt:dead", NULL,
					percpu_counter_cpu_dead);
	WARN_ON(err < 0);

	return 0;
}
module_init(percpu_counter_startup);
|