Merge branch 'akpm' (patches from Andrew)

Merge more updates from Andrew Morton:
 "147 patches, based on 7d2a07b769.

  Subsystems affected by this patch series: mm (memory-hotplug, rmap,
  ioremap, highmem, cleanups, secretmem, kfence, damon, and vmscan),
  alpha, percpu, procfs, misc, core-kernel, MAINTAINERS, lib,
  checkpatch, epoll, init, nilfs2, coredump, fork, pids, criu, kconfig,
  selftests, ipc, and scripts"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (94 commits)
  scripts: check_extable: fix typo in user error message
  mm/workingset: correct kernel-doc notations
  ipc: replace costly bailout check in sysvipc_find_ipc()
  selftests/memfd: remove unused variable
  Kconfig.debug: drop selecting non-existing HARDLOCKUP_DETECTOR_ARCH
  configs: remove the obsolete CONFIG_INPUT_POLLDEV
  prctl: allow to setup brk for et_dyn executables
  pid: cleanup the stale comment mentioning pidmap_init().
  kernel/fork.c: unexport get_{mm,task}_exe_file
  coredump: fix memleak in dump_vma_snapshot()
  fs/coredump.c: log if a core dump is aborted due to changed file permissions
  nilfs2: use refcount_dec_and_lock() to fix potential UAF
  nilfs2: fix memory leak in nilfs_sysfs_delete_snapshot_group
  nilfs2: fix memory leak in nilfs_sysfs_create_snapshot_group
  nilfs2: fix memory leak in nilfs_sysfs_delete_##name##_group
  nilfs2: fix memory leak in nilfs_sysfs_create_##name##_group
  nilfs2: fix NULL pointer in nilfs_##name##_attr_release
  nilfs2: fix memory leak in nilfs_sysfs_create_device_group
  trap: cleanup trap_init()
  init: move usermodehelper_enable() to populate_rootfs()
  ...
This commit is contained in:
Linus Torvalds
2021-09-08 12:55:35 -07:00
149 changed files with 5357 additions and 946 deletions

View File

@@ -52,6 +52,73 @@ module_param(memmap_on_memory, bool, 0444);
MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
#endif
enum {
ONLINE_POLICY_CONTIG_ZONES = 0,
ONLINE_POLICY_AUTO_MOVABLE,
};
const char *online_policy_to_str[] = {
[ONLINE_POLICY_CONTIG_ZONES] = "contig-zones",
[ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable",
};
static int set_online_policy(const char *val, const struct kernel_param *kp)
{
int ret = sysfs_match_string(online_policy_to_str, val);
if (ret < 0)
return ret;
*((int *)kp->arg) = ret;
return 0;
}
static int get_online_policy(char *buffer, const struct kernel_param *kp)
{
return sprintf(buffer, "%s\n", online_policy_to_str[*((int *)kp->arg)]);
}
/*
* memory_hotplug.online_policy: configure online behavior when onlining without
* specifying a zone (MMOP_ONLINE)
*
* "contig-zones": keep zone contiguous
* "auto-movable": online memory to ZONE_MOVABLE if the configuration
* (auto_movable_ratio, auto_movable_numa_aware) allows for it
*/
static int online_policy __read_mostly = ONLINE_POLICY_CONTIG_ZONES;
static const struct kernel_param_ops online_policy_ops = {
.set = set_online_policy,
.get = get_online_policy,
};
module_param_cb(online_policy, &online_policy_ops, &online_policy, 0644);
MODULE_PARM_DESC(online_policy,
"Set the online policy (\"contig-zones\", \"auto-movable\") "
"Default: \"contig-zones\"");
/*
* memory_hotplug.auto_movable_ratio: specify maximum MOVABLE:KERNEL ratio
*
* The ratio represent an upper limit and the kernel might decide to not
* online some memory to ZONE_MOVABLE -- e.g., because hotplugged KERNEL memory
* doesn't allow for more MOVABLE memory.
*/
static unsigned int auto_movable_ratio __read_mostly = 301;
module_param(auto_movable_ratio, uint, 0644);
MODULE_PARM_DESC(auto_movable_ratio,
"Set the maximum ratio of MOVABLE:KERNEL memory in the system "
"in percent for \"auto-movable\" online policy. Default: 301");
/*
* memory_hotplug.auto_movable_numa_aware: consider numa node stats
*/
#ifdef CONFIG_NUMA
static bool auto_movable_numa_aware __read_mostly = true;
module_param(auto_movable_numa_aware, bool, 0644);
MODULE_PARM_DESC(auto_movable_numa_aware,
"Consider numa node stats in addition to global stats in "
"\"auto-movable\" online policy. Default: true");
#endif /* CONFIG_NUMA */
/*
* online_page_callback contains pointer to current page onlining function.
* Initially it is generic_online_page(). If it is required it could be
@@ -410,15 +477,13 @@ void __ref remove_pfn_range_from_zone(struct zone *zone,
sizeof(struct page) * cur_nr_pages);
}
#ifdef CONFIG_ZONE_DEVICE
/*
* Zone shrinking code cannot properly deal with ZONE_DEVICE. So
* we will not try to shrink the zones - which is okay as
* set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
*/
if (zone_idx(zone) == ZONE_DEVICE)
if (zone_is_zone_device(zone))
return;
#endif
clear_zone_contiguous(zone);
@@ -663,6 +728,109 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
set_zone_contiguous(zone);
}
struct auto_movable_stats {
unsigned long kernel_early_pages;
unsigned long movable_pages;
};
static void auto_movable_stats_account_zone(struct auto_movable_stats *stats,
struct zone *zone)
{
if (zone_idx(zone) == ZONE_MOVABLE) {
stats->movable_pages += zone->present_pages;
} else {
stats->kernel_early_pages += zone->present_early_pages;
#ifdef CONFIG_CMA
/*
* CMA pages (never on hotplugged memory) behave like
* ZONE_MOVABLE.
*/
stats->movable_pages += zone->cma_pages;
stats->kernel_early_pages -= zone->cma_pages;
#endif /* CONFIG_CMA */
}
}
struct auto_movable_group_stats {
unsigned long movable_pages;
unsigned long req_kernel_early_pages;
};
static int auto_movable_stats_account_group(struct memory_group *group,
void *arg)
{
const int ratio = READ_ONCE(auto_movable_ratio);
struct auto_movable_group_stats *stats = arg;
long pages;
/*
* We don't support modifying the config while the auto-movable online
* policy is already enabled. Just avoid the division by zero below.
*/
if (!ratio)
return 0;
/*
* Calculate how many early kernel pages this group requires to
* satisfy the configured zone ratio.
*/
pages = group->present_movable_pages * 100 / ratio;
pages -= group->present_kernel_pages;
if (pages > 0)
stats->req_kernel_early_pages += pages;
stats->movable_pages += group->present_movable_pages;
return 0;
}
static bool auto_movable_can_online_movable(int nid, struct memory_group *group,
unsigned long nr_pages)
{
unsigned long kernel_early_pages, movable_pages;
struct auto_movable_group_stats group_stats = {};
struct auto_movable_stats stats = {};
pg_data_t *pgdat = NODE_DATA(nid);
struct zone *zone;
int i;
/* Walk all relevant zones and collect MOVABLE vs. KERNEL stats. */
if (nid == NUMA_NO_NODE) {
/* TODO: cache values */
for_each_populated_zone(zone)
auto_movable_stats_account_zone(&stats, zone);
} else {
for (i = 0; i < MAX_NR_ZONES; i++) {
zone = pgdat->node_zones + i;
if (populated_zone(zone))
auto_movable_stats_account_zone(&stats, zone);
}
}
kernel_early_pages = stats.kernel_early_pages;
movable_pages = stats.movable_pages;
/*
* Kernel memory inside dynamic memory group allows for more MOVABLE
* memory within the same group. Remove the effect of all but the
* current group from the stats.
*/
walk_dynamic_memory_groups(nid, auto_movable_stats_account_group,
group, &group_stats);
if (kernel_early_pages <= group_stats.req_kernel_early_pages)
return false;
kernel_early_pages -= group_stats.req_kernel_early_pages;
movable_pages -= group_stats.movable_pages;
if (group && group->is_dynamic)
kernel_early_pages += group->present_kernel_pages;
/*
* Test if we could online the given number of pages to ZONE_MOVABLE
* and still stay in the configured ratio.
*/
movable_pages += nr_pages;
return movable_pages <= (auto_movable_ratio * kernel_early_pages) / 100;
}
/*
* Returns a default kernel memory zone for the given pfn range.
* If no kernel zone covers this pfn range it will automatically go
@@ -684,6 +852,117 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn
return &pgdat->node_zones[ZONE_NORMAL];
}
/*
* Determine to which zone to online memory dynamically based on user
* configuration and system stats. We care about the following ratio:
*
* MOVABLE : KERNEL
*
* Whereby MOVABLE is memory in ZONE_MOVABLE and KERNEL is memory in
* one of the kernel zones. CMA pages inside one of the kernel zones really
* behaves like ZONE_MOVABLE, so we treat them accordingly.
*
* We don't allow for hotplugged memory in a KERNEL zone to increase the
* amount of MOVABLE memory we can have, so we end up with:
*
* MOVABLE : KERNEL_EARLY
*
* Whereby KERNEL_EARLY is memory in one of the kernel zones, available sinze
* boot. We base our calculation on KERNEL_EARLY internally, because:
*
* a) Hotplugged memory in one of the kernel zones can sometimes still get
* hotunplugged, especially when hot(un)plugging individual memory blocks.
* There is no coordination across memory devices, therefore "automatic"
* hotunplugging, as implemented in hypervisors, could result in zone
* imbalances.
* b) Early/boot memory in one of the kernel zones can usually not get
* hotunplugged again (e.g., no firmware interface to unplug, fragmented
* with unmovable allocations). While there are corner cases where it might
* still work, it is barely relevant in practice.
*
* Exceptions are dynamic memory groups, which allow for more MOVABLE
* memory within the same memory group -- because in that case, there is
* coordination within the single memory device managed by a single driver.
*
* We rely on "present pages" instead of "managed pages", as the latter is
* highly unreliable and dynamic in virtualized environments, and does not
* consider boot time allocations. For example, memory ballooning adjusts the
* managed pages when inflating/deflating the balloon, and balloon compaction
* can even migrate inflated pages between zones.
*
* Using "present pages" is better but some things to keep in mind are:
*
* a) Some memblock allocations, such as for the crashkernel area, are
* effectively unused by the kernel, yet they account to "present pages".
* Fortunately, these allocations are comparatively small in relevant setups
* (e.g., fraction of system memory).
* b) Some hotplugged memory blocks in virtualized environments, esecially
* hotplugged by virtio-mem, look like they are completely present, however,
* only parts of the memory block are actually currently usable.
* "present pages" is an upper limit that can get reached at runtime. As
* we base our calculations on KERNEL_EARLY, this is not an issue.
*/
static struct zone *auto_movable_zone_for_pfn(int nid,
struct memory_group *group,
unsigned long pfn,
unsigned long nr_pages)
{
unsigned long online_pages = 0, max_pages, end_pfn;
struct page *page;
if (!auto_movable_ratio)
goto kernel_zone;
if (group && !group->is_dynamic) {
max_pages = group->s.max_pages;
online_pages = group->present_movable_pages;
/* If anything is !MOVABLE online the rest !MOVABLE. */
if (group->present_kernel_pages)
goto kernel_zone;
} else if (!group || group->d.unit_pages == nr_pages) {
max_pages = nr_pages;
} else {
max_pages = group->d.unit_pages;
/*
* Take a look at all online sections in the current unit.
* We can safely assume that all pages within a section belong
* to the same zone, because dynamic memory groups only deal
* with hotplugged memory.
*/
pfn = ALIGN_DOWN(pfn, group->d.unit_pages);
end_pfn = pfn + group->d.unit_pages;
for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
page = pfn_to_online_page(pfn);
if (!page)
continue;
/* If anything is !MOVABLE online the rest !MOVABLE. */
if (page_zonenum(page) != ZONE_MOVABLE)
goto kernel_zone;
online_pages += PAGES_PER_SECTION;
}
}
/*
* Online MOVABLE if we could *currently* online all remaining parts
* MOVABLE. We expect to (add+) online them immediately next, so if
* nobody interferes, all will be MOVABLE if possible.
*/
nr_pages = max_pages - online_pages;
if (!auto_movable_can_online_movable(NUMA_NO_NODE, group, nr_pages))
goto kernel_zone;
#ifdef CONFIG_NUMA
if (auto_movable_numa_aware &&
!auto_movable_can_online_movable(nid, group, nr_pages))
goto kernel_zone;
#endif /* CONFIG_NUMA */
return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
kernel_zone:
return default_kernel_zone_for_pfn(nid, pfn, nr_pages);
}
static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
unsigned long nr_pages)
{
@@ -708,7 +987,8 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn
return movable_node_enabled ? movable_zone : kernel_zone;
}
struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
struct zone *zone_for_pfn_range(int online_type, int nid,
struct memory_group *group, unsigned long start_pfn,
unsigned long nr_pages)
{
if (online_type == MMOP_ONLINE_KERNEL)
@@ -717,6 +997,9 @@ struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
if (online_type == MMOP_ONLINE_MOVABLE)
return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
if (online_policy == ONLINE_POLICY_AUTO_MOVABLE)
return auto_movable_zone_for_pfn(nid, group, start_pfn, nr_pages);
return default_zone_for_pfn(nid, start_pfn, nr_pages);
}
@@ -724,10 +1007,25 @@ struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
* This function should only be called by memory_block_{online,offline},
* and {online,offline}_pages.
*/
void adjust_present_page_count(struct zone *zone, long nr_pages)
void adjust_present_page_count(struct page *page, struct memory_group *group,
long nr_pages)
{
struct zone *zone = page_zone(page);
const bool movable = zone_idx(zone) == ZONE_MOVABLE;
/*
* We only support onlining/offlining/adding/removing of complete
* memory blocks; therefore, either all is either early or hotplugged.
*/
if (early_section(__pfn_to_section(page_to_pfn(page))))
zone->present_early_pages += nr_pages;
zone->present_pages += nr_pages;
zone->zone_pgdat->node_present_pages += nr_pages;
if (group && movable)
group->present_movable_pages += nr_pages;
else if (group && !movable)
group->present_kernel_pages += nr_pages;
}
int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
@@ -773,7 +1071,8 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
}
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone)
int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
struct zone *zone, struct memory_group *group)
{
unsigned long flags;
int need_zonelists_rebuild = 0;
@@ -826,7 +1125,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *z
}
online_pages_range(pfn, nr_pages);
adjust_present_page_count(zone, nr_pages);
adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
@@ -1059,6 +1358,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
struct vmem_altmap mhp_altmap = {};
struct memory_group *group = NULL;
u64 start, size;
bool new_node = false;
int ret;
@@ -1070,6 +1370,13 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
if (ret)
return ret;
if (mhp_flags & MHP_NID_IS_MGID) {
group = memory_group_find_by_id(nid);
if (!group)
return -EINVAL;
nid = group->nid;
}
if (!node_possible(nid)) {
WARN(1, "node %d was absent from the node_possible_map\n", nid);
return -EINVAL;
@@ -1104,9 +1411,10 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
goto error;
/* create memory block devices after memory was added */
ret = create_memory_block_devices(start, size, mhp_altmap.alloc);
ret = create_memory_block_devices(start, size, mhp_altmap.alloc,
group);
if (ret) {
arch_remove_memory(nid, start, size, NULL);
arch_remove_memory(start, size, NULL);
goto error;
}
@@ -1298,7 +1606,7 @@ struct zone *test_pages_in_a_zone(unsigned long start_pfn,
unsigned long pfn, sec_end_pfn;
struct zone *zone = NULL;
struct page *page;
int i;
for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
pfn < end_pfn;
pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
@@ -1307,17 +1615,10 @@ struct zone *test_pages_in_a_zone(unsigned long start_pfn,
continue;
for (; pfn < sec_end_pfn && pfn < end_pfn;
pfn += MAX_ORDER_NR_PAGES) {
i = 0;
/* This is just a CONFIG_HOLES_IN_ZONE check.*/
while ((i < MAX_ORDER_NR_PAGES) &&
!pfn_valid_within(pfn + i))
i++;
if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
continue;
/* Check if we got outside of the zone */
if (zone && !zone_spans_pfn(zone, pfn + i))
if (zone && !zone_spans_pfn(zone, pfn))
return NULL;
page = pfn_to_page(pfn + i);
page = pfn_to_page(pfn);
if (zone && page_zone(page) != zone)
return NULL;
zone = page_zone(page);
@@ -1568,7 +1869,8 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
return 0;
}
int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
struct memory_group *group)
{
const unsigned long end_pfn = start_pfn + nr_pages;
unsigned long pfn, system_ram_pages = 0;
@@ -1704,7 +2006,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
/* removal success */
adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
adjust_present_page_count(zone, -nr_pages);
adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages);
/* reinitialise watermarks and update pcp limits */
init_per_zone_wmark_min();
@@ -1746,7 +2048,9 @@ failed_removal:
static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
int ret = !is_memblock_offlined(mem);
int *nid = arg;
*nid = mem->nid;
if (unlikely(ret)) {
phys_addr_t beginpa, endpa;
@@ -1839,12 +2143,12 @@ void try_offline_node(int nid)
}
EXPORT_SYMBOL(try_offline_node);
static int __ref try_remove_memory(int nid, u64 start, u64 size)
static int __ref try_remove_memory(u64 start, u64 size)
{
int rc = 0;
struct vmem_altmap mhp_altmap = {};
struct vmem_altmap *altmap = NULL;
unsigned long nr_vmemmap_pages;
int rc = 0, nid = NUMA_NO_NODE;
BUG_ON(check_hotplug_memory_range(start, size));
@@ -1852,8 +2156,12 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
* All memory blocks must be offlined before removing memory. Check
* whether all memory blocks in question are offline and return error
* if this is not the case.
*
* While at it, determine the nid. Note that if we'd have mixed nodes,
* we'd only try to offline the last determined one -- which is good
* enough for the cases we care about.
*/
rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb);
rc = walk_memory_blocks(start, size, &nid, check_memblock_offlined_cb);
if (rc)
return rc;
@@ -1893,7 +2201,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
mem_hotplug_begin();
arch_remove_memory(nid, start, size, altmap);
arch_remove_memory(start, size, altmap);
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
memblock_free(start, size);
@@ -1902,7 +2210,8 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
release_mem_region_adjustable(start, size);
try_offline_node(nid);
if (nid != NUMA_NO_NODE)
try_offline_node(nid);
mem_hotplug_done();
return 0;
@@ -1910,7 +2219,6 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
/**
* __remove_memory - Remove memory if every memory block is offline
* @nid: the node ID
* @start: physical address of the region to remove
* @size: size of the region to remove
*
@@ -1918,14 +2226,14 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
* and online/offline operations before this call, as required by
* try_offline_node().
*/
void __remove_memory(int nid, u64 start, u64 size)
void __remove_memory(u64 start, u64 size)
{
/*
* trigger BUG() if some memory is not offlined prior to calling this
* function
*/
if (try_remove_memory(nid, start, size))
if (try_remove_memory(start, size))
BUG();
}
@@ -1933,12 +2241,12 @@ void __remove_memory(int nid, u64 start, u64 size)
* Remove memory if every memory block is offline, otherwise return -EBUSY is
* some memory is not offline
*/
int remove_memory(int nid, u64 start, u64 size)
int remove_memory(u64 start, u64 size)
{
int rc;
lock_device_hotplug();
rc = try_remove_memory(nid, start, size);
rc = try_remove_memory(start, size);
unlock_device_hotplug();
return rc;
@@ -1998,7 +2306,7 @@ static int try_reonline_memory_block(struct memory_block *mem, void *arg)
* unplugged all memory (so it's no longer in use) and want to offline + remove
* that memory.
*/
int offline_and_remove_memory(int nid, u64 start, u64 size)
int offline_and_remove_memory(u64 start, u64 size)
{
const unsigned long mb_count = size / memory_block_size_bytes();
uint8_t *online_types, *tmp;
@@ -2034,7 +2342,7 @@ int offline_and_remove_memory(int nid, u64 start, u64 size)
* This cannot fail as it cannot get onlined in the meantime.
*/
if (!rc) {
rc = try_remove_memory(nid, start, size);
rc = try_remove_memory(start, size);
if (rc)
pr_err("%s: Failed to remove memory: %d", __func__, rc);
}