Merge tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull mm updates from Andrew Morton:

 - Yosry Ahmed brought back some cgroup v1 stats in OOM logs

 - Yosry has also eliminated cgroup's atomic rstat flushing

 - Nhat Pham adds the new cachestat() syscall. It provides userspace
   with the ability to query pagecache status - a similar concept to
   mincore() but more powerful and with improved usability (a usage
   sketch follows this list)

 - Mel Gorman provides more optimizations for compaction, reducing the
   prevalence of page rescanning

 - Lorenzo Stoakes has done some maintenance work on the
   get_user_pages() interface (a kernel-side sketch of the new calling
   convention follows this list)

 - Liam Howlett continues with cleanups and maintenance work on the
   maple tree code. Peng Zhang also does some work on the maple tree

 - Johannes Weiner has done some cleanup work on the compaction code

 - David Hildenbrand has contributed additional selftests for
   get_user_pages()

 - Thomas Gleixner has contributed some maintenance and optimization
   work for the vmalloc code

 - Baolin Wang has provided some compaction cleanups

 - SeongJae Park continues maintenance work on the DAMON code

 - Huang Ying has done some maintenance on the swap code's usage of
   device refcounting

 - Christoph Hellwig has some cleanups for the filemap/directio code

 - Ryan Roberts provides two patch series which yield some
   rationalization of the kernel's access to pte entries - use the
   provided APIs rather than open-coding accesses

 - Lorenzo Stoakes has some fixes to the interaction between pagecache
   and directio access to file mappings

 - John Hubbard has a series of fixes to the MM selftesting code

 - ZhangPeng continues the folio conversion campaign

 - Hugh Dickins has been working on the pagetable handling code, mainly
   with a view to reducing the load on the mmap_lock

 - Catalin Marinas has reduced the arm64 kmalloc() minimum alignment
   from 128 to 8

 - Domenico Cerasuolo has improved the zswap reclaim mechanism by
   reorganizing the LRU management

 - Matthew Wilcox provides some fixups to make gfs2 work better with the
   buffer_head code

 - Vishal Moola also has done some folio conversion work

 - Matthew Wilcox has removed the remnants of the pagevec code - their
   functionality is migrated over to struct folio_batch (a short sketch
   follows the commit list below)
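
A minimal user-space sketch of calling the new cachestat() syscall via
syscall(2) follows. The struct layouts mirror the uapi definitions added by
this series; the fallback syscall number 451 and the /etc/hostname test file
are illustrative assumptions - prefer __NR_cachestat and <linux/mman.h> from
installed kernel headers when they are available. Counts are reported in
pages, and a zero-length range means "from off to the end of the file".

/* Hedged sketch: query the page cache state of one file with cachestat(). */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_cachestat
#define __NR_cachestat 451	/* assumed: the number assigned in this cycle */
#endif

struct cachestat_range {	/* mirrors the new uapi struct */
	uint64_t off;
	uint64_t len;		/* len == 0: query to the end of the file */
};

struct cachestat {		/* mirrors the new uapi struct */
	uint64_t nr_cache;
	uint64_t nr_dirty;
	uint64_t nr_writeback;
	uint64_t nr_evicted;
	uint64_t nr_recently_evicted;
};

int main(int argc, char **argv)
{
	struct cachestat_range range = { 0, 0 };
	struct cachestat cs;
	int fd = open(argc > 1 ? argv[1] : "/etc/hostname", O_RDONLY);

	if (fd < 0)
		return 1;
	if (syscall(__NR_cachestat, fd, &range, &cs, 0)) {	/* flags must be 0 */
		close(fd);
		return 1;
	}
	printf("cached=%llu dirty=%llu writeback=%llu evicted=%llu recently_evicted=%llu\n",
	       (unsigned long long)cs.nr_cache,
	       (unsigned long long)cs.nr_dirty,
	       (unsigned long long)cs.nr_writeback,
	       (unsigned long long)cs.nr_evicted,
	       (unsigned long long)cs.nr_recently_evicted);
	close(fd);
	return 0;
}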

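On the get_user_pages() rework: as the uprobes hunks further down show (the
ones touching __update_ref_ctr() and is_trap_at_addr()), get_user_pages_remote()
no longer takes a "vmas" array, and single-page callers that still need the VMA
can use the new get_user_page_vma_remote() helper, which hands back the page,
an ERR_PTR(), or NULL. The kernel-side sketch below is illustrative only -
example_peek_byte() is a made-up function and the error handling assumes the
helper's semantics as merged here.

/*
 * Illustrative sketch (not from the tree): read one byte of another
 * process's address space with the reworked GUP helpers. The caller is
 * assumed to hold mm's mmap_lock, as GUP requires.
 */
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/err.h>

static int example_peek_byte(struct mm_struct *mm, unsigned long vaddr, u8 *out)
{
	struct vm_area_struct *vma;
	struct page *page;
	void *kaddr;

	/* Takes a reference on the page and returns the covering VMA. */
	page = get_user_page_vma_remote(mm, vaddr, FOLL_FORCE, &vma);
	if (IS_ERR_OR_NULL(page))
		return page ? PTR_ERR(page) : -EFAULT;

	kaddr = kmap_local_page(page);
	*out = *(u8 *)(kaddr + offset_in_page(vaddr));
	kunmap_local(kaddr);
	put_page(page);
	return 0;
}
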
* tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (380 commits)
  mm/hugetlb: remove hugetlb_set_page_subpool()
  mm: nommu: correct the range of mmap_sem_read_lock in task_mem()
  hugetlb: revert use of page_cache_next_miss()
  Revert "page cache: fix page_cache_next/prev_miss off by one"
  mm/vmscan: fix root proactive reclaim unthrottling unbalanced node
  mm: memcg: rename and document global_reclaim()
  mm: kill [add|del]_page_to_lru_list()
  mm: compaction: convert to use a folio in isolate_migratepages_block()
  mm: zswap: fix double invalidate with exclusive loads
  mm: remove unnecessary pagevec includes
  mm: remove references to pagevec
  mm: rename invalidate_mapping_pagevec to mapping_try_invalidate
  mm: remove struct pagevec
  net: convert sunrpc from pagevec to folio_batch
  i915: convert i915_gpu_error to use a folio_batch
  pagevec: rename fbatch_count()
  mm: remove check_move_unevictable_pages()
  drm: convert drm_gem_put_pages() to use a folio_batch
  i915: convert shmem_sg_free_table() to use a folio_batch
  scatterlist: add sg_set_folio()
  ...
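
Several commits in the list above replace struct pagevec with struct
folio_batch. For orientation, a minimal kernel-side sketch of the folio_batch
API from <linux/pagevec.h> follows; example_drop_folios() is a made-up
function, and it assumes the caller has transferred one folio reference per
entry for the batch to drop.

#include <linux/mm.h>
#include <linux/pagevec.h>

/* Illustrative only: gather folios and drop their references in bulk. */
static void example_drop_folios(struct folio **folios, unsigned int nr)
{
	struct folio_batch fbatch;
	unsigned int i;

	folio_batch_init(&fbatch);
	for (i = 0; i < nr; i++) {
		/* folio_batch_add() returns the space left in the batch. */
		if (!folio_batch_add(&fbatch, folios[i]))
			folio_batch_release(&fbatch);	/* full: drop refs, reinit */
	}
	folio_batch_release(&fbatch);			/* drop whatever remains */
}
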
Linus Torvalds committed 2023-06-28 10:28:11 -07:00
334 changed files with 8347 additions and 6911 deletions

@@ -171,7 +171,7 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
 __diag_pop();
 
 /* see cgroup_rstat_flush() */
-static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
+static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
 	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
 {
 	int cpu;
@@ -207,9 +207,8 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
 		}
 		raw_spin_unlock_irqrestore(cpu_lock, flags);
 
-		/* if @may_sleep, play nice and yield if necessary */
-		if (may_sleep && (need_resched() ||
-				  spin_needbreak(&cgroup_rstat_lock))) {
+		/* play nice and yield if necessary */
+		if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
 			spin_unlock_irq(&cgroup_rstat_lock);
 			if (!cond_resched())
 				cpu_relax();
@@ -236,25 +235,10 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
 	might_sleep();
 
 	spin_lock_irq(&cgroup_rstat_lock);
-	cgroup_rstat_flush_locked(cgrp, true);
+	cgroup_rstat_flush_locked(cgrp);
 	spin_unlock_irq(&cgroup_rstat_lock);
 }
 
-/**
- * cgroup_rstat_flush_atomic- atomic version of cgroup_rstat_flush()
- * @cgrp: target cgroup
- *
- * This function can be called from any context.
- */
-void cgroup_rstat_flush_atomic(struct cgroup *cgrp)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&cgroup_rstat_lock, flags);
-	cgroup_rstat_flush_locked(cgrp, false);
-	spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
-}
-
 /**
  * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
  * @cgrp: target cgroup
@@ -269,7 +253,7 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp)
 {
 	might_sleep();
 	spin_lock_irq(&cgroup_rstat_lock);
-	cgroup_rstat_flush_locked(cgrp, true);
+	cgroup_rstat_flush_locked(cgrp);
 }
 
 /**

@@ -24,6 +24,9 @@ config DMA_OPS_BYPASS
 config ARCH_HAS_DMA_MAP_DIRECT
 	bool
 
+config NEED_SG_DMA_FLAGS
+	bool
+
 config NEED_SG_DMA_LENGTH
 	bool
 
@@ -87,6 +90,10 @@ config SWIOTLB
 	bool
 	select NEED_DMA_MAP_STATE
 
+config DMA_BOUNCE_UNALIGNED_KMALLOC
+	bool
+	depends on SWIOTLB
+
 config DMA_RESTRICTED_POOL
 	bool "DMA Restricted Pool"
 	depends on OF && OF_RESERVED_MEM && SWIOTLB

@@ -463,7 +463,7 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
 	int i;
 
 	for_each_sg(sgl, sg, nents, i) {
-		if (sg_is_dma_bus_address(sg))
+		if (sg_dma_is_bus_address(sg))
 			sg_dma_unmark_bus_address(sg);
 		else
 			dma_direct_unmap_page(dev, sg->dma_address,

@@ -94,7 +94,8 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
 		return swiotlb_map(dev, phys, size, dir, attrs);
 	}
 
-	if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
+	if (unlikely(!dma_capable(dev, dma_addr, size, true)) ||
+	    dma_kmalloc_needs_bounce(dev, size, dir)) {
 		if (is_pci_p2pdma_page(page))
 			return DMA_MAPPING_ERROR;
 		if (is_swiotlb_active(dev))

@@ -7490,6 +7490,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
 		return pud_leaf_size(pud);
 
 	pmdp = pmd_offset_lockless(pudp, pud, addr);
+again:
 	pmd = pmdp_get_lockless(pmdp);
 	if (!pmd_present(pmd))
 		return 0;
@@ -7498,6 +7499,9 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
 		return pmd_leaf_size(pmd);
 
 	ptep = pte_offset_map(&pmd, addr);
+	if (!ptep)
+		goto again;
+
 	pte = ptep_get_lockless(ptep);
 	if (pte_present(pte))
 		size = pte_leaf_size(pte);

@@ -192,7 +192,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 		inc_mm_counter(mm, MM_ANONPAGES);
 	}
 
-	flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
+	flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte)));
 	ptep_clear_flush_notify(vma, addr, pvmw.pte);
 	if (new_page)
 		set_pte_at_notify(mm, addr, pvmw.pte,
@@ -365,7 +365,6 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
 {
 	void *kaddr;
 	struct page *page;
-	struct vm_area_struct *vma;
 	int ret;
 	short *ptr;
 
@@ -373,7 +372,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
 		return -EINVAL;
 
 	ret = get_user_pages_remote(mm, vaddr, 1,
-			FOLL_WRITE, &page, &vma, NULL);
+			FOLL_WRITE, &page, NULL);
 	if (unlikely(ret <= 0)) {
 		/*
 		 * We are asking for 1 page. If get_user_pages_remote() fails,
@@ -474,10 +473,9 @@ retry:
 	if (is_register)
 		gup_flags |= FOLL_SPLIT_PMD;
 	/* Read the page with vaddr into memory */
-	ret = get_user_pages_remote(mm, vaddr, 1, gup_flags,
-				    &old_page, &vma, NULL);
-	if (ret <= 0)
-		return ret;
+	old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
+	if (IS_ERR_OR_NULL(old_page))
+		return old_page ? PTR_ERR(old_page) : 0;
 
 	ret = verify_opcode(old_page, vaddr, &opcode);
 	if (ret <= 0)
@@ -2027,8 +2025,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
 	 * but we treat this as a 'remote' access since it is
 	 * essentially a kernel access to the memory.
 	 */
-	result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page,
-			NULL, NULL);
+	result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL);
 	if (result < 0)
 		return result;

@@ -52,7 +52,6 @@ static inline void register_pid_ns_sysctl_table_vm(void)
 }
 #else
 static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) {}
-static inline void set_memfd_noexec_scope(struct pid_namespace *ns) {}
 static inline void register_pid_ns_sysctl_table_vm(void) {}
 #endif

@@ -21,6 +21,33 @@
 #include "power.h"
 
 #ifdef CONFIG_PM_SLEEP
+/*
+ * The following functions are used by the suspend/hibernate code to temporarily
+ * change gfp_allowed_mask in order to avoid using I/O during memory allocations
+ * while devices are suspended. To avoid races with the suspend/hibernate code,
+ * they should always be called with system_transition_mutex held
+ * (gfp_allowed_mask also should only be modified with system_transition_mutex
+ * held, unless the suspend/hibernate code is guaranteed not to run in parallel
+ * with that modification).
+ */
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
+{
+	WARN_ON(!mutex_is_locked(&system_transition_mutex));
+	if (saved_gfp_mask) {
+		gfp_allowed_mask = saved_gfp_mask;
+		saved_gfp_mask = 0;
+	}
+}
+
+void pm_restrict_gfp_mask(void)
+{
+	WARN_ON(!mutex_is_locked(&system_transition_mutex));
+	WARN_ON(saved_gfp_mask);
+	saved_gfp_mask = gfp_allowed_mask;
+	gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
+}
+
 unsigned int lock_system_sleep(void)
 {

@@ -210,6 +210,11 @@ static inline void suspend_test_finish(const char *label) {}
 /* kernel/power/main.c */
 extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down);
 extern int pm_notifier_call_chain(unsigned long val);
+void pm_restrict_gfp_mask(void);
+void pm_restore_gfp_mask(void);
+#else
+static inline void pm_restrict_gfp_mask(void) {}
+static inline void pm_restore_gfp_mask(void) {}
 #endif
 
 #ifdef CONFIG_HIGHMEM

@@ -1228,6 +1228,58 @@ unsigned int snapshot_additional_pages(struct zone *zone)
 	return 2 * rtree;
 }
 
+/*
+ * Touch the watchdog for every WD_PAGE_COUNT pages.
+ */
+#define WD_PAGE_COUNT	(128*1024)
+
+static void mark_free_pages(struct zone *zone)
+{
+	unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
+	unsigned long flags;
+	unsigned int order, t;
+	struct page *page;
+
+	if (zone_is_empty(zone))
+		return;
+
+	spin_lock_irqsave(&zone->lock, flags);
+
+	max_zone_pfn = zone_end_pfn(zone);
+	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+		if (pfn_valid(pfn)) {
+			page = pfn_to_page(pfn);
+
+			if (!--page_count) {
+				touch_nmi_watchdog();
+				page_count = WD_PAGE_COUNT;
+			}
+
+			if (page_zone(page) != zone)
+				continue;
+
+			if (!swsusp_page_is_forbidden(page))
+				swsusp_unset_page_free(page);
+		}
+
+	for_each_migratetype_order(order, t) {
+		list_for_each_entry(page,
+				&zone->free_area[order].free_list[t], buddy_list) {
+			unsigned long i;
+
+			pfn = page_to_pfn(page);
+			for (i = 0; i < (1UL << order); i++) {
+				if (!--page_count) {
+					touch_nmi_watchdog();
+					page_count = WD_PAGE_COUNT;
+				}
+				swsusp_set_page_free(pfn_to_page(pfn + i));
+			}
+		}
+	}
+
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
+
 #ifdef CONFIG_HIGHMEM
 /**
  * count_free_highmem_pages - Compute the total number of free highmem pages.

@@ -299,6 +299,7 @@ COND_SYSCALL(set_mempolicy);
 COND_SYSCALL(migrate_pages);
 COND_SYSCALL(move_pages);
 COND_SYSCALL(set_mempolicy_home_node);
+COND_SYSCALL(cachestat);
 
 COND_SYSCALL(perf_event_open);
 COND_SYSCALL(accept4);

@@ -2119,13 +2119,6 @@ static struct ctl_table vm_table[] = {
 		.extra2 = SYSCTL_ONE,
 	},
 #endif
-	{
-		.procname = "lowmem_reserve_ratio",
-		.data = &sysctl_lowmem_reserve_ratio,
-		.maxlen = sizeof(sysctl_lowmem_reserve_ratio),
-		.mode = 0644,
-		.proc_handler = lowmem_reserve_ratio_sysctl_handler,
-	},
 	{
 		.procname = "drop_caches",
 		.data = &sysctl_drop_caches,
@@ -2135,39 +2128,6 @@ static struct ctl_table vm_table[] = {
 		.extra1 = SYSCTL_ONE,
 		.extra2 = SYSCTL_FOUR,
 	},
-	{
-		.procname = "min_free_kbytes",
-		.data = &min_free_kbytes,
-		.maxlen = sizeof(min_free_kbytes),
-		.mode = 0644,
-		.proc_handler = min_free_kbytes_sysctl_handler,
-		.extra1 = SYSCTL_ZERO,
-	},
-	{
-		.procname = "watermark_boost_factor",
-		.data = &watermark_boost_factor,
-		.maxlen = sizeof(watermark_boost_factor),
-		.mode = 0644,
-		.proc_handler = proc_dointvec_minmax,
-		.extra1 = SYSCTL_ZERO,
-	},
-	{
-		.procname = "watermark_scale_factor",
-		.data = &watermark_scale_factor,
-		.maxlen = sizeof(watermark_scale_factor),
-		.mode = 0644,
-		.proc_handler = watermark_scale_factor_sysctl_handler,
-		.extra1 = SYSCTL_ONE,
-		.extra2 = SYSCTL_THREE_THOUSAND,
-	},
-	{
-		.procname = "percpu_pagelist_high_fraction",
-		.data = &percpu_pagelist_high_fraction,
-		.maxlen = sizeof(percpu_pagelist_high_fraction),
-		.mode = 0644,
-		.proc_handler = percpu_pagelist_high_fraction_sysctl_handler,
-		.extra1 = SYSCTL_ZERO,
-	},
 	{
 		.procname = "page_lock_unfairness",
 		.data = &sysctl_page_lock_unfairness,
@@ -2223,24 +2183,6 @@ static struct ctl_table vm_table[] = {
 		.proc_handler = proc_dointvec_minmax,
 		.extra1 = SYSCTL_ZERO,
 	},
-	{
-		.procname = "min_unmapped_ratio",
-		.data = &sysctl_min_unmapped_ratio,
-		.maxlen = sizeof(sysctl_min_unmapped_ratio),
-		.mode = 0644,
-		.proc_handler = sysctl_min_unmapped_ratio_sysctl_handler,
-		.extra1 = SYSCTL_ZERO,
-		.extra2 = SYSCTL_ONE_HUNDRED,
-	},
-	{
-		.procname = "min_slab_ratio",
-		.data = &sysctl_min_slab_ratio,
-		.maxlen = sizeof(sysctl_min_slab_ratio),
-		.mode = 0644,
-		.proc_handler = sysctl_min_slab_ratio_sysctl_handler,
-		.extra1 = SYSCTL_ZERO,
-		.extra2 = SYSCTL_ONE_HUNDRED,
-	},
 #endif
 #ifdef CONFIG_SMP
 	{
@@ -2267,15 +2209,6 @@ static struct ctl_table vm_table[] = {
 		.proc_handler = mmap_min_addr_handler,
 	},
 #endif
-#ifdef CONFIG_NUMA
-	{
-		.procname = "numa_zonelist_order",
-		.data = &numa_zonelist_order,
-		.maxlen = NUMA_ZONELIST_ORDER_LEN,
-		.mode = 0644,
-		.proc_handler = numa_zonelist_order_handler,
-	},
-#endif
 #if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \
     (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
 	{

@@ -498,7 +498,7 @@ static int user_event_enabler_write(struct user_event_mm *mm,
 		return -EBUSY;
 
 	ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
-				    &page, NULL, NULL);
+				    &page, NULL);
 	if (unlikely(ret <= 0)) {
 		if (!fixup_fault)