Merge tag 'mm-stable-2022-08-09' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull remaining MM updates from Andrew Morton:
 "Three patch series - two that perform cleanups and one feature:

   - hugetlb_vmemmap cleanups from Muchun Song

   - hardware poisoning support for 1GB hugepages, from Naoya Horiguchi

   - highmem documentation fixups from Fabio De Francesco"

* tag 'mm-stable-2022-08-09' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (23 commits)
  Documentation/mm: add details about kmap_local_page() and preemption
  highmem: delete a sentence from kmap_local_page() kdocs
  Documentation/mm: rrefer kmap_local_page() and avoid kmap()
  Documentation/mm: avoid invalid use of addresses from kmap_local_page()
  Documentation/mm: don't kmap*() pages which can't come from HIGHMEM
  highmem: specify that kmap_local_page() is callable from interrupts
  highmem: remove unneeded spaces in kmap_local_page() kdocs
  mm, hwpoison: enable memory error handling on 1GB hugepage
  mm, hwpoison: skip raw hwpoison page in freeing 1GB hugepage
  mm, hwpoison: make __page_handle_poison returns int
  mm, hwpoison: set PG_hwpoison for busy hugetlb pages
  mm, hwpoison: make unpoison aware of raw error info in hwpoisoned hugepage
  mm, hwpoison, hugetlb: support saving mechanism of raw error pages
  mm/hugetlb: make pud_huge() and follow_huge_pud() aware of non-present pud entry
  mm/hugetlb: check gigantic_page_runtime_supported() in return_unused_surplus_pages()
  mm: hugetlb_vmemmap: use PTRS_PER_PTE instead of PMD_SIZE / PAGE_SIZE
  mm: hugetlb_vmemmap: move code comments to vmemmap_dedup.rst
  mm: hugetlb_vmemmap: improve hugetlb_vmemmap code readability
  mm: hugetlb_vmemmap: replace early_param() with core_param()
  mm: hugetlb_vmemmap: move vmemmap code related to HugeTLB to hugetlb_vmemmap.c
  ...
@@ -1735,12 +1735,13 @@
 	hugetlb_free_vmemmap=
 			[KNL] Requires CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 			enabled.
+			Control if HugeTLB Vmemmap Optimization (HVO) is enabled.
 			Allows heavy hugetlb users to free up some more
 			memory (7 * PAGE_SIZE for each 2MB hugetlb page).
-			Format: { [oO][Nn]/Y/y/1 | [oO][Ff]/N/n/0 (default) }
+			Format: { on | off (default) }

-			[oO][Nn]/Y/y/1: enable the feature
-			[oO][Ff]/N/n/0: disable the feature
+			on: enable HVO
+			off: disable HVO

 			Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y,
 			the default is on.
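For illustration only (this line is editorial, not part of the patch): with the format documented above, enabling HVO at boot is a single kernel command-line token::

    hugetlb_free_vmemmap=on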
@@ -164,8 +164,8 @@ default_hugepagesz
 	will all result in 256 2M huge pages being allocated.  Valid default
 	huge page size is architecture dependent.
 hugetlb_free_vmemmap
-	When CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is set, this enables optimizing
-	unused vmemmap pages associated with each HugeTLB page.
+	When CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is set, this enables HugeTLB
+	Vmemmap Optimization (HVO).

 When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages``
 indicates the current number of pre-allocated huge pages of the default size.
@@ -653,8 +653,8 @@ block might fail:
 - Concurrent activity that operates on the same physical memory area, such as
   allocating gigantic pages, can result in temporary offlining failures.

-- Out of memory when dissolving huge pages, especially when freeing unused
-  vmemmap pages associated with each hugetlb page is enabled.
+- Out of memory when dissolving huge pages, especially when HugeTLB Vmemmap
+  Optimization (HVO) is enabled.

   Offlining code may be able to migrate huge page contents, but may not be able
   to dissolve the source huge page because it fails allocating (unmovable) pages
@@ -569,8 +569,7 @@ This knob is not available when the size of 'struct page' (a structure defined
 in include/linux/mm_types.h) is not power of two (an unusual system config could
 result in this).

-Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages
-associated with each HugeTLB page.
+Enable (set to 1) or disable (set to 0) HugeTLB Vmemmap Optimization (HVO).

 Once enabled, the vmemmap pages of subsequent allocation of HugeTLB pages from
 buddy allocator will be optimized (7 pages per 2MB HugeTLB page and 4095 pages
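As a runtime illustration (editorial, not part of the patch; it assumes the ``hugetlb_optimize_vmemmap`` sysctl name quoted in the fs/Kconfig help further down), the knob described here can be flipped from a shell::

    echo 1 > /proc/sys/vm/hugetlb_optimize_vmemmap    # enable HVO
    echo 0 > /proc/sys/vm/hugetlb_optimize_vmemmap    # disable HVO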
@@ -60,17 +60,40 @@ list shows them in order of preference of use.
   This function should be preferred, where feasible, over all the others.

   These mappings are thread-local and CPU-local, meaning that the mapping
-  can only be accessed from within this thread and the thread is bound the
-  CPU while the mapping is active. Even if the thread is preempted (since
-  preemption is never disabled by the function) the CPU can not be
-  unplugged from the system via CPU-hotplug until the mapping is disposed.
+  can only be accessed from within this thread and the thread is bound to the
+  CPU while the mapping is active. Although preemption is never disabled by
+  this function, the CPU can not be unplugged from the system via
+  CPU-hotplug until the mapping is disposed.

   It's valid to take pagefaults in a local kmap region, unless the context
   in which the local mapping is acquired does not allow it for other reasons.

+  As said, pagefaults and preemption are never disabled. There is no need to
+  disable preemption because, when context switches to a different task, the
+  maps of the outgoing task are saved and those of the incoming one are
+  restored.
+
   kmap_local_page() always returns a valid virtual address and it is assumed
   that kunmap_local() will never fail.

+  On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the
+  virtual address of the direct mapping. Only real highmem pages are
+  temporarily mapped. Therefore, users may call a plain page_address()
+  for pages which are known to not come from ZONE_HIGHMEM. However, it is
+  always safe to use kmap_local_page() / kunmap_local().
+
+  While it is significantly faster than kmap(), for the highmem case it
+  comes with restrictions about the pointer validity. Contrary to kmap()
+  mappings, the local mappings are only valid in the context of the caller
+  and cannot be handed to other contexts. This implies that users must
+  be absolutely sure to keep the use of the return address local to the
+  thread which mapped it.
+
+  Most code can be designed to use thread-local mappings. Users should
+  therefore try to design their code to avoid the use of kmap() by mapping
+  pages in the same thread the address will be used in, and prefer
+  kmap_local_page().
+
   Nesting kmap_local_page() and kmap_atomic() mappings is allowed to a certain
   extent (up to KMAP_TYPE_NR) but their invocations have to be strictly ordered
   because the map implementation is stack based. See kmap_local_page() kdocs
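To make the calling convention above concrete, here is a minimal editorial sketch (not taken from this patch set; the helper name and the page-zeroing body are illustrative only) of the local-mapping pattern the text describes: map, use the address only within the current thread, then unmap.

.. code-block:: c

	#include <linux/highmem.h>
	#include <linux/string.h>

	/* Hypothetical helper: zero one page through a short-lived local mapping. */
	static void zero_page_local(struct page *page)
	{
		void *addr = kmap_local_page(page);	/* always returns a valid address */

		memset(addr, 0, PAGE_SIZE);		/* use the pointer only in this thread */
		kunmap_local(addr);			/* unmap before leaving the function */
	}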
@@ -7,23 +7,25 @@ A vmemmap diet for HugeTLB and Device DAX
 HugeTLB
 =======

-The struct page structures (page structs) are used to describe a physical
-page frame. By default, there is a one-to-one mapping from a page frame to
-it's corresponding page struct.
+This section is to explain how HugeTLB Vmemmap Optimization (HVO) works.
+
+The ``struct page`` structures are used to describe a physical page frame. By
+default, there is a one-to-one mapping from a page frame to its corresponding
+``struct page``.

 HugeTLB pages consist of multiple base page size pages and are supported by many
 architectures. See Documentation/admin-guide/mm/hugetlbpage.rst for more
 details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB are
 currently supported. Since the base page size on x86 is 4KB, a 2MB HugeTLB page
 consists of 512 base pages and a 1GB HugeTLB page consists of 4096 base pages.
-For each base page, there is a corresponding page struct.
+For each base page, there is a corresponding ``struct page``.

-Within the HugeTLB subsystem, only the first 4 page structs are used to
-contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides
-this upper limit. The only 'useful' information in the remaining page structs
+Within the HugeTLB subsystem, only the first 4 ``struct page`` are used to
+contain unique information about a HugeTLB page. ``__NR_USED_SUBPAGE`` provides
+this upper limit. The only 'useful' information in the remaining ``struct page``
 is the compound_head field, and this field is the same for all tail pages.

-By removing redundant page structs for HugeTLB pages, memory can be returned
+By removing redundant ``struct page`` for HugeTLB pages, memory can be returned
 to the buddy allocator for other uses.

 Different architectures support different HugeTLB pages. For example, the
@@ -44,7 +46,7 @@ page.
 |              |    64KB   |    2MB    |   512MB   |    16GB   |           |
 +--------------+-----------+-----------+-----------+-----------+-----------+

-When the system boots up, every HugeTLB page has more than one struct page
+When the system boots up, every HugeTLB page has more than one ``struct page``
 structs, whose size is (unit: pages)::

    struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
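A quick worked example (editorial, using the x86-64 numbers quoted earlier in this file: 4KB base pages, 512 base pages per 2MB HugeTLB page, and a 64-byte ``struct page``)::

    struct_size = 2MB / 4KB * 64 / 4KB
                = 512 * 64 / 4096
                = 8 (pages)

which matches the "8 page frames" figure given below for a pmd-mapped HugeTLB page.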
@@ -74,10 +76,10 @@ Where n is how many pte entries which one page can contains. So the value of
 n is (PAGE_SIZE / sizeof(pte_t)).

 This optimization only supports 64-bit systems, so the value of sizeof(pte_t)
-is 8. And this optimization is also applicable only when the size of struct page
-is a power of two. In most cases, the size of struct page is 64 bytes (e.g.
+is 8. And this optimization is also applicable only when the size of ``struct page``
+is a power of two. In most cases, the size of ``struct page`` is 64 bytes (e.g.
 x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the
-size of struct page structs of it is 8 page frames which size depends on the
+size of its ``struct page`` structs is 8 page frames, whose size depends on the
 size of the base page.

 For the HugeTLB page of the pud level mapping, then::
@@ -86,7 +88,7 @@ For the HugeTLB page of the pud level mapping, then::
                = PAGE_SIZE / 8 * 8 (pages)
                = PAGE_SIZE (pages)

-Where the struct_size(pmd) is the size of the struct page structs of a
+Where the struct_size(pmd) is the size of the ``struct page`` structs of a
 HugeTLB page of the pmd level mapping.

 E.g.: A 2MB HugeTLB page on x86_64 consists of 8 page frames while 1GB
@@ -94,7 +96,7 @@ HugeTLB page consists in 4096.

 Next, we take the pmd level mapping of the HugeTLB page as an example to
 show the internal implementation of this optimization. There are 8 pages
-struct page structs associated with a HugeTLB page which is pmd mapped.
+of ``struct page`` structs associated with a HugeTLB page which is pmd mapped.

 Here is how things look before optimization::

@@ -122,10 +124,10 @@ Here is how things look before optimization::
 +-----------+

 The value of page->compound_head is the same for all tail pages. The first
-page of page structs (page 0) associated with the HugeTLB page contains the 4
-page structs necessary to describe the HugeTLB. The only use of the remaining
-pages of page structs (page 1 to page 7) is to point to page->compound_head.
-Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs
+page of ``struct page`` (page 0) associated with the HugeTLB page contains the 4
+``struct page`` necessary to describe the HugeTLB. The only use of the remaining
+pages of ``struct page`` (page 1 to page 7) is to point to page->compound_head.
+Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of ``struct page``
 will be used for each HugeTLB page. This will allow us to free the remaining
 7 pages to the buddy allocator.

@@ -167,13 +169,37 @@ entries that can be cached in a single TLB entry.

 The contiguous bit is used to increase the mapping size at the pmd and pte
 (last) level. So this type of HugeTLB page can be optimized only when its
-size of the struct page structs is greater than 1 page.
+size of the ``struct page`` structs is greater than **1** page.

 Notice: The head vmemmap page is not freed to the buddy allocator and all
 tail vmemmap pages are mapped to the head vmemmap page frame. So we can see
-more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page)
-associated with each HugeTLB page. The compound_head() can handle this
-correctly (more details refer to the comment above compound_head()).
+more than one ``struct page`` struct with ``PG_head`` (e.g. 8 per 2 MB HugeTLB
+page) associated with each HugeTLB page. The ``compound_head()`` can handle
+this correctly. There is only **one** head ``struct page``; the tail
+``struct page`` with ``PG_head`` are fake head ``struct page``. We need an
+approach to distinguish between those two different types of ``struct page`` so
+that ``compound_head()`` can return the real head ``struct page`` when the
+parameter is the tail ``struct page`` but with ``PG_head``. The following code
+snippet describes how to distinguish between real and fake head ``struct page``.
+
+.. code-block:: c
+
+	if (test_bit(PG_head, &page->flags)) {
+		unsigned long head = READ_ONCE(page[1].compound_head);
+
+		if (head & 1) {
+			if (head == (unsigned long)page + 1)
+				/* head struct page */
+			else
+				/* tail struct page */
+		} else {
+			/* head struct page */
+		}
+	}
+
+We can safely access the field of the **page[1]** with ``PG_head`` because the
+page is a compound page composed of at least two contiguous pages.
+The implementation refers to ``page_fixed_fake_head()``.
+
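A small editorial sketch (not part of the patch) of how a caller might rely on the behaviour just described; the helper name is hypothetical, and it simply leans on ``compound_head()`` seeing through fake heads:

.. code-block:: c

	#include <linux/mm.h>
	#include <linux/page-flags.h>

	/* Hypothetical helper: true only for the single real head of a compound page. */
	static bool is_real_compound_head(struct page *page)
	{
		/*
		 * With HVO, a tail page can carry PG_head (a "fake head"), but
		 * compound_head() still resolves it to the one real head page.
		 */
		return PageHead(page) && page == compound_head(page);
	}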
 Device DAX
 ==========
@@ -187,7 +213,7 @@ PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64).

 The differences with HugeTLB are relatively minor.

-It only use 3 page structs for storing all information as opposed
+It only uses 3 ``struct page`` for storing all information as opposed
 to 4 on HugeTLB pages.

 There's no remapping of vmemmap given that device-dax memory is not part of
@@ -76,17 +76,10 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache);
 void flush_dcache_page(struct page *page)
 {
 	/*
-	 * Only the head page's flags of HugeTLB can be cleared since the tail
-	 * vmemmap pages associated with each HugeTLB page are mapped with
-	 * read-only when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is enabled (more
-	 * details can refer to vmemmap_remap_pte()). Although
-	 * __sync_icache_dcache() only set PG_dcache_clean flag on the head
-	 * page struct, there is more than one page struct with PG_dcache_clean
-	 * associated with the HugeTLB page since the head vmemmap page frame
-	 * is reused (more details can refer to the comments above
-	 * page_fixed_fake_head()).
+	 * HugeTLB pages are always fully mapped and only head page will be
+	 * set PG_dcache_clean (see comments in __sync_icache_dcache()).
 	 */
-	if (hugetlb_optimize_vmemmap_enabled() && PageHuge(page))
+	if (PageHuge(page))
 		page = compound_head(page);

 	if (test_bit(PG_dcache_clean, &page->flags))
@@ -30,9 +30,15 @@ int pmd_huge(pmd_t pmd)
 		(pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
 }

+/*
+ * pud_huge() returns 1 if @pud is hugetlb related entry, that is normal
+ * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry.
+ * Otherwise, returns 0.
+ */
 int pud_huge(pud_t pud)
 {
-	return !!(pud_val(pud) & _PAGE_PSE);
+	return !pud_none(pud) &&
+		(pud_val(pud) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
 }

 #ifdef CONFIG_HUGETLB_PAGE
 fs/Kconfig | 12
@@ -247,8 +247,7 @@ config HUGETLB_PAGE

 #
 # Select this config option from the architecture Kconfig, if it is preferred
-# to enable the feature of minimizing overhead of struct page associated with
-# each HugeTLB page.
+# to enable the feature of HugeTLB Vmemmap Optimization (HVO).
 #
 config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	bool
@@ -259,14 +258,13 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	depends on SPARSEMEM_VMEMMAP

 config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
-	bool "Default optimizing vmemmap pages of HugeTLB to on"
+	bool "HugeTLB Vmemmap Optimization (HVO) defaults to on"
 	default n
 	depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	help
-	  When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap
-	  pages associated with each HugeTLB page is default off. Say Y here
-	  to enable optimizing vmemmap pages of HugeTLB by default. It can then
-	  be disabled on the command line via hugetlb_free_vmemmap=off.
+	  The HugeTLB Vmemmap Optimization (HVO) defaults to off. Say Y here to
+	  enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off
+	  (boot command line) or hugetlb_optimize_vmemmap (sysctl).

 config MEMFD_CREATE
 	def_bool TMPFS || HUGETLBFS
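Editorial note: the "Select this config option from the architecture Kconfig" comment above translates, on an architecture that supports HVO, into a one-line select. The fragment below is an abbreviated illustration rather than a quote from this patch::

    config X86
    	...
    	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP	if X86_64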
@@ -60,11 +60,11 @@ static inline void kmap_flush_unused(void);

 /**
  * kmap_local_page - Map a page for temporary usage
  * @page: Pointer to the page to be mapped
  *
  * Returns: The virtual address of the mapping
  *
- * Can be invoked from any context.
+ * Can be invoked from any context, including interrupts.
  *
  * Requires careful handling when nesting multiple mappings because the map
  * management is stack based. The unmap has to be in the reverse order of
@@ -86,8 +86,7 @@ static inline void kmap_flush_unused(void);
  * temporarily mapped.
  *
  * While it is significantly faster than kmap() for the highmem case it
- * comes with restrictions about the pointer validity. Only use when really
- * necessary.
+ * comes with restrictions about the pointer validity.
  *
  * On HIGHMEM enabled systems mapping a highmem page has the side effect of
  * disabling migration in order to keep the virtual address stable across
@@ -42,6 +42,9 @@ enum {
 	SUBPAGE_INDEX_CGROUP,		/* reuse page->private */
 	SUBPAGE_INDEX_CGROUP_RSVD,	/* reuse page->private */
 	__MAX_CGROUP_SUBPAGE_INDEX = SUBPAGE_INDEX_CGROUP_RSVD,
+#endif
+#ifdef CONFIG_MEMORY_FAILURE
+	SUBPAGE_INDEX_HWPOISON,
 #endif
 	__NR_USED_SUBPAGE,
 };
@@ -551,7 +554,7 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
  *	Synchronization: Initially set after new page allocation with no
  *	locking.  When examined and modified during migration processing
  *	(isolate, migrate, putback) the hugetlb_lock is held.
- * HPG_temporary - - Set on a page that is temporarily allocated from the buddy
+ * HPG_temporary - Set on a page that is temporarily allocated from the buddy
  *	allocator.  Typically used for migration target pages when no pages
  *	are available in the pool.  The hugetlb free page path will
  *	immediately free pages with this flag set to the buddy allocator.
@@ -561,6 +564,8 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
  * HPG_freed - Set when page is on the free lists.
  *	Synchronization: hugetlb_lock held for examination and modification.
  * HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed.
+ * HPG_raw_hwp_unreliable - Set when the hugetlb page has a hwpoison sub-page
+ *	that is not tracked by raw_hwp_page list.
  */
 enum hugetlb_page_flags {
 	HPG_restore_reserve = 0,
@@ -568,6 +573,7 @@ enum hugetlb_page_flags {
 	HPG_temporary,
 	HPG_freed,
 	HPG_vmemmap_optimized,
+	HPG_raw_hwp_unreliable,
 	__NR_HPAGEFLAGS,
 };
@@ -614,6 +620,7 @@ HPAGEFLAG(Migratable, migratable)
 HPAGEFLAG(Temporary, temporary)
 HPAGEFLAG(Freed, freed)
 HPAGEFLAG(VmemmapOptimized, vmemmap_optimized)
+HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable)

 #ifdef CONFIG_HUGETLB_PAGE

@@ -638,9 +645,6 @@ struct hstate {
 	unsigned int nr_huge_pages_node[MAX_NUMNODES];
 	unsigned int free_huge_pages_node[MAX_NUMNODES];
 	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
-#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
-	unsigned int optimize_vmemmap_pages;
-#endif
 #ifdef CONFIG_CGROUP_HUGETLB
 	/* cgroup control files */
 	struct cftype cgroup_files_dfl[8];
@@ -716,7 +720,7 @@ static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
 	return hstate_file(vma->vm_file);
 }

-static inline unsigned long huge_page_size(struct hstate *h)
+static inline unsigned long huge_page_size(const struct hstate *h)
 {
 	return (unsigned long)PAGE_SIZE << h->order;
 }
@@ -745,7 +749,7 @@ static inline bool hstate_is_gigantic(struct hstate *h)
 	return huge_page_order(h) >= MAX_ORDER;
 }

-static inline unsigned int pages_per_huge_page(struct hstate *h)
+static inline unsigned int pages_per_huge_page(const struct hstate *h)
 {
 	return 1 << h->order;
 }
@@ -799,6 +803,14 @@ extern int dissolve_free_huge_page(struct page *page);
 extern int dissolve_free_huge_pages(unsigned long start_pfn,
 				    unsigned long end_pfn);

+#ifdef CONFIG_MEMORY_FAILURE
+extern void hugetlb_clear_page_hwpoison(struct page *hpage);
+#else
+static inline void hugetlb_clear_page_hwpoison(struct page *hpage)
+{
+}
+#endif
+
 #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
 #ifndef arch_hugetlb_migration_supported
 static inline bool arch_hugetlb_migration_supported(struct hstate *h)
@@ -3142,13 +3142,6 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
 }
 #endif

-#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
-int vmemmap_remap_free(unsigned long start, unsigned long end,
-		       unsigned long reuse);
-int vmemmap_remap_alloc(unsigned long start, unsigned long end,
-			unsigned long reuse, gfp_t gfp_mask);
-#endif
-
 void *sparse_buffer_alloc(unsigned long size);
 struct page * __populate_section_memmap(unsigned long pfn,
 		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
@@ -3183,6 +3176,7 @@ enum mf_flags {
 	MF_SOFT_OFFLINE = 1 << 3,
 	MF_UNPOISON = 1 << 4,
 	MF_SW_SIMULATED = 1 << 5,
+	MF_NO_RETRY = 1 << 6,
 };
 int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
 		      unsigned long count, int mf_flags);
@@ -3235,7 +3229,6 @@ enum mf_action_page_type {
 	MF_MSG_DIFFERENT_COMPOUND,
 	MF_MSG_HUGE,
 	MF_MSG_FREE_HUGE,
-	MF_MSG_NON_PMD_HUGE,
 	MF_MSG_UNMAP_FAILED,
 	MF_MSG_DIRTY_SWAPCACHE,
 	MF_MSG_CLEAN_SWAPCACHE,
@@ -205,34 +205,15 @@ enum pageflags {
 #ifndef __GENERATING_BOUNDS_H

 #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
-DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
-			 hugetlb_optimize_vmemmap_key);
-
-static __always_inline bool hugetlb_optimize_vmemmap_enabled(void)
-{
-	return static_branch_maybe(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
-				   &hugetlb_optimize_vmemmap_key);
-}
+DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);

 /*
- * If the feature of optimizing vmemmap pages associated with each HugeTLB
- * page is enabled, the head vmemmap page frame is reused and all of the tail
- * vmemmap addresses map to the head vmemmap page frame (furture details can
- * refer to the figure at the head of the mm/hugetlb_vmemmap.c).  In other
- * words, there are more than one page struct with PG_head associated with each
- * HugeTLB page.  We __know__ that there is only one head page struct, the tail
- * page structs with PG_head are fake head page structs.  We need an approach
- * to distinguish between those two different types of page structs so that
- * compound_head() can return the real head page struct when the parameter is
- * the tail page struct but with PG_head.
- *
- * The page_fixed_fake_head() returns the real head page struct if the @page is
- * fake page head, otherwise, returns @page which can either be a true page
- * head or tail.
+ * Return the real head page struct iff the @page is a fake head page, otherwise
+ * return the @page itself. See Documentation/mm/vmemmap_dedup.rst.
  */
 static __always_inline const struct page *page_fixed_fake_head(const struct page *page)
 {
-	if (!hugetlb_optimize_vmemmap_enabled())
+	if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key))
 		return page;

 	/*
@@ -260,11 +241,6 @@ static inline const struct page *page_fixed_fake_head(const struct page *page)
 {
 	return page;
 }
-
-static inline bool hugetlb_optimize_vmemmap_enabled(void)
-{
-	return false;
-}
 #endif

 static __always_inline int page_is_fake_head(struct page *page)
@@ -490,6 +490,11 @@ static inline void num_poisoned_pages_dec(void)
 	atomic_long_dec(&num_poisoned_pages);
 }

+static inline void num_poisoned_pages_sub(long i)
+{
+	atomic_long_sub(i, &num_poisoned_pages);
+}
+
 #else

 static inline swp_entry_t make_hwpoison_entry(struct page *page)
@@ -505,6 +510,10 @@ static inline int is_hwpoison_entry(swp_entry_t swp)
 static inline void num_poisoned_pages_inc(void)
 {
 }
+
+static inline void num_poisoned_pages_sub(long i)
+{
+}
 #endif

 static inline int non_swap_entry(swp_entry_t entry)
@@ -268,6 +268,10 @@ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table *
 	return NULL;
 }

+static inline void register_sysctl_init(const char *path, struct ctl_table *table)
+{
+}
+
 static inline struct ctl_table_header *register_sysctl_mount_point(const char *path)
 {
 	return NULL;
@@ -360,7 +360,6 @@ TRACE_EVENT(aer_event,
 	EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \
 	EM ( MF_MSG_HUGE, "huge page" )					\
 	EM ( MF_MSG_FREE_HUGE, "free huge page" )			\
-	EM ( MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page" )		\
 	EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" )		\
 	EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" )		\
 	EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" )		\
 mm/hugetlb.c | 73
@@ -1535,7 +1535,14 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
 	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
 		return;

-	if (hugetlb_vmemmap_alloc(h, page)) {
+	/*
+	 * If we don't know which subpages are hwpoisoned, we can't free
+	 * the hugepage, so it's leaked intentionally.
+	 */
+	if (HPageRawHwpUnreliable(page))
+		return;
+
+	if (hugetlb_vmemmap_restore(h, page)) {
 		spin_lock_irq(&hugetlb_lock);
 		/*
 		 * If we cannot allocate vmemmap pages, just refuse to free the
@@ -1547,6 +1554,13 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
 		return;
 	}

+	/*
+	 * Move PageHWPoison flag from head page to the raw error pages,
+	 * which makes any healthy subpages reusable.
+	 */
+	if (unlikely(PageHWPoison(page)))
+		hugetlb_clear_page_hwpoison(page);
+
 	for (i = 0; i < pages_per_huge_page(h);
 	     i++, subpage = mem_map_next(subpage, page, i)) {
 		subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
@@ -1612,7 +1626,7 @@ static DECLARE_WORK(free_hpage_work, free_hpage_workfn);

 static inline void flush_free_hpage_work(struct hstate *h)
 {
-	if (hugetlb_optimize_vmemmap_pages(h))
+	if (hugetlb_vmemmap_optimizable(h))
 		flush_work(&free_hpage_work);
 }

@@ -1734,7 +1748,7 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)

 static void __prep_new_huge_page(struct hstate *h, struct page *page)
 {
-	hugetlb_vmemmap_free(h, page);
+	hugetlb_vmemmap_optimize(h, page);
 	INIT_LIST_HEAD(&page->lru);
 	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
 	hugetlb_set_page_subpool(page, NULL);
@@ -2107,17 +2121,8 @@ retry:
 		 * Attempt to allocate vmemmap here so that we can take
 		 * appropriate action on failure.
 		 */
-		rc = hugetlb_vmemmap_alloc(h, head);
+		rc = hugetlb_vmemmap_restore(h, head);
 		if (!rc) {
-			/*
-			 * Move PageHWPoison flag from head page to the raw
-			 * error page, which makes any subpages rather than
-			 * the error page reusable.
-			 */
-			if (PageHWPoison(head) && page != head) {
-				SetPageHWPoison(page);
-				ClearPageHWPoison(head);
-			}
 			update_and_free_page(h, head, false);
 		} else {
 			spin_lock_irq(&hugetlb_lock);
@@ -2432,8 +2437,7 @@ static void return_unused_surplus_pages(struct hstate *h,
 	/* Uncommit the reservation */
 	h->resv_huge_pages -= unused_resv_pages;

-	/* Cannot return gigantic pages currently */
-	if (hstate_is_gigantic(h))
+	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
 		goto out;

 	/*
@@ -3182,8 +3186,10 @@ static void __init report_hugepages(void)
 		char buf[32];

 		string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
-		pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
+		pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
 			buf, h->free_huge_pages);
+		pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
+			hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf);
 	}
 }
@@ -3421,7 +3427,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
 	remove_hugetlb_page_for_demote(h, page, false);
 	spin_unlock_irq(&hugetlb_lock);

-	rc = hugetlb_vmemmap_alloc(h, page);
+	rc = hugetlb_vmemmap_restore(h, page);
 	if (rc) {
 		/* Allocation of vmemmap failed, we cannot demote the page */
 		spin_lock_irq(&hugetlb_lock);
@@ -4111,7 +4117,6 @@ void __init hugetlb_add_hstate(unsigned int order)
 	h->next_nid_to_free = first_memory_node;
 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
 					huge_page_size(h)/1024);
-	hugetlb_vmemmap_init(h);

 	parsed_hstate = h;
 }
@@ -6985,10 +6990,38 @@ struct page * __weak
 follow_huge_pud(struct mm_struct *mm, unsigned long address,
 		pud_t *pud, int flags)
 {
-	if (flags & (FOLL_GET | FOLL_PIN))
+	struct page *page = NULL;
+	spinlock_t *ptl;
+	pte_t pte;
+
+	if (WARN_ON_ONCE(flags & FOLL_PIN))
 		return NULL;

-	return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
+retry:
+	ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud);
+	if (!pud_huge(*pud))
+		goto out;
+	pte = huge_ptep_get((pte_t *)pud);
+	if (pte_present(pte)) {
+		page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
+		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+			page = NULL;
+			goto out;
+		}
+	} else {
+		if (is_hugetlb_entry_migration(pte)) {
+			spin_unlock(ptl);
+			__migration_entry_wait(mm, (pte_t *)pud, ptl);
+			goto retry;
+		}
+		/*
+		 * hwpoisoned entry is treated as no_page_table in
+		 * follow_page_mask().
+		 */
+	}
+out:
+	spin_unlock(ptl);
+	return page;
 }

 struct page * __weak
@@ -1,8 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Optimize vmemmap pages associated with HugeTLB
+ * HugeTLB Vmemmap Optimization (HVO)
  *
- * Copyright (c) 2020, Bytedance. All rights reserved.
+ * Copyright (c) 2020, ByteDance. All rights reserved.
  *
  * Author: Muchun Song <songmuchun@bytedance.com>
  *
@@ -10,84 +10,443 @@
|
|||||||
*/
|
*/
|
||||||
#define pr_fmt(fmt) "HugeTLB: " fmt
|
#define pr_fmt(fmt) "HugeTLB: " fmt
|
||||||
|
|
||||||
#include <linux/memory.h>
|
#include <linux/pgtable.h>
|
||||||
|
#include <linux/bootmem_info.h>
|
||||||
|
#include <asm/pgalloc.h>
|
||||||
|
#include <asm/tlbflush.h>
|
||||||
#include "hugetlb_vmemmap.h"
|
#include "hugetlb_vmemmap.h"
|
||||||
|
|
||||||
/*
|
/**
|
||||||
* There are a lot of struct page structures associated with each HugeTLB page.
|
* struct vmemmap_remap_walk - walk vmemmap page table
|
||||||
* For tail pages, the value of compound_head is the same. So we can reuse first
|
*
|
||||||
* page of head page structures. We map the virtual addresses of all the pages
|
* @remap_pte: called for each lowest-level entry (PTE).
|
||||||
* of tail page structures to the head page struct, and then free these page
|
* @nr_walked: the number of walked pte.
|
||||||
* frames. Therefore, we need to reserve one pages as vmemmap areas.
|
* @reuse_page: the page which is reused for the tail vmemmap pages.
|
||||||
|
* @reuse_addr: the virtual address of the @reuse_page page.
|
||||||
|
* @vmemmap_pages: the list head of the vmemmap pages that can be freed
|
||||||
|
* or is mapped from.
|
||||||
*/
|
*/
|
||||||
#define RESERVE_VMEMMAP_NR 1U
|
struct vmemmap_remap_walk {
|
||||||
#define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT)
|
void (*remap_pte)(pte_t *pte, unsigned long addr,
|
||||||
|
struct vmemmap_remap_walk *walk);
|
||||||
enum vmemmap_optimize_mode {
|
unsigned long nr_walked;
|
||||||
VMEMMAP_OPTIMIZE_OFF,
|
struct page *reuse_page;
|
||||||
VMEMMAP_OPTIMIZE_ON,
|
unsigned long reuse_addr;
|
||||||
|
struct list_head *vmemmap_pages;
|
||||||
};
|
};
|
||||||
|
|
||||||
DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
|
static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
|
||||||
hugetlb_optimize_vmemmap_key);
|
|
||||||
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
|
|
||||||
|
|
||||||
static enum vmemmap_optimize_mode vmemmap_optimize_mode =
|
|
||||||
IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
|
|
||||||
|
|
||||||
static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to)
|
|
||||||
{
|
{
|
||||||
if (vmemmap_optimize_mode == to)
|
pmd_t __pmd;
|
||||||
return;
|
int i;
|
||||||
|
unsigned long addr = start;
|
||||||
|
struct page *page = pmd_page(*pmd);
|
||||||
|
pte_t *pgtable = pte_alloc_one_kernel(&init_mm);
|
||||||
|
|
||||||
if (to == VMEMMAP_OPTIMIZE_OFF)
|
if (!pgtable)
|
||||||
static_branch_dec(&hugetlb_optimize_vmemmap_key);
|
return -ENOMEM;
|
||||||
else
|
|
||||||
static_branch_inc(&hugetlb_optimize_vmemmap_key);
|
|
||||||
WRITE_ONCE(vmemmap_optimize_mode, to);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int __init hugetlb_vmemmap_early_param(char *buf)
|
pmd_populate_kernel(&init_mm, &__pmd, pgtable);
|
||||||
{
|
|
||||||
bool enable;
|
|
||||||
enum vmemmap_optimize_mode mode;
|
|
||||||
|
|
||||||
if (kstrtobool(buf, &enable))
|
for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
|
||||||
return -EINVAL;
|
pte_t entry, *pte;
|
||||||
|
pgprot_t pgprot = PAGE_KERNEL;
|
||||||
|
|
||||||
mode = enable ? VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF;
|
entry = mk_pte(page + i, pgprot);
|
||||||
vmemmap_optimize_mode_switch(mode);
|
pte = pte_offset_kernel(&__pmd, addr);
|
||||||
|
set_pte_at(&init_mm, addr, pte, entry);
|
||||||
|
}
|
||||||
|
|
||||||
|
spin_lock(&init_mm.page_table_lock);
|
||||||
|
if (likely(pmd_leaf(*pmd))) {
|
||||||
|
/*
|
||||||
|
* Higher order allocations from buddy allocator must be able to
|
||||||
|
* be treated as independent small pages (as they can be freed
|
||||||
|
* individually).
|
||||||
|
*/
|
||||||
|
if (!PageReserved(page))
|
||||||
|
split_page(page, get_order(PMD_SIZE));
|
||||||
|
|
||||||
|
/* Make pte visible before pmd. See comment in pmd_install(). */
|
||||||
|
smp_wmb();
|
||||||
|
pmd_populate_kernel(&init_mm, pmd, pgtable);
|
||||||
|
flush_tlb_kernel_range(start, start + PMD_SIZE);
|
||||||
|
} else {
|
||||||
|
pte_free_kernel(&init_mm, pgtable);
|
||||||
|
}
|
||||||
|
spin_unlock(&init_mm.page_table_lock);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
|
||||||
|
{
|
||||||
|
int leaf;
|
||||||
|
|
||||||
|
spin_lock(&init_mm.page_table_lock);
|
||||||
|
leaf = pmd_leaf(*pmd);
|
||||||
|
spin_unlock(&init_mm.page_table_lock);
|
||||||
|
|
||||||
|
if (!leaf)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
return __split_vmemmap_huge_pmd(pmd, start);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
|
||||||
|
unsigned long end,
|
||||||
|
struct vmemmap_remap_walk *walk)
|
||||||
|
{
|
||||||
|
pte_t *pte = pte_offset_kernel(pmd, addr);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The reuse_page is found 'first' in table walk before we start
|
||||||
|
* remapping (which is calling @walk->remap_pte).
|
||||||
|
*/
|
||||||
|
if (!walk->reuse_page) {
|
||||||
|
walk->reuse_page = pte_page(*pte);
|
||||||
|
/*
|
||||||
|
* Because the reuse address is part of the range that we are
|
||||||
|
* walking, skip the reuse address range.
|
||||||
|
*/
|
||||||
|
addr += PAGE_SIZE;
|
||||||
|
pte++;
|
||||||
|
walk->nr_walked++;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (; addr != end; addr += PAGE_SIZE, pte++) {
|
||||||
|
walk->remap_pte(pte, addr, walk);
|
||||||
|
walk->nr_walked++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
|
||||||
|
unsigned long end,
|
||||||
|
struct vmemmap_remap_walk *walk)
|
||||||
|
{
|
||||||
|
pmd_t *pmd;
|
||||||
|
unsigned long next;
|
||||||
|
|
||||||
|
pmd = pmd_offset(pud, addr);
|
||||||
|
do {
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
|
||||||
|
if (ret)
|
||||||
|
return ret;
|
||||||
|
|
||||||
|
next = pmd_addr_end(addr, end);
|
||||||
|
vmemmap_pte_range(pmd, addr, next, walk);
|
||||||
|
} while (pmd++, addr = next, addr != end);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
|
||||||
|
unsigned long end,
|
||||||
|
struct vmemmap_remap_walk *walk)
|
||||||
|
{
|
||||||
|
pud_t *pud;
|
||||||
|
unsigned long next;
|
||||||
|
|
||||||
|
pud = pud_offset(p4d, addr);
|
||||||
|
do {
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
next = pud_addr_end(addr, end);
|
||||||
|
ret = vmemmap_pmd_range(pud, addr, next, walk);
|
||||||
|
if (ret)
|
||||||
|
return ret;
|
||||||
|
} while (pud++, addr = next, addr != end);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
|
||||||
|
unsigned long end,
|
||||||
|
struct vmemmap_remap_walk *walk)
|
||||||
|
{
|
||||||
|
p4d_t *p4d;
|
||||||
|
unsigned long next;
|
||||||
|
|
||||||
|
p4d = p4d_offset(pgd, addr);
|
||||||
|
do {
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
next = p4d_addr_end(addr, end);
|
||||||
|
ret = vmemmap_pud_range(p4d, addr, next, walk);
|
||||||
|
if (ret)
|
||||||
|
return ret;
|
||||||
|
} while (p4d++, addr = next, addr != end);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int vmemmap_remap_range(unsigned long start, unsigned long end,
|
||||||
|
struct vmemmap_remap_walk *walk)
|
||||||
|
{
|
||||||
|
unsigned long addr = start;
|
||||||
|
unsigned long next;
|
||||||
|
pgd_t *pgd;
|
||||||
|
|
||||||
|
VM_BUG_ON(!PAGE_ALIGNED(start));
|
||||||
|
VM_BUG_ON(!PAGE_ALIGNED(end));
|
||||||
|
|
||||||
|
pgd = pgd_offset_k(addr);
|
||||||
|
do {
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
next = pgd_addr_end(addr, end);
|
||||||
|
ret = vmemmap_p4d_range(pgd, addr, next, walk);
|
||||||
|
if (ret)
|
||||||
|
return ret;
|
||||||
|
} while (pgd++, addr = next, addr != end);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We only change the mapping of the vmemmap virtual address range
|
||||||
|
* [@start + PAGE_SIZE, end), so we only need to flush the TLB which
|
||||||
|
* belongs to the range.
|
||||||
|
*/
|
||||||
|
flush_tlb_kernel_range(start + PAGE_SIZE, end);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Previously discarded vmemmap pages will be allocated and remapping
|
* Free a vmemmap page. A vmemmap page can be allocated from the memblock
|
||||||
* after this function returns zero.
|
* allocator or buddy allocator. If the PG_reserved flag is set, it means
|
||||||
|
* that it allocated from the memblock allocator, just free it via the
|
||||||
|
* free_bootmem_page(). Otherwise, use __free_page().
|
||||||
*/
|
*/
|
||||||
int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
|
static inline void free_vmemmap_page(struct page *page)
|
||||||
|
{
|
||||||
|
if (PageReserved(page))
|
||||||
|
free_bootmem_page(page);
|
||||||
|
else
|
||||||
|
__free_page(page);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Free a list of the vmemmap pages */
|
||||||
|
static void free_vmemmap_page_list(struct list_head *list)
|
||||||
|
{
|
||||||
|
struct page *page, *next;
|
||||||
|
|
||||||
|
list_for_each_entry_safe(page, next, list, lru) {
|
||||||
|
list_del(&page->lru);
|
||||||
|
free_vmemmap_page(page);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
|
||||||
|
struct vmemmap_remap_walk *walk)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Remap the tail pages as read-only to catch illegal write operation
|
||||||
|
* to the tail pages.
|
||||||
|
*/
|
||||||
|
pgprot_t pgprot = PAGE_KERNEL_RO;
|
||||||
|
pte_t entry = mk_pte(walk->reuse_page, pgprot);
|
||||||
|
struct page *page = pte_page(*pte);
|
||||||
|
|
||||||
|
list_add_tail(&page->lru, walk->vmemmap_pages);
|
||||||
|
set_pte_at(&init_mm, addr, pte, entry);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* How many struct page structs need to be reset. When we reuse the head
|
||||||
|
* struct page, the special metadata (e.g. page->flags or page->mapping)
|
||||||
|
* cannot be copied to the tail struct page structs. The invalid value will be
|
||||||
|
* checked in the free_tail_pages_check(). In order to avoid the message
|
||||||
|
* of "corrupted mapping in tail page". We need to reset at least 3 (one
|
||||||
|
* head struct page struct and two tail struct page structs) struct page
|
||||||
|
* structs.
|
||||||
|
*/
|
||||||
|
#define NR_RESET_STRUCT_PAGE 3
|
||||||
|
|
||||||
|
static inline void reset_struct_pages(struct page *start)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
struct page *from = start + NR_RESET_STRUCT_PAGE;
|
||||||
|
|
||||||
|
for (i = 0; i < NR_RESET_STRUCT_PAGE; i++)
|
||||||
|
memcpy(start + i, from, sizeof(*from));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
|
||||||
|
struct vmemmap_remap_walk *walk)
|
||||||
|
{
|
||||||
|
pgprot_t pgprot = PAGE_KERNEL;
|
||||||
|
struct page *page;
|
||||||
|
void *to;
|
||||||
|
|
||||||
|
BUG_ON(pte_page(*pte) != walk->reuse_page);
|
||||||
|
|
||||||
|
page = list_first_entry(walk->vmemmap_pages, struct page, lru);
|
||||||
|
list_del(&page->lru);
|
||||||
|
to = page_to_virt(page);
|
||||||
|
copy_page(to, (void *)walk->reuse_addr);
|
||||||
|
reset_struct_pages(to);
|
||||||
|
|
||||||
|
set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
|
||||||
|
* to the page which @reuse is mapped to, then free vmemmap
|
||||||
|
* which the range are mapped to.
|
||||||
|
* @start: start address of the vmemmap virtual address range that we want
|
||||||
|
* to remap.
|
||||||
|
* @end: end address of the vmemmap virtual address range that we want to
|
||||||
|
* remap.
|
||||||
|
* @reuse: reuse address.
|
||||||
|
*
|
||||||
|
* Return: %0 on success, negative error code otherwise.
|
||||||
|
*/
|
||||||
|
static int vmemmap_remap_free(unsigned long start, unsigned long end,
|
||||||
|
unsigned long reuse)
|
||||||
{
|
{
|
||||||
int ret;
|
int ret;
|
||||||
unsigned long vmemmap_addr = (unsigned long)head;
|
LIST_HEAD(vmemmap_pages);
|
||||||
unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
|
struct vmemmap_remap_walk walk = {
|
||||||
|
.remap_pte = vmemmap_remap_pte,
|
||||||
|
.reuse_addr = reuse,
|
||||||
|
.vmemmap_pages = &vmemmap_pages,
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* In order to make remapping routine most efficient for the huge pages,
|
||||||
|
* the routine of vmemmap page table walking has the following rules
|
||||||
|
* (see more details from the vmemmap_pte_range()):
|
||||||
|
*
|
||||||
|
* - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
|
||||||
|
* should be continuous.
|
||||||
|
* - The @reuse address is part of the range [@reuse, @end) that we are
|
||||||
|
* walking which is passed to vmemmap_remap_range().
|
||||||
|
* - The @reuse address is the first in the complete range.
|
||||||
|
*
|
||||||
|
* So we need to make sure that @start and @reuse meet the above rules.
|
||||||
|
*/
|
||||||
|
BUG_ON(start - reuse != PAGE_SIZE);
|
||||||
|
|
||||||
|
mmap_read_lock(&init_mm);
|
||||||
|
ret = vmemmap_remap_range(reuse, end, &walk);
|
||||||
|
if (ret && walk.nr_walked) {
|
||||||
|
end = reuse + walk.nr_walked * PAGE_SIZE;
|
||||||
|
/*
|
||||||
|
* vmemmap_pages contains pages from the previous
|
||||||
|
* vmemmap_remap_range call which failed. These
|
||||||
|
* are pages which were removed from the vmemmap.
|
||||||
|
* They will be restored in the following call.
|
||||||
|
*/
|
||||||
|
walk = (struct vmemmap_remap_walk) {
|
||||||
|
.remap_pte = vmemmap_restore_pte,
|
||||||
|
.reuse_addr = reuse,
|
||||||
|
.vmemmap_pages = &vmemmap_pages,
|
||||||
|
};
|
||||||
|
|
||||||
|
vmemmap_remap_range(reuse, end, &walk);
|
||||||
|
}
|
||||||
|
mmap_read_unlock(&init_mm);
|
||||||
|
|
||||||
|
free_vmemmap_page_list(&vmemmap_pages);
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
|
||||||
|
gfp_t gfp_mask, struct list_head *list)
|
||||||
|
{
|
||||||
|
unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
|
||||||
|
int nid = page_to_nid((struct page *)start);
|
||||||
|
struct page *page, *next;
|
||||||
|
|
||||||
|
while (nr_pages--) {
|
||||||
|
page = alloc_pages_node(nid, gfp_mask, 0);
|
||||||
|
if (!page)
|
||||||
|
goto out;
|
||||||
|
list_add_tail(&page->lru, list);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
out:
|
||||||
|
list_for_each_entry_safe(page, next, list, lru)
|
||||||
|
__free_pages(page, 0);
|
||||||
|
return -ENOMEM;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
|
||||||
|
* to the page which is from the @vmemmap_pages
|
||||||
|
* respectively.
|
||||||
|
* @start: start address of the vmemmap virtual address range that we want
|
||||||
|
* to remap.
|
||||||
|
* @end: end address of the vmemmap virtual address range that we want to
|
||||||
|
* remap.
|
||||||
|
* @reuse: reuse address.
|
||||||
|
* @gfp_mask: GFP flag for allocating vmemmap pages.
|
||||||
|
*
|
||||||
|
* Return: %0 on success, negative error code otherwise.
|
||||||
|
*/
|
||||||
|
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
|
||||||
|
unsigned long reuse, gfp_t gfp_mask)
|
||||||
|
{
|
||||||
|
LIST_HEAD(vmemmap_pages);
|
||||||
|
struct vmemmap_remap_walk walk = {
|
||||||
|
.remap_pte = vmemmap_restore_pte,
|
||||||
|
.reuse_addr = reuse,
|
||||||
|
.vmemmap_pages = &vmemmap_pages,
|
||||||
|
};
|
||||||
|
|
||||||
|
/* See the comment in the vmemmap_remap_free(). */
|
||||||
|
BUG_ON(start - reuse != PAGE_SIZE);
|
||||||
|
|
||||||
|
if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
|
||||||
|
return -ENOMEM;
|
||||||
|
|
||||||
|
mmap_read_lock(&init_mm);
|
||||||
|
vmemmap_remap_range(reuse, end, &walk);
|
||||||
|
mmap_read_unlock(&init_mm);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
|
||||||
|
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
|
||||||
|
|
||||||
|
static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
|
||||||
|
core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
|
||||||
|
|
||||||
|
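/*
 * Illustrative sketch (not part of the kernel sources): a rough estimate of
 * the memory HVO gives back per HugeTLB page once vmemmap_remap_free() has
 * collapsed the struct page array onto the single reserved vmemmap page.
 * The 4 KiB base page size and 64-byte struct page below are assumptions
 * for the example; the real values are architecture and config dependent.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long page_size = 4096;		/* assumed base page size */
	const unsigned long struct_page_size = 64;	/* assumed sizeof(struct page) */
	const unsigned long hpage_sizes[] = { 2UL << 20, 1UL << 30 };	/* 2 MiB, 1 GiB */

	for (int i = 0; i < 2; i++) {
		unsigned long nr_struct_pages = hpage_sizes[i] / page_size;
		unsigned long vmemmap_size = nr_struct_pages * struct_page_size;
		unsigned long vmemmap_pages = vmemmap_size / page_size;
		/* HVO keeps one vmemmap page (the reserve) and frees the rest. */
		unsigned long freed = vmemmap_pages ? vmemmap_pages - 1 : 0;

		printf("hugepage %lu KiB: %lu vmemmap pages, %lu freed (%lu KiB saved)\n",
		       hpage_sizes[i] >> 10, vmemmap_pages, freed,
		       (freed * page_size) >> 10);
	}
	return 0;
}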
/**
 * hugetlb_vmemmap_restore - restore previously optimized (by
 *			     hugetlb_vmemmap_optimize()) vmemmap pages which
 *			     will be reallocated and remapped.
 * @h:		struct hstate.
 * @head:	the head page whose vmemmap pages will be restored.
 *
 * Return: %0 if @head's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
{
	int ret;
	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
	unsigned long vmemmap_reuse;

	if (!HPageVmemmapOptimized(head))
		return 0;

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * The pages which the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
	 * When a HugeTLB page is freed to the buddy allocator, previously
	 * discarded vmemmap pages must be allocated and remapping.
	 */
	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse,
				  GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
	if (!ret) {
		ClearHPageVmemmapOptimized(head);
@@ -97,11 +456,14 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
	return ret;
}
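/*
 * Sketch only: a simplified view of the call order a caller is expected to
 * follow, i.e. restore the vmemmap before handing the page back to the buddy
 * allocator. free_one_hugetlb_page() is a placeholder for this example, not
 * the real hugetlb.c free path.
 */
static int free_one_hugetlb_page(struct hstate *h, struct page *head)
{
	/*
	 * The discarded vmemmap pages must exist again before the buddy
	 * allocator touches the tail struct pages.
	 */
	int ret = hugetlb_vmemmap_restore(h, head);

	if (ret)
		return ret;	/* e.g. -ENOMEM: keep it as a HugeTLB page */

	__free_pages(head, huge_page_order(h));
	return 0;
}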
/* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
{
	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(h))
		return false;

	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
		pmd_t *pmdp, pmd;

@@ -144,118 +506,73 @@ static unsigned int vmemmap_optimizable_pages(struct hstate *h,
		 * +-------------------------------------------+
		 */
		if (PageVmemmapSelfHosted(vmemmap_page))
			return false;
	}

	return true;
}
/**
 * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages.
 * @h:		struct hstate.
 * @head:	the head page whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @head's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages
 * have been optimized.
 */
void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
{
	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
	unsigned long vmemmap_reuse;

	if (!vmemmap_should_optimize(h, head))
		return;

	static_branch_inc(&hugetlb_optimize_vmemmap_key);

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
	 * to the page which @vmemmap_reuse is mapped to, then free the pages
	 * which the range [@vmemmap_start, @vmemmap_end] is mapped to.
	 */
	if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse))
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	else
		SetHPageVmemmapOptimized(head);
}
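/*
 * Worked example (illustrative only): for a 2 MiB HugeTLB page, assuming
 * 4 KiB base pages and sizeof(struct page) == 64, the struct page array of
 * the huge page spans 8 vmemmap pages and the window handed to
 * vmemmap_remap_free() looks like this:
 *
 *	vmemmap_start = (unsigned long)head;
 *	vmemmap_end   = vmemmap_start + 8 * PAGE_SIZE;	  // hugetlb_vmemmap_size(h)
 *	vmemmap_reuse = vmemmap_start;			  // page 0 is kept
 *	vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;	  // remap/free pages 1..7
 *
 * After a successful remap, vmemmap pages 1..7 are all backed by the single
 * reused page (mapped read-only for the tails) and the 7 physical pages go
 * back to the allocator.
 */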
static struct ctl_table hugetlb_vmemmap_sysctls[] = {
	{
		.procname	= "hugetlb_optimize_vmemmap",
		.data		= &vmemmap_optimize_enabled,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dobool,
	},
	{ }
};

static int __init hugetlb_vmemmap_init(void)
{
	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
	BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE);

	if (IS_ENABLED(CONFIG_PROC_SYSCTL)) {
		const struct hstate *h;

		for_each_hstate(h) {
			if (hugetlb_vmemmap_optimizable(h)) {
				register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
				break;
			}
		}
	}
	return 0;
}
late_initcall(hugetlb_vmemmap_init);
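/*
 * Illustrative userspace sketch (not kernel code): toggling HVO for HugeTLB
 * pages allocated after this point through the sysctl registered above. The
 * file only exists when the kernel was built with
 * CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP and at least one hstate is optimizable.
 */
#include <stdio.h>

static int set_hvo(int enable)
{
	FILE *f = fopen("/proc/sys/vm/hugetlb_optimize_vmemmap", "w");

	if (!f)
		return -1;	/* kernel without HVO support, or no permission */
	fprintf(f, "%d\n", enable);
	return fclose(f);
}

int main(void)
{
	return set_hvo(1) ? 1 : 0;
}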
@@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 *     Author: Muchun Song <songmuchun@bytedance.com>
 */
@@ -11,35 +11,50 @@
#include <linux/hugetlb.h>

#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head);
void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head);

/*
 * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See
 * Documentation/vm/vmemmap_dedup.rst.
 */
#define HUGETLB_VMEMMAP_RESERVE_SIZE	PAGE_SIZE

static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h)
{
	return pages_per_huge_page(h) * sizeof(struct page);
}

/*
 * Return how many vmemmap size associated with a HugeTLB page that can be
 * optimized and can be freed to the buddy allocator.
 */
static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
{
	int size = hugetlb_vmemmap_size(h) - HUGETLB_VMEMMAP_RESERVE_SIZE;

	if (!is_power_of_2(sizeof(struct page)))
		return 0;

	return size > 0 ? size : 0;
}
#else
static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
{
	return 0;
}

static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
{
}

static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
{
	return 0;
}
#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */

static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h)
{
	return hugetlb_vmemmap_optimizable_size(h) != 0;
}
#endif /* _LINUX_HUGETLB_VMEMMAP_H */
@@ -74,7 +74,13 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);

static bool hw_memory_failure __read_mostly = false;

/*
 * Return values:
 *   1:   the page is dissolved (if needed) and taken off from buddy,
 *   0:   the page is dissolved (if needed) and not taken off from buddy,
 *   < 0: failed to dissolve.
 */
static int __page_handle_poison(struct page *page)
{
	int ret;

@@ -84,7 +90,7 @@ static bool __page_handle_poison(struct page *page)
	ret = take_page_off_buddy(page);
	zone_pcp_enable(page_zone(page));

	return ret;
}

static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
@@ -94,7 +100,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
	 * Doing this check for free pages is also fine since dissolve_free_huge_page
	 * returns 0 for non-hugetlb pages as well.
	 */
	if (__page_handle_poison(page) <= 0)
		/*
		 * We could fail to take off the target page from buddy
		 * for example due to racy page allocation, but that's
@@ -762,7 +768,6 @@ static const char * const action_page_types[] = {
	[MF_MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
	[MF_MSG_HUGE]			= "huge page",
	[MF_MSG_FREE_HUGE]		= "free huge page",
	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
@@ -1078,7 +1083,6 @@ static int me_huge_page(struct page_state *ps, struct page *p)
		res = truncate_error_page(hpage, page_to_pfn(p), mapping);
		unlock_page(hpage);
	} else {
		unlock_page(hpage);
		/*
		 * migration entry prevents later access on error hugepage,
@@ -1086,9 +1090,11 @@ static int me_huge_page(struct page_state *ps, struct page *p)
		 * subpages.
		 */
		put_page(hpage);
		if (__page_handle_poison(p) >= 0) {
			page_ref_inc(p);
			res = MF_RECOVERED;
		} else {
			res = MF_FAILED;
		}
	}

@@ -1662,6 +1668,113 @@ unlock:
EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
#endif /* CONFIG_FS_DAX */

#ifdef CONFIG_HUGETLB_PAGE
/*
 * Struct raw_hwp_page represents information about "raw error page",
 * constructing singly linked list originated from ->private field of
 * SUBPAGE_INDEX_HWPOISON-th tail page.
 */
struct raw_hwp_page {
	struct llist_node node;
	struct page *page;
};

static inline struct llist_head *raw_hwp_list_head(struct page *hpage)
{
	return (struct llist_head *)&page_private(hpage + SUBPAGE_INDEX_HWPOISON);
}

static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag)
{
	struct llist_head *head;
	struct llist_node *t, *tnode;
	unsigned long count = 0;

	head = raw_hwp_list_head(hpage);
	llist_for_each_safe(tnode, t, head->first) {
		struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);

		if (move_flag)
			SetPageHWPoison(p->page);
		kfree(p);
		count++;
	}
	llist_del_all(head);
	return count;
}

static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page)
{
	struct llist_head *head;
	struct raw_hwp_page *raw_hwp;
	struct llist_node *t, *tnode;
	int ret = TestSetPageHWPoison(hpage) ? -EHWPOISON : 0;

	/*
	 * Once the hwpoison hugepage has lost reliable raw error info,
	 * there is little meaning to keep additional error info precisely,
	 * so skip to add additional raw error info.
	 */
	if (HPageRawHwpUnreliable(hpage))
		return -EHWPOISON;
	head = raw_hwp_list_head(hpage);
	llist_for_each_safe(tnode, t, head->first) {
		struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);

		if (p->page == page)
			return -EHWPOISON;
	}

	raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
	if (raw_hwp) {
		raw_hwp->page = page;
		llist_add(&raw_hwp->node, head);
		/* the first error event will be counted in action_result(). */
		if (ret)
			num_poisoned_pages_inc();
	} else {
		/*
		 * Failed to save raw error info.  We no longer trace all
		 * hwpoisoned subpages, and we need refuse to free/dissolve
		 * this hwpoisoned hugepage.
		 */
		SetHPageRawHwpUnreliable(hpage);
		/*
		 * Once HPageRawHwpUnreliable is set, raw_hwp_page is not
		 * used any more, so free it.
		 */
		__free_raw_hwp_pages(hpage, false);
	}
	return ret;
}

static unsigned long free_raw_hwp_pages(struct page *hpage, bool move_flag)
{
	/*
	 * HPageVmemmapOptimized hugepages can't be freed because struct
	 * pages for tail pages are required but they don't exist.
	 */
	if (move_flag && HPageVmemmapOptimized(hpage))
		return 0;

	/*
	 * HPageRawHwpUnreliable hugepages shouldn't be unpoisoned by
	 * definition.
	 */
	if (HPageRawHwpUnreliable(hpage))
		return 0;

	return __free_raw_hwp_pages(hpage, move_flag);
}

void hugetlb_clear_page_hwpoison(struct page *hpage)
{
	if (HPageRawHwpUnreliable(hpage))
		return;
	ClearPageHWPoison(hpage);
	free_raw_hwp_pages(hpage, true);
}

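/*
 * Minimal sketch of the llist pattern used above for raw error pages,
 * reduced to the generic API so the flow is easier to follow. The node type
 * and the demo_* names are made up for this example; only llist_add(),
 * llist_for_each_safe(), llist_del_all() and container_of() are the real
 * kernel primitives.
 */
#include <linux/llist.h>
#include <linux/slab.h>

struct demo_node {
	struct llist_node node;
	unsigned long pfn;
};

static void demo_record(struct llist_head *head, unsigned long pfn)
{
	struct demo_node *n = kmalloc(sizeof(*n), GFP_ATOMIC);

	if (!n)
		return;
	n->pfn = pfn;
	llist_add(&n->node, head);	/* lockless push to the front */
}

static unsigned long demo_drain(struct llist_head *head)
{
	struct llist_node *pos, *t;
	unsigned long count = 0;

	llist_for_each_safe(pos, t, head->first) {
		struct demo_node *n = container_of(pos, struct demo_node, node);

		kfree(n);
		count++;
	}
	llist_del_all(head);
	return count;
}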
/*
 * Called from hugetlb code with hugetlb_lock held.
 *
@@ -1693,10 +1806,11 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
		count_increased = true;
	} else {
		ret = -EBUSY;
		if (!(flags & MF_NO_RETRY))
			goto out;
	}

	if (hugetlb_set_page_hwpoison(head, page)) {
		ret = -EHWPOISON;
		goto out;
	}
@@ -1708,7 +1822,6 @@ out:
	return ret;
}

/*
 * Taking refcount of hugetlb pages needs extra care about race conditions
 * with basic operations like hugepage allocation/free/demotion.
@@ -1721,7 +1834,6 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
	struct page *p = pfn_to_page(pfn);
	struct page *head;
	unsigned long page_flags;

	*hugetlb = 1;
retry:
@@ -1737,8 +1849,8 @@ retry:
		}
		return res;
	} else if (res == -EBUSY) {
		if (!(flags & MF_NO_RETRY)) {
			flags |= MF_NO_RETRY;
			goto retry;
		}
		action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
@@ -1749,7 +1861,7 @@ retry:
	lock_page(head);

	if (hwpoison_filter(p)) {
		hugetlb_clear_page_hwpoison(head);
		res = -EOPNOTSUPP;
		goto out;
	}
@@ -1760,10 +1872,11 @@ retry:
	 */
	if (res == 0) {
		unlock_page(head);
		if (__page_handle_poison(p) >= 0) {
			page_ref_inc(p);
			res = MF_RECOVERED;
		} else {
			res = MF_FAILED;
		}
		action_result(pfn, MF_MSG_FREE_HUGE, res);
		return res == MF_RECOVERED ? 0 : -EBUSY;
@@ -1771,21 +1884,6 @@ retry:

	page_flags = head->flags;

	if (!hwpoison_user_mappings(p, pfn, flags, head)) {
		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
		res = -EBUSY;
@@ -1804,6 +1902,10 @@ static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *
	return 0;
}

static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag)
{
	return 0;
}
#endif /* CONFIG_HUGETLB_PAGE */

static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
@@ -2209,6 +2311,7 @@ int unpoison_memory(unsigned long pfn)
	struct page *p;
	int ret = -EBUSY;
	int freeit = 0;
	unsigned long count = 1;
	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
					DEFAULT_RATELIMIT_BURST);

@@ -2256,6 +2359,13 @@ int unpoison_memory(unsigned long pfn)

	ret = get_hwpoison_page(p, MF_UNPOISON);
	if (!ret) {
		if (PageHuge(p)) {
			count = free_raw_hwp_pages(page, false);
			if (count == 0) {
				ret = -EBUSY;
				goto unlock_mutex;
			}
		}
		ret = TestClearPageHWPoison(page) ? 0 : -EBUSY;
	} else if (ret < 0) {
		if (ret == -EHWPOISON) {
@@ -2264,6 +2374,13 @@ int unpoison_memory(unsigned long pfn)
			unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
					 pfn, &unpoison_rs);
	} else {
		if (PageHuge(p)) {
			count = free_raw_hwp_pages(page, false);
			if (count == 0) {
				ret = -EBUSY;
				goto unlock_mutex;
			}
		}
		freeit = !!TestClearPageHWPoison(p);

		put_page(page);
@@ -2276,7 +2393,7 @@ int unpoison_memory(unsigned long pfn)
unlock_mutex:
	mutex_unlock(&mf_mutex);
	if (!ret || freeit) {
		num_poisoned_pages_sub(count);
		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
				 page_to_pfn(p), &unpoison_rs);
	}
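/*
 * Sketch of the retry-once idiom introduced above with MF_NO_RETRY, shown as
 * a generic pattern. try_once() and its -EBUSY condition are placeholders
 * invented for this example; only the flag-based loop structure mirrors the
 * memory-failure code.
 */
static int try_once(unsigned long pfn, int flags);	/* placeholder */

static int demo_with_retry(unsigned long pfn, int flags)
{
	int res;

retry:
	res = try_once(pfn, flags);
	if (res == -EBUSY && !(flags & MF_NO_RETRY)) {
		/* remember that we already retried so the second failure is final */
		flags |= MF_NO_RETRY;
		goto retry;
	}
	return res;
}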
@@ -27,408 +27,9 @@
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>

#include <asm/dma.h>
#include <asm/pgalloc.h>

/*
 * Allocate a block of memory to be used to back the virtual memory map