Merge branch 'for-5.7/appleir' into for-linus

- small code cleanups in hid-appleir from Lucas Tanure
This commit is contained in:
Jiri Kosina
2020-04-01 12:26:12 +02:00
10698 changed files with 530460 additions and 226431 deletions

4
.gitignore vendored
View File

@@ -100,6 +100,10 @@ modules.order
/include/ksym/ /include/ksym/
/arch/*/include/generated/ /arch/*/include/generated/
# Generated lkdtm tests
/tools/testing/selftests/lkdtm/*.sh
!/tools/testing/selftests/lkdtm/run.sh
# stgit generated dirs # stgit generated dirs
patches-* patches-*

View File

@@ -18,6 +18,7 @@ Aleksey Gorelov <aleksey_gorelov@phoenix.com>
Aleksandar Markovic <aleksandar.markovic@mips.com> <aleksandar.markovic@imgtec.com> Aleksandar Markovic <aleksandar.markovic@mips.com> <aleksandar.markovic@imgtec.com>
Alex Shi <alex.shi@linux.alibaba.com> <alex.shi@intel.com> Alex Shi <alex.shi@linux.alibaba.com> <alex.shi@intel.com>
Alex Shi <alex.shi@linux.alibaba.com> <alex.shi@linaro.org> Alex Shi <alex.shi@linux.alibaba.com> <alex.shi@linaro.org>
Alexandre Belloni <alexandre.belloni@bootlin.com> <alexandre.belloni@free-electrons.com>
Alexei Starovoitov <ast@kernel.org> <ast@plumgrid.com> Alexei Starovoitov <ast@kernel.org> <ast@plumgrid.com>
Alexei Starovoitov <ast@kernel.org> <alexei.starovoitov@gmail.com> Alexei Starovoitov <ast@kernel.org> <alexei.starovoitov@gmail.com>
Alexei Starovoitov <ast@kernel.org> <ast@fb.com> Alexei Starovoitov <ast@kernel.org> <ast@fb.com>
@@ -27,6 +28,8 @@ Andi Shyti <andi@etezian.org> <andi.shyti@samsung.com>
Andreas Herrmann <aherrman@de.ibm.com> Andreas Herrmann <aherrman@de.ibm.com>
Andrey Ryabinin <ryabinin.a.a@gmail.com> <a.ryabinin@samsung.com> Andrey Ryabinin <ryabinin.a.a@gmail.com> <a.ryabinin@samsung.com>
Andrew Morton <akpm@linux-foundation.org> Andrew Morton <akpm@linux-foundation.org>
Andrew Murray <amurray@thegoodpenguin.co.uk> <andrew.murray@arm.com>
Andrew Murray <amurray@thegoodpenguin.co.uk> <amurray@embedded-bits.co.uk>
Andrew Vasquez <andrew.vasquez@qlogic.com> Andrew Vasquez <andrew.vasquez@qlogic.com>
Andy Adamson <andros@citi.umich.edu> Andy Adamson <andros@citi.umich.edu>
Antoine Tenart <antoine.tenart@free-electrons.com> Antoine Tenart <antoine.tenart@free-electrons.com>
@@ -74,6 +77,7 @@ Dmitry Safonov <0x7f454c46@gmail.com> <dima@arista.com>
Domen Puncer <domen@coderock.org> Domen Puncer <domen@coderock.org>
Douglas Gilbert <dougg@torque.net> Douglas Gilbert <dougg@torque.net>
Ed L. Cashin <ecashin@coraid.com> Ed L. Cashin <ecashin@coraid.com>
Erik Kaneda <erik.kaneda@intel.com> <erik.schmauss@intel.com>
Evgeniy Polyakov <johnpol@2ka.mipt.ru> Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Felipe W Damasio <felipewd@terra.com.br> Felipe W Damasio <felipewd@terra.com.br>
Felix Kuhling <fxkuehl@gmx.de> Felix Kuhling <fxkuehl@gmx.de>
@@ -138,6 +142,7 @@ Juha Yrjola <at solidboot.com>
Juha Yrjola <juha.yrjola@nokia.com> Juha Yrjola <juha.yrjola@nokia.com>
Juha Yrjola <juha.yrjola@solidboot.com> Juha Yrjola <juha.yrjola@solidboot.com>
Julien Thierry <julien.thierry.kdev@gmail.com> <julien.thierry@arm.com> Julien Thierry <julien.thierry.kdev@gmail.com> <julien.thierry@arm.com>
Kamil Konieczny <k.konieczny@samsung.com> <k.konieczny@partner.samsung.com>
Kay Sievers <kay.sievers@vrfy.org> Kay Sievers <kay.sievers@vrfy.org>
Kenneth W Chen <kenneth.w.chen@intel.com> Kenneth W Chen <kenneth.w.chen@intel.com>
Konstantin Khlebnikov <koct9i@gmail.com> <k.khlebnikov@samsung.com> Konstantin Khlebnikov <koct9i@gmail.com> <k.khlebnikov@samsung.com>
@@ -209,6 +214,10 @@ Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Patrick Mochel <mochel@digitalimplant.org> Patrick Mochel <mochel@digitalimplant.org>
Paul Burton <paulburton@kernel.org> <paul.burton@imgtec.com> Paul Burton <paulburton@kernel.org> <paul.burton@imgtec.com>
Paul Burton <paulburton@kernel.org> <paul.burton@mips.com> Paul Burton <paulburton@kernel.org> <paul.burton@mips.com>
Paul E. McKenney <paulmck@kernel.org> <paulmck@linux.ibm.com>
Paul E. McKenney <paulmck@kernel.org> <paulmck@linux.vnet.ibm.com>
Paul E. McKenney <paulmck@kernel.org> <paul.mckenney@linaro.org>
Paul E. McKenney <paulmck@kernel.org> <paulmck@us.ibm.com>
Peter A Jonsson <pj@ludd.ltu.se> Peter A Jonsson <pj@ludd.ltu.se>
Peter Oruba <peter@oruba.de> Peter Oruba <peter@oruba.de>
Peter Oruba <peter.oruba@amd.com> Peter Oruba <peter.oruba@amd.com>
@@ -217,6 +226,7 @@ Praveen BP <praveenbp@ti.com>
Punit Agrawal <punitagrawal@gmail.com> <punit.agrawal@arm.com> Punit Agrawal <punitagrawal@gmail.com> <punit.agrawal@arm.com>
Qais Yousef <qsyousef@gmail.com> <qais.yousef@imgtec.com> Qais Yousef <qsyousef@gmail.com> <qais.yousef@imgtec.com>
Quentin Perret <qperret@qperret.net> <quentin.perret@arm.com> Quentin Perret <qperret@qperret.net> <quentin.perret@arm.com>
Rafael J. Wysocki <rjw@rjwysocki.net> <rjw@sisk.pl>
Rajesh Shah <rajesh.shah@intel.com> Rajesh Shah <rajesh.shah@intel.com>
Ralf Baechle <ralf@linux-mips.org> Ralf Baechle <ralf@linux-mips.org>
Ralf Wildenhues <Ralf.Wildenhues@gmx.de> Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
@@ -252,6 +262,7 @@ Sumit Semwal <sumit.semwal@ti.com>
Tejun Heo <htejun@gmail.com> Tejun Heo <htejun@gmail.com>
Thomas Graf <tgraf@suug.ch> Thomas Graf <tgraf@suug.ch>
Thomas Pedersen <twp@codeaurora.org> Thomas Pedersen <twp@codeaurora.org>
Tiezhu Yang <yangtiezhu@loongson.cn> <kernelpatch@126.com>
Todor Tomov <todor.too@gmail.com> <todor.tomov@linaro.org> Todor Tomov <todor.too@gmail.com> <todor.tomov@linaro.org>
Tony Luck <tony.luck@intel.com> Tony Luck <tony.luck@intel.com>
TripleX Chung <xxx.phy@gmail.com> <zhongyu@18mail.cn> TripleX Chung <xxx.phy@gmail.com> <zhongyu@18mail.cn>

View File

@@ -16,3 +16,5 @@ In addition, other licenses may also apply. Please see:
Documentation/process/license-rules.rst Documentation/process/license-rules.rst
for more details. for more details.
All contributions to the Linux Kernel are subject to this COPYING file.

View File

@@ -567,6 +567,11 @@ D: Original author of Amiga FFS filesystem
S: Orlando, Florida S: Orlando, Florida
S: USA S: USA
N: Paul Burton
E: paulburton@kernel.org
W: https://pburton.com
D: MIPS maintainer 2018-2020
N: Lennert Buytenhek N: Lennert Buytenhek
E: kernel@wantstofly.org E: kernel@wantstofly.org
D: Original (2.4) rewrite of the ethernet bridging code D: Original (2.4) rewrite of the ethernet bridging code
@@ -3302,7 +3307,9 @@ S: France
N: Aleksa Sarai N: Aleksa Sarai
E: cyphar@cyphar.com E: cyphar@cyphar.com
W: https://www.cyphar.com/ W: https://www.cyphar.com/
D: `pids` cgroup subsystem D: /sys/fs/cgroup/pids
D: openat2(2)
S: Sydney, Australia
N: Dipankar Sarma N: Dipankar Sarma
E: dipankar@in.ibm.com E: dipankar@in.ibm.com

View File

@@ -0,0 +1,26 @@
What: /sys/fs/selinux/disable
Date: April 2005 (predates git)
KernelVersion: 2.6.12-rc2 (predates git)
Contact: selinux@vger.kernel.org
Description:
The selinuxfs "disable" node allows SELinux to be disabled at runtime
prior to a policy being loaded into the kernel. If disabled via this
mechanism, SELinux will remain disabled until the system is rebooted.
The preferred method of disabling SELinux is via the "selinux=0" boot
parameter, but the selinuxfs "disable" node was created to make it
easier for systems with primitive bootloaders that did not allow for
easy modification of the kernel command line. Unfortunately, allowing
for SELinux to be disabled at runtime makes it difficult to secure the
kernel's LSM hooks using the "__ro_after_init" feature.
Thankfully, the need for the SELinux runtime disable appears to be
gone, the default Kconfig configuration disables this selinuxfs node,
and only one of the major distributions, Fedora, supports disabling
SELinux at runtime. Fedora is in the process of removing the
selinuxfs "disable" node and once that is complete we will start the
slow process of removing this code from the kernel.
More information on /sys/fs/selinux/disable can be found under the
CONFIG_SECURITY_SELINUX_DISABLE Kconfig option.

View File

@@ -0,0 +1,171 @@
What: sys/bus/dsa/devices/dsa<m>/cdev_major
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The major number that the character device driver assigned to
this device.
What: sys/bus/dsa/devices/dsa<m>/errors
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The error information for this device.
What: sys/bus/dsa/devices/dsa<m>/max_batch_size
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The largest number of work descriptors in a batch.
What: sys/bus/dsa/devices/dsa<m>/max_work_queues_size
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The maximum work queue size supported by this device.
What: sys/bus/dsa/devices/dsa<m>/max_engines
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The maximum number of engines supported by this device.
What: sys/bus/dsa/devices/dsa<m>/max_groups
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The maximum number of groups that can be created under this device.
What: sys/bus/dsa/devices/dsa<m>/max_tokens
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The total number of bandwidth tokens supported by this device.
The bandwidth tokens represent resources within the DSA
implementation, and these resources are allocated by engines to
support operations.
What: sys/bus/dsa/devices/dsa<m>/max_transfer_size
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The number of bytes to be read from the source address to
perform the operation. The maximum transfer size is dependent on
the workqueue the descriptor was submitted to.
What: sys/bus/dsa/devices/dsa<m>/max_work_queues
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The maximum work queue number that this device supports.
What: sys/bus/dsa/devices/dsa<m>/numa_node
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The numa node number for this device.
What: sys/bus/dsa/devices/dsa<m>/op_cap
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The operation capability bit mask specifies the operation types
supported by this device.
What: sys/bus/dsa/devices/dsa<m>/state
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The state information of this device. It can be either enabled
or disabled.
What: sys/bus/dsa/devices/dsa<m>/group<m>.<n>
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The assigned group under this device.
What: sys/bus/dsa/devices/dsa<m>/engine<m>.<n>
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The assigned engine under this device.
What: sys/bus/dsa/devices/dsa<m>/wq<m>.<n>
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The assigned work queue under this device.
What: sys/bus/dsa/devices/dsa<m>/configurable
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: To indicate if this device is configurable or not.
What: sys/bus/dsa/devices/dsa<m>/token_limit
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The maximum number of bandwidth tokens that may be in use at
one time by operations that access low bandwidth memory in the
device.
What: sys/bus/dsa/devices/wq<m>.<n>/group_id
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The group id that this work queue belongs to.
What: sys/bus/dsa/devices/wq<m>.<n>/size
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The work queue size for this work queue.
What: sys/bus/dsa/devices/wq<m>.<n>/type
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The type of this work queue, it can be "kernel" type for work
queue usages in the kernel space or "user" type for work queue
usages by applications in user space.
What: sys/bus/dsa/devices/wq<m>.<n>/cdev_minor
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The minor number assigned to this work queue by the character
device driver.
What: sys/bus/dsa/devices/wq<m>.<n>/mode
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The work queue mode type for this work queue.
What: sys/bus/dsa/devices/wq<m>.<n>/priority
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The priority value of this work queue. It is a value relative to
other work queues in the same group to control quality of service
for dispatching work from multiple workqueues in the same group.
What: sys/bus/dsa/devices/wq<m>.<n>/state
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The current state of the work queue.
What: sys/bus/dsa/devices/wq<m>.<n>/threshold
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The number of entries in this work queue that may be filled
via a limited portal.
What: sys/bus/dsa/devices/engine<m>.<n>/group_id
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The group that this engine belongs to.

View File

@@ -16,6 +16,10 @@ Description:
write UDC's name found in /sys/class/udc/* write UDC's name found in /sys/class/udc/*
to bind a gadget, empty string "" to unbind. to bind a gadget, empty string "" to unbind.
max_speed - maximum speed the driver supports. Valid
names are super-speed-plus, super-speed,
high-speed, full-speed, and low-speed.
bDeviceClass - USB device class code bDeviceClass - USB device class code
bDeviceSubClass - USB device subclass code bDeviceSubClass - USB device subclass code
bDeviceProtocol - USB device protocol code bDeviceProtocol - USB device protocol code

View File

@@ -25,11 +25,11 @@ Description:
lsm: [[subj_user=] [subj_role=] [subj_type=] lsm: [[subj_user=] [subj_role=] [subj_type=]
[obj_user=] [obj_role=] [obj_type=]] [obj_user=] [obj_role=] [obj_type=]]
option: [[appraise_type=]] [template=] [permit_directio] option: [[appraise_type=]] [template=] [permit_directio]
[appraise_flag=] [appraise_flag=] [keyrings=]
base: func:= [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK] base: func:= [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK]
[FIRMWARE_CHECK] [FIRMWARE_CHECK]
[KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK] [KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK]
[KEXEC_CMDLINE] [KEXEC_CMDLINE] [KEY_CHECK]
mask:= [[^]MAY_READ] [[^]MAY_WRITE] [[^]MAY_APPEND] mask:= [[^]MAY_READ] [[^]MAY_WRITE] [[^]MAY_APPEND]
[[^]MAY_EXEC] [[^]MAY_EXEC]
fsmagic:= hex value fsmagic:= hex value
@@ -42,6 +42,9 @@ Description:
appraise_flag:= [check_blacklist] appraise_flag:= [check_blacklist]
Currently, blacklist check is only for files signed with appended Currently, blacklist check is only for files signed with appended
signature. signature.
keyrings:= list of keyrings
(eg, .builtin_trusted_keys|.ima). Only valid
when action is "measure" and func is KEY_CHECK.
template:= name of a defined IMA template type template:= name of a defined IMA template type
(eg, ima-ng). Only valid when action is "measure". (eg, ima-ng). Only valid when action is "measure".
pcr:= decimal value pcr:= decimal value
@@ -113,3 +116,12 @@ Description:
Example of appraise rule allowing modsig appended signatures: Example of appraise rule allowing modsig appended signatures:
appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig
Example of measure rule using KEY_CHECK to measure all keys:
measure func=KEY_CHECK
Example of measure rule using KEY_CHECK to only measure
keys added to .builtin_trusted_keys or .ima keyring:
measure func=KEY_CHECK keyrings=.builtin_trusted_keys|.ima

View File

@@ -33,6 +33,14 @@ Description:
Requires a separate RTC_PIE_ON call to enable the periodic Requires a separate RTC_PIE_ON call to enable the periodic
interrupts. interrupts.
* RTC_VL_READ: Read the voltage inputs status of the RTC when
supported. The value is a bit field of RTC_VL_*, giving the
status of the main and backup voltages.
* RTC_VL_CLEAR: Clear the voltage status of the RTC. Some RTCs
need user interaction when the backup power provider is
replaced or charged to be able to clear the status.
The ioctl() calls supported by the older /dev/rtc interface are The ioctl() calls supported by the older /dev/rtc interface are
also supported by the newer RTC class framework. However, also supported by the newer RTC class framework. However,
because the chips and systems are not standardized, some PC/AT because the chips and systems are not standardized, some PC/AT

View File

@@ -1726,3 +1726,16 @@ Contact: linux-iio@vger.kernel.org
Description: Description:
List of valid periods (in seconds) for which the light intensity List of valid periods (in seconds) for which the light intensity
must be above the threshold level before interrupt is asserted. must be above the threshold level before interrupt is asserted.
What: /sys/bus/iio/devices/iio:deviceX/in_filter_notch_center_frequency
KernelVersion: 5.5
Contact: linux-iio@vger.kernel.org
Description:
Center frequency in Hz for a notch filter. Used e.g. for line
noise suppression.
What: /sys/bus/iio/devices/iio:deviceX/in_temp_thermocouple_type
KernelVersion: 5.5
Contact: linux-iio@vger.kernel.org
Description:
One of the following thermocouple types: B, E, J, K, N, R, S, T.

View File

@@ -0,0 +1,19 @@
What: /sys/bus/iio/devices/iio:deviceX/buffer/length_align_bytes
KernelVersion: 5.4
Contact: linux-iio@vger.kernel.org
Description:
DMA buffers tend to have an alignment requirement for the
buffers. If this alignment requirement is not met samples might
be dropped from the buffer.
This property reports the alignment requirements in bytes.
This means that the buffer size in bytes needs to be an integer
multiple of the number reported by this file.
The alignment requirements in number of sample sets will depend
on the enabled channels and the bytes per channel. This means
that the alignment requirement in samples sets might change
depending on which and how many channels are enabled. Whereas
the alignment requirement reported in bytes by this property
will remain static and does not depend on which channels are
enabled.

View File

@@ -0,0 +1,63 @@
What: /sys/bus/mdio_bus/devices/.../statistics/
Date: January 2020
KernelVersion: 5.6
Contact: netdev@vger.kernel.org
Description:
This folder contains global and per MDIO bus address
statistics.
What: /sys/bus/mdio_bus/devices/.../statistics/transfers
Date: January 2020
KernelVersion: 5.6
Contact: netdev@vger.kernel.org
Description:
Total number of transfers for this MDIO bus.
What: /sys/bus/mdio_bus/devices/.../statistics/errors
Date: January 2020
KernelVersion: 5.6
Contact: netdev@vger.kernel.org
Description:
Total number of transfer errors for this MDIO bus.
What: /sys/bus/mdio_bus/devices/.../statistics/writes
Date: January 2020
KernelVersion: 5.6
Contact: netdev@vger.kernel.org
Description:
Total number of write transactions for this MDIO bus.
What: /sys/bus/mdio_bus/devices/.../statistics/reads
Date: January 2020
KernelVersion: 5.6
Contact: netdev@vger.kernel.org
Description:
Total number of read transactions for this MDIO bus.
What: /sys/bus/mdio_bus/devices/.../statistics/transfers_<addr>
Date: January 2020
KernelVersion: 5.6
Contact: netdev@vger.kernel.org
Description:
Total number of transfers for this MDIO bus address.
What: /sys/bus/mdio_bus/devices/.../statistics/errors_<addr>
Date: January 2020
KernelVersion: 5.6
Contact: netdev@vger.kernel.org
Description:
Total number of transfer errors for this MDIO bus address.
What: /sys/bus/mdio_bus/devices/.../statistics/writes_<addr>
Date: January 2020
KernelVersion: 5.6
Contact: netdev@vger.kernel.org
Description:
Total number of write transactions for this MDIO bus address.
What: /sys/bus/mdio_bus/devices/.../statistics/reads_<addr>
Date: January 2020
KernelVersion: 5.6
Contact: netdev@vger.kernel.org
Description:
Total number of read transactions for this MDIO bus address.

View File

@@ -7,6 +7,13 @@ Description:
The name of devfreq object denoted as ... is same as the The name of devfreq object denoted as ... is same as the
name of device using devfreq. name of device using devfreq.
What: /sys/class/devfreq/.../name
Date: November 2019
Contact: Chanwoo Choi <cw00.choi@samsung.com>
Description:
The /sys/class/devfreq/.../name shows the name of device
of the corresponding devfreq object.
What: /sys/class/devfreq/.../governor What: /sys/class/devfreq/.../governor
Date: September 2011 Date: September 2011
Contact: MyungJoo Ham <myungjoo.ham@samsung.com> Contact: MyungJoo Ham <myungjoo.ham@samsung.com>
@@ -48,12 +55,15 @@ What: /sys/class/devfreq/.../trans_stat
Date: October 2012 Date: October 2012
Contact: MyungJoo Ham <myungjoo.ham@samsung.com> Contact: MyungJoo Ham <myungjoo.ham@samsung.com>
Description: Description:
This ABI shows the statistics of devfreq behavior on a This ABI shows or clears the statistics of devfreq behavior
specific device. It shows the time spent in each state and on a specific device. It shows the time spent in each state
the number of transitions between states. and the number of transitions between states.
In order to activate this ABI, the devfreq target device In order to activate this ABI, the devfreq target device
driver should provide the list of available frequencies driver should provide the list of available frequencies
with its profile. with its profile. If need to reset the statistics of devfreq
behavior on a specific device, enter 0(zero) to 'trans_stat'
as follows:
echo 0 > /sys/class/devfreq/.../trans_stat
What: /sys/class/devfreq/.../userspace/set_freq What: /sys/class/devfreq/.../userspace/set_freq
Date: September 2011 Date: September 2011

View File

@@ -189,7 +189,8 @@ Description:
Access: Read Access: Read
Valid values: "Unknown", "Good", "Overheat", "Dead", Valid values: "Unknown", "Good", "Overheat", "Dead",
"Over voltage", "Unspecified failure", "Cold", "Over voltage", "Unspecified failure", "Cold",
"Watchdog timer expire", "Safety timer expire" "Watchdog timer expire", "Safety timer expire",
"Over current"
What: /sys/class/power_supply/<supply_name>/precharge_current What: /sys/class/power_supply/<supply_name>/precharge_current
Date: June 2017 Date: June 2017

View File

@@ -196,6 +196,12 @@ Description:
does not reflect it. Likewise, if one enables a deep state but a does not reflect it. Likewise, if one enables a deep state but a
lighter state still is disabled, then this has no effect. lighter state still is disabled, then this has no effect.
What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/default_status
Date: December 2019
KernelVersion: v5.6
Contact: Linux power management list <linux-pm@vger.kernel.org>
Description:
(RO) The default status of this state, "enabled" or "disabled".
What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/residency What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/residency
Date: March 2014 Date: March 2014

View File

@@ -11,3 +11,16 @@ Description:
#echo 00:19.0-E0:2:FF > /sys/bus/pci/drivers/pciback/quirks #echo 00:19.0-E0:2:FF > /sys/bus/pci/drivers/pciback/quirks
will allow the guest to read and write to the configuration will allow the guest to read and write to the configuration
register 0x0E. register 0x0E.
What: /sys/bus/pci/drivers/pciback/allow_interrupt_control
Date: Jan 2020
KernelVersion: 5.6
Contact: xen-devel@lists.xenproject.org
Description:
List of devices which can have interrupt control flag (INTx,
MSI, MSI-X) set by a connected guest. It is meant to be set
only when the guest is a stubdomain hosting device model (qemu)
and the actual device is assigned to a HVM. It is not safe
(similar to permissive attribute) to set for devices assigned
to a PV guest. The device is automatically removed from this
list when the connected pcifront terminates.

View File

@@ -25,3 +25,13 @@ Description:
allocated without being in use. The time is in allocated without being in use. The time is in
seconds, 0 means indefinitely long. seconds, 0 means indefinitely long.
The default is 60 seconds. The default is 60 seconds.
What: /sys/module/xen_blkback/parameters/buffer_squeeze_duration_ms
Date: December 2019
KernelVersion: 5.6
Contact: SeongJae Park <sjpark@amazon.de>
Description:
When memory pressure is reported to blkback this option
controls the duration in milliseconds that blkback will not
cache any page not backed by a grant mapping.
The default is 10ms.

View File

@@ -1,37 +1,40 @@
What: /sys/fs/f2fs/<disk>/gc_max_sleep_time What: /sys/fs/f2fs/<disk>/gc_max_sleep_time
Date: July 2013 Date: July 2013
Contact: "Namjae Jeon" <namjae.jeon@samsung.com> Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
Description: Description: Controls the maximum sleep time for gc_thread. Time
Controls the maximun sleep time for gc_thread. Time is in milliseconds.
is in milliseconds.
What: /sys/fs/f2fs/<disk>/gc_min_sleep_time What: /sys/fs/f2fs/<disk>/gc_min_sleep_time
Date: July 2013 Date: July 2013
Contact: "Namjae Jeon" <namjae.jeon@samsung.com> Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
Description: Description: Controls the minimum sleep time for gc_thread. Time
Controls the minimum sleep time for gc_thread. Time is in milliseconds.
is in milliseconds.
What: /sys/fs/f2fs/<disk>/gc_no_gc_sleep_time What: /sys/fs/f2fs/<disk>/gc_no_gc_sleep_time
Date: July 2013 Date: July 2013
Contact: "Namjae Jeon" <namjae.jeon@samsung.com> Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
Description: Description: Controls the default sleep time for gc_thread. Time
Controls the default sleep time for gc_thread. Time is in milliseconds.
is in milliseconds.
What: /sys/fs/f2fs/<disk>/gc_idle What: /sys/fs/f2fs/<disk>/gc_idle
Date: July 2013 Date: July 2013
Contact: "Namjae Jeon" <namjae.jeon@samsung.com> Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
Description: Description: Controls the victim selection policy for garbage collection.
Controls the victim selection policy for garbage collection. Setting gc_idle = 0(default) will disable this option. Setting
gc_idle = 1 will select the Cost Benefit approach & setting
gc_idle = 2 will select the greedy approach.
What: /sys/fs/f2fs/<disk>/reclaim_segments What: /sys/fs/f2fs/<disk>/reclaim_segments
Date: October 2013 Date: October 2013
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com> Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
Description: Description: This parameter controls the number of prefree segments to be
Controls the issue rate of segment discard commands. reclaimed. If the number of prefree segments is larger than
the number of segments in the proportion to the percentage
over total volume size, f2fs tries to conduct checkpoint to
reclaim the prefree segments to free segments.
By default, 5% over total # of segments.
What: /sys/fs/f2fs/<disk>/max_blkaddr What: /sys/fs/f2fs/<disk>/main_blkaddr
Date: November 2019 Date: November 2019
Contact: "Ramon Pantin" <pantin@google.com> Contact: "Ramon Pantin" <pantin@google.com>
Description: Description:
@@ -40,227 +43,278 @@ Description:
What: /sys/fs/f2fs/<disk>/ipu_policy What: /sys/fs/f2fs/<disk>/ipu_policy
Date: November 2013 Date: November 2013
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com> Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
Description: Description: Controls the in-place-update policy.
Controls the in-place-update policy. updates in f2fs. User can set:
0x01: F2FS_IPU_FORCE, 0x02: F2FS_IPU_SSR,
0x04: F2FS_IPU_UTIL, 0x08: F2FS_IPU_SSR_UTIL,
0x10: F2FS_IPU_FSYNC, 0x20: F2FS_IPU_ASYNC,
0x40: F2FS_IPU_NOCACHE.
Refer segment.h for details.
What: /sys/fs/f2fs/<disk>/min_ipu_util What: /sys/fs/f2fs/<disk>/min_ipu_util
Date: November 2013 Date: November 2013
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com> Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
Description: Description: Controls the FS utilization condition for the in-place-update
Controls the FS utilization condition for the in-place-update policies. It is used by F2FS_IPU_UTIL and F2FS_IPU_SSR_UTIL policies.
policies.
What: /sys/fs/f2fs/<disk>/min_fsync_blocks What: /sys/fs/f2fs/<disk>/min_fsync_blocks
Date: September 2014 Date: September 2014
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org> Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Description: Controls the dirty page count condition for the in-place-update
Controls the dirty page count condition for the in-place-update policies.
policies.
What: /sys/fs/f2fs/<disk>/min_seq_blocks What: /sys/fs/f2fs/<disk>/min_seq_blocks
Date: August 2018 Date: August 2018
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org> Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Description: Controls the dirty page count condition for batched sequential
Controls the dirty page count condition for batched sequential writes in writepages.
writes in ->writepages.
What: /sys/fs/f2fs/<disk>/min_hot_blocks What: /sys/fs/f2fs/<disk>/min_hot_blocks
Date: March 2017 Date: March 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org> Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Description: Controls the dirty page count condition for redefining hot data.
Controls the dirty page count condition for redefining hot data.
What: /sys/fs/f2fs/<disk>/min_ssr_sections What: /sys/fs/f2fs/<disk>/min_ssr_sections
Date: October 2017 Date: October 2017
Contact: "Chao Yu" <yuchao0@huawei.com> Contact: "Chao Yu" <yuchao0@huawei.com>
Description: Description: Controls the free section threshold to trigger SSR allocation.
Controls the fee section threshold to trigger SSR allocation. If this is large, SSR mode will be enabled early.
What: /sys/fs/f2fs/<disk>/max_small_discards What: /sys/fs/f2fs/<disk>/max_small_discards
Date: November 2013 Date: November 2013
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com> Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
Description: Description: Controls the issue rate of discard commands that consist of small
Controls the issue rate of small discard commands. blocks less than 2MB. The candidates to be discarded are cached until
checkpoint is triggered, and issued during the checkpoint.
By default, it is disabled with 0.
What: /sys/fs/f2fs/<disk>/discard_granularity What: /sys/fs/f2fs/<disk>/discard_granularity
Date: July 2017 Date: July 2017
Contact: "Chao Yu" <yuchao0@huawei.com> Contact: "Chao Yu" <yuchao0@huawei.com>
Description: Description: Controls discard granularity of inner discard thread. Inner thread
Controls discard granularity of inner discard thread, inner thread
will not issue discards with size that is smaller than granularity. will not issue discards with size that is smaller than granularity.
The unit size is one block, now only support configuring in range The unit size is one block(4KB), now only support configuring
of [1, 512]. in range of [1, 512]. Default value is 4(=16KB).
What: /sys/fs/f2fs/<disk>/umount_discard_timeout What: /sys/fs/f2fs/<disk>/umount_discard_timeout
Date: January 2019 Date: January 2019
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org> Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Description: Set timeout to issue discard commands during umount.
Set timeout to issue discard commands during umount. Default: 5 secs
Default: 5 secs
What: /sys/fs/f2fs/<disk>/max_victim_search What: /sys/fs/f2fs/<disk>/max_victim_search
Date: January 2014 Date: January 2014
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com> Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
Description: Description: Controls the number of trials to find a victim segment
Controls the number of trials to find a victim segment. when conducting SSR and cleaning operations. The default value
is 4096 which covers 8GB block address range.
What: /sys/fs/f2fs/<disk>/migration_granularity What: /sys/fs/f2fs/<disk>/migration_granularity
Date: October 2018 Date: October 2018
Contact: "Chao Yu" <yuchao0@huawei.com> Contact: "Chao Yu" <yuchao0@huawei.com>
Description: Description: Controls migration granularity of garbage collection on large
Controls migration granularity of garbage collection on large section, it can let GC move partial segment{s} of one section
section, it can let GC move partial segment{s} of one section in one GC cycle, so that dispersing heavy overhead GC to
in one GC cycle, so that dispersing heavy overhead GC to multiple lightweight one.
multiple lightweight one.
What: /sys/fs/f2fs/<disk>/dir_level What: /sys/fs/f2fs/<disk>/dir_level
Date: March 2014 Date: March 2014
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com> Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
Description: Description: Controls the directory level for large directory. If a
Controls the directory level for large directory. directory has a number of files, it can reduce the file lookup
latency by increasing this dir_level value. Otherwise, it
needs to decrease this value to reduce the space overhead.
The default value is 0.
What: /sys/fs/f2fs/<disk>/ram_thresh What: /sys/fs/f2fs/<disk>/ram_thresh
Date: March 2014 Date: March 2014
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com> Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
Description: Description: Controls the memory footprint used by free nids and cached
Controls the memory footprint used by f2fs. nat entries. By default, 1 is set, which indicates
10 MB / 1 GB RAM.
What: /sys/fs/f2fs/<disk>/batched_trim_sections What: /sys/fs/f2fs/<disk>/batched_trim_sections
Date: February 2015 Date: February 2015
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org> Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Description: Controls the trimming rate in batch mode.
Controls the trimming rate in batch mode. <deprecated>
<deprecated>
What: /sys/fs/f2fs/<disk>/cp_interval What: /sys/fs/f2fs/<disk>/cp_interval
Date: October 2015 Date: October 2015
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org> Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Description: Controls the checkpoint timing, set to 60 seconds by default.
Controls the checkpoint timing.
What: /sys/fs/f2fs/<disk>/idle_interval What: /sys/fs/f2fs/<disk>/idle_interval
Date: January 2016 Date: January 2016
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org> Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Description: Controls the idle timing of system, if there is no FS operation
Controls the idle timing for all paths other than during given interval.
discard and gc path. Set to 5 seconds by default.
What: /sys/fs/f2fs/<disk>/discard_idle_interval What: /sys/fs/f2fs/<disk>/discard_idle_interval
Date: September 2018 Date: September 2018
Contact: "Chao Yu" <yuchao0@huawei.com> Contact: "Chao Yu" <yuchao0@huawei.com>
Contact: "Sahitya Tummala" <stummala@codeaurora.org> Contact: "Sahitya Tummala" <stummala@codeaurora.org>
Description: Description: Controls the idle timing of discard thread given
Controls the idle timing for discard path. this time interval.
Default is 5 secs.
What: /sys/fs/f2fs/<disk>/gc_idle_interval What: /sys/fs/f2fs/<disk>/gc_idle_interval
Date: September 2018 Date: September 2018
Contact: "Chao Yu" <yuchao0@huawei.com> Contact: "Chao Yu" <yuchao0@huawei.com>
Contact: "Sahitya Tummala" <stummala@codeaurora.org> Contact: "Sahitya Tummala" <stummala@codeaurora.org>
Description: Description: Controls the idle timing for gc path. Set to 5 seconds by default.
Controls the idle timing for gc path.
What: /sys/fs/f2fs/<disk>/iostat_enable What: /sys/fs/f2fs/<disk>/iostat_enable
Date: August 2017 Date: August 2017
Contact: "Chao Yu" <yuchao0@huawei.com> Contact: "Chao Yu" <yuchao0@huawei.com>
Description: Description: Controls to enable/disable IO stat.
Controls to enable/disable IO stat.
What: /sys/fs/f2fs/<disk>/ra_nid_pages What: /sys/fs/f2fs/<disk>/ra_nid_pages
Date: October 2015 Date: October 2015
Contact: "Chao Yu" <chao2.yu@samsung.com> Contact: "Chao Yu" <chao2.yu@samsung.com>
Description: Description: Controls the count of nid pages to be readaheaded.
Controls the count of nid pages to be readaheaded. When building free nids, F2FS reads NAT blocks ahead for
speed up. Default is 0.
What: /sys/fs/f2fs/<disk>/dirty_nats_ratio What: /sys/fs/f2fs/<disk>/dirty_nats_ratio
Date: January 2016 Date: January 2016
Contact: "Chao Yu" <chao2.yu@samsung.com> Contact: "Chao Yu" <chao2.yu@samsung.com>
Description: Description: Controls dirty nat entries ratio threshold, if current
Controls dirty nat entries ratio threshold, if current ratio exceeds configured threshold, checkpoint will
ratio exceeds configured threshold, checkpoint will be triggered for flushing dirty nat entries.
be triggered for flushing dirty nat entries.
What: /sys/fs/f2fs/<disk>/lifetime_write_kbytes What: /sys/fs/f2fs/<disk>/lifetime_write_kbytes
Date: January 2016 Date: January 2016
Contact: "Shuoran Liu" <liushuoran@huawei.com> Contact: "Shuoran Liu" <liushuoran@huawei.com>
Description: Description: Shows total written kbytes issued to disk.
Shows total written kbytes issued to disk.
What: /sys/fs/f2fs/<disk>/features What: /sys/fs/f2fs/<disk>/features
Date: July 2017 Date: July 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org> Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Description: Shows all enabled features in current device.
Shows all enabled features in current device.
What: /sys/fs/f2fs/<disk>/inject_rate What: /sys/fs/f2fs/<disk>/inject_rate
Date: May 2016 Date: May 2016
Contact: "Sheng Yong" <shengyong1@huawei.com> Contact: "Sheng Yong" <shengyong1@huawei.com>
Description: Description: Controls the injection rate of arbitrary faults.
Controls the injection rate.
What: /sys/fs/f2fs/<disk>/inject_type What: /sys/fs/f2fs/<disk>/inject_type
Date: May 2016 Date: May 2016
Contact: "Sheng Yong" <shengyong1@huawei.com> Contact: "Sheng Yong" <shengyong1@huawei.com>
Description: Description: Controls the injection type of arbitrary faults.
Controls the injection type.
What: /sys/fs/f2fs/<disk>/dirty_segments
Date: October 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Shows the number of dirty segments.
What: /sys/fs/f2fs/<disk>/reserved_blocks What: /sys/fs/f2fs/<disk>/reserved_blocks
Date: June 2017 Date: June 2017
Contact: "Chao Yu" <yuchao0@huawei.com> Contact: "Chao Yu" <yuchao0@huawei.com>
Description: Description: Controls target reserved blocks in system, the threshold
Controls target reserved blocks in system, the threshold is soft, it could exceed current available user space.
is soft, it could exceed current available user space.
What: /sys/fs/f2fs/<disk>/current_reserved_blocks What: /sys/fs/f2fs/<disk>/current_reserved_blocks
Date: October 2017 Date: October 2017
Contact: "Yunlong Song" <yunlong.song@huawei.com> Contact: "Yunlong Song" <yunlong.song@huawei.com>
Contact: "Chao Yu" <yuchao0@huawei.com> Contact: "Chao Yu" <yuchao0@huawei.com>
Description: Description: Shows current reserved blocks in system, it may be temporarily
Shows current reserved blocks in system, it may be temporarily smaller than target_reserved_blocks, but will gradually
smaller than target_reserved_blocks, but will gradually increase to target_reserved_blocks when more free blocks are
increase to target_reserved_blocks when more free blocks are freed by user later.
freed by user later.
What: /sys/fs/f2fs/<disk>/gc_urgent What: /sys/fs/f2fs/<disk>/gc_urgent
Date: August 2017 Date: August 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org> Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Description: Do background GC aggressively when set. When gc_urgent = 1,
Do background GC aggressively background thread starts to do GC by given gc_urgent_sleep_time
interval. It is set to 0 by default.
What: /sys/fs/f2fs/<disk>/gc_urgent_sleep_time What: /sys/fs/f2fs/<disk>/gc_urgent_sleep_time
Date: August 2017 Date: August 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org> Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Description: Controls sleep time of GC urgent mode. Set to 500ms by default.
Controls sleep time of GC urgent mode
What: /sys/fs/f2fs/<disk>/readdir_ra What: /sys/fs/f2fs/<disk>/readdir_ra
Date: November 2017 Date: November 2017
Contact: "Sheng Yong" <shengyong1@huawei.com> Contact: "Sheng Yong" <shengyong1@huawei.com>
Description: Description: Controls readahead inode block in readdir. Enabled by default.
Controls readahead inode block in readdir.
What: /sys/fs/f2fs/<disk>/gc_pin_file_thresh
Date: January 2018
Contact: Jaegeuk Kim <jaegeuk@kernel.org>
Description: This indicates how many GC can be failed for the pinned
file. If it exceeds this, F2FS doesn't guarantee its pinning
state. 2048 trials is set by default.
What: /sys/fs/f2fs/<disk>/extension_list What: /sys/fs/f2fs/<disk>/extension_list
Date: February 2018 Date: February 2018
Contact: "Chao Yu" <yuchao0@huawei.com> Contact: "Chao Yu" <yuchao0@huawei.com>
Description: Description: Used to control and configure the extension list:
Used to control and configure the extension list: - Query: cat /sys/fs/f2fs/<disk>/extension_list
- Query: cat /sys/fs/f2fs/<disk>/extension_list - Add: echo '[h/c]extension' > /sys/fs/f2fs/<disk>/extension_list
- Add: echo '[h/c]extension' > /sys/fs/f2fs/<disk>/extension_list - Del: echo '[h/c]!extension' > /sys/fs/f2fs/<disk>/extension_list
- Del: echo '[h/c]!extension' > /sys/fs/f2fs/<disk>/extension_list - [h] means add/del hot file extension
- [h] means add/del hot file extension - [c] means add/del cold file extension
- [c] means add/del cold file extension
What: /sys/fs/f2fs/<disk>/unusable What: /sys/fs/f2fs/<disk>/unusable
Date: April 2019 Date: April 2019
Contact: "Daniel Rosenberg" <drosen@google.com> Contact: "Daniel Rosenberg" <drosen@google.com>
Description: Description: If checkpoint=disable, it displays the number of blocks that
If checkpoint=disable, it displays the number of blocks that are unusable. are unusable.
If checkpoint=enable it displays the number of blocks that would be unusable If checkpoint=enable it displays the number of blocks that
if checkpoint=disable were to be set. would be unusable if checkpoint=disable were to be set.
What: /sys/fs/f2fs/<disk>/encoding What: /sys/fs/f2fs/<disk>/encoding
Date: July 2019 Date: July 2019
Contact: "Daniel Rosenberg" <drosen@google.com> Contact: "Daniel Rosenberg" <drosen@google.com>
Description: Description: Displays name and version of the encoding set for the filesystem.
Displays name and version of the encoding set for the filesystem. If no encoding is set, displays (none)
If no encoding is set, displays (none)
What: /sys/fs/f2fs/<disk>/free_segments
Date: September 2019
Contact: "Hridya Valsaraju" <hridya@google.com>
Description: Number of free segments in disk.
What: /sys/fs/f2fs/<disk>/cp_foreground_calls
Date: September 2019
Contact: "Hridya Valsaraju" <hridya@google.com>
Description: Number of checkpoint operations performed on demand. Available when
CONFIG_F2FS_STAT_FS=y.
What: /sys/fs/f2fs/<disk>/cp_background_calls
Date: September 2019
Contact: "Hridya Valsaraju" <hridya@google.com>
Description: Number of checkpoint operations performed in the background to
free segments. Available when CONFIG_F2FS_STAT_FS=y.
What: /sys/fs/f2fs/<disk>/gc_foreground_calls
Date: September 2019
Contact: "Hridya Valsaraju" <hridya@google.com>
Description: Number of garbage collection operations performed on demand.
Available when CONFIG_F2FS_STAT_FS=y.
What: /sys/fs/f2fs/<disk>/gc_background_calls
Date: September 2019
Contact: "Hridya Valsaraju" <hridya@google.com>
Description: Number of garbage collection operations triggered in background.
Available when CONFIG_F2FS_STAT_FS=y.
What: /sys/fs/f2fs/<disk>/moved_blocks_foreground
Date: September 2019
Contact: "Hridya Valsaraju" <hridya@google.com>
Description: Number of blocks moved by garbage collection in foreground.
Available when CONFIG_F2FS_STAT_FS=y.
What: /sys/fs/f2fs/<disk>/moved_blocks_background
Date: September 2019
Contact: "Hridya Valsaraju" <hridya@google.com>
Description: Number of blocks moved by garbage collection in background.
Available when CONFIG_F2FS_STAT_FS=y.
What: /sys/fs/f2fs/<disk>/avg_vblocks
Date: September 2019
Contact: "Hridya Valsaraju" <hridya@google.com>
Description: Average number of valid blocks.
Available when CONFIG_F2FS_STAT_FS=y.

View File

@@ -407,3 +407,16 @@ Contact: Kalesh Singh <kaleshsingh96@gmail.com>
Description: Description:
The /sys/power/suspend_stats/last_failed_step file contains The /sys/power/suspend_stats/last_failed_step file contains
the last failed step in the suspend/resume path. the last failed step in the suspend/resume path.
What: /sys/power/sync_on_suspend
Date: October 2019
Contact: Jonas Meurer <jonas@freesources.org>
Description:
This file controls whether or not the kernel will sync()
filesystems during system suspend (after freezing user space
and before suspending devices).
Writing a "1" to this file enables the sync() and writing a "0"
disables it. Reads from the file return the current value.
The default is "1" if the build-time "SUSPEND_SKIP_SYNC" config
flag is unset, or "0" otherwise.

View File

@@ -0,0 +1,46 @@
What: Raise a uevent when a USB charger is inserted or removed
Date: 2020-01-14
KernelVersion: 5.6
Contact: linux-usb@vger.kernel.org
Description: There are two USB charger states:
USB_CHARGER_ABSENT
USB_CHARGER_PRESENT
There are five USB charger types:
USB_CHARGER_UNKNOWN_TYPE: Charger type is unknown
USB_CHARGER_SDP_TYPE: Standard Downstream Port
USB_CHARGER_CDP_TYPE: Charging Downstream Port
USB_CHARGER_DCP_TYPE: Dedicated Charging Port
USB_CHARGER_ACA_TYPE: Accessory Charging Adapter
https://www.usb.org/document-library/battery-charging-v12-spec-and-adopters-agreement
Here are two examples taken using udevadm monitor -p when
USB charger is online:
UDEV change /devices/soc0/usbphynop1 (platform)
ACTION=change
DEVPATH=/devices/soc0/usbphynop1
DRIVER=usb_phy_generic
MODALIAS=of:Nusbphynop1T(null)Cusb-nop-xceiv
OF_COMPATIBLE_0=usb-nop-xceiv
OF_COMPATIBLE_N=1
OF_FULLNAME=/usbphynop1
OF_NAME=usbphynop1
SEQNUM=2493
SUBSYSTEM=platform
USB_CHARGER_STATE=USB_CHARGER_PRESENT
USB_CHARGER_TYPE=USB_CHARGER_SDP_TYPE
USEC_INITIALIZED=227422826
USB charger is offline:
KERNEL change /devices/soc0/usbphynop1 (platform)
ACTION=change
DEVPATH=/devices/soc0/usbphynop1
DRIVER=usb_phy_generic
MODALIAS=of:Nusbphynop1T(null)Cusb-nop-xceiv
OF_COMPATIBLE_0=usb-nop-xceiv
OF_COMPATIBLE_N=1
OF_FULLNAME=/usbphynop1
OF_NAME=usbphynop1
SEQNUM=2494
SUBSYSTEM=platform
USB_CHARGER_STATE=USB_CHARGER_ABSENT
USB_CHARGER_TYPE=USB_CHARGER_UNKNOWN_TYPE

View File

@@ -283,5 +283,5 @@ or disabled (0). If 0 is found in any of the msi_bus files belonging
to bridges between the PCI root and the device, MSIs are disabled. to bridges between the PCI root and the device, MSIs are disabled.
It is also worth checking the device driver to see whether it supports MSIs. It is also worth checking the device driver to see whether it supports MSIs.
For example, it may contain calls to pci_irq_alloc_vectors() with the For example, it may contain calls to pci_alloc_irq_vectors() with the
PCI_IRQ_MSI or PCI_IRQ_MSIX flags. PCI_IRQ_MSI or PCI_IRQ_MSIX flags.

View File

@@ -1,4 +1,7 @@
.. _NMI_rcu_doc:
Using RCU to Protect Dynamic NMI Handlers Using RCU to Protect Dynamic NMI Handlers
=========================================
Although RCU is usually used to protect read-mostly data structures, Although RCU is usually used to protect read-mostly data structures,
@@ -9,7 +12,7 @@ work in "arch/x86/oprofile/nmi_timer_int.c" and in
"arch/x86/kernel/traps.c". "arch/x86/kernel/traps.c".
The relevant pieces of code are listed below, each followed by a The relevant pieces of code are listed below, each followed by a
brief explanation. brief explanation::
static int dummy_nmi_callback(struct pt_regs *regs, int cpu) static int dummy_nmi_callback(struct pt_regs *regs, int cpu)
{ {
@@ -18,12 +21,12 @@ brief explanation.
The dummy_nmi_callback() function is a "dummy" NMI handler that does The dummy_nmi_callback() function is a "dummy" NMI handler that does
nothing, but returns zero, thus saying that it did nothing, allowing nothing, but returns zero, thus saying that it did nothing, allowing
the NMI handler to take the default machine-specific action. the NMI handler to take the default machine-specific action::
static nmi_callback_t nmi_callback = dummy_nmi_callback; static nmi_callback_t nmi_callback = dummy_nmi_callback;
This nmi_callback variable is a global function pointer to the current This nmi_callback variable is a global function pointer to the current
NMI handler. NMI handler::
void do_nmi(struct pt_regs * regs, long error_code) void do_nmi(struct pt_regs * regs, long error_code)
{ {
@@ -53,11 +56,12 @@ anyway. However, in practice it is a good documentation aid, particularly
for anyone attempting to do something similar on Alpha or on systems for anyone attempting to do something similar on Alpha or on systems
with aggressive optimizing compilers. with aggressive optimizing compilers.
Quick Quiz: Why might the rcu_dereference_sched() be necessary on Alpha, Quick Quiz:
given that the code referenced by the pointer is read-only? Why might the rcu_dereference_sched() be necessary on Alpha, given that the code referenced by the pointer is read-only?
:ref:`Answer to Quick Quiz <answer_quick_quiz_NMI>`
Back to the discussion of NMI and RCU... Back to the discussion of NMI and RCU::
void set_nmi_callback(nmi_callback_t callback) void set_nmi_callback(nmi_callback_t callback)
{ {
@@ -68,7 +72,7 @@ The set_nmi_callback() function registers an NMI handler. Note that any
data that is to be used by the callback must be initialized up -before- data that is to be used by the callback must be initialized up -before-
the call to set_nmi_callback(). On architectures that do not order the call to set_nmi_callback(). On architectures that do not order
writes, the rcu_assign_pointer() ensures that the NMI handler sees the writes, the rcu_assign_pointer() ensures that the NMI handler sees the
initialized values. initialized values::
void unset_nmi_callback(void) void unset_nmi_callback(void)
{ {
@@ -82,7 +86,7 @@ up any data structures used by the old NMI handler until execution
of it completes on all other CPUs. of it completes on all other CPUs.
One way to accomplish this is via synchronize_rcu(), perhaps as One way to accomplish this is via synchronize_rcu(), perhaps as
follows: follows::
unset_nmi_callback(); unset_nmi_callback();
synchronize_rcu(); synchronize_rcu();
@@ -98,24 +102,23 @@ to free up the handler's data as soon as synchronize_rcu() returns.
Important note: for this to work, the architecture in question must Important note: for this to work, the architecture in question must
invoke nmi_enter() and nmi_exit() on NMI entry and exit, respectively. invoke nmi_enter() and nmi_exit() on NMI entry and exit, respectively.
.. _answer_quick_quiz_NMI:
Answer to Quick Quiz Answer to Quick Quiz:
Why might the rcu_dereference_sched() be necessary on Alpha, given that the code referenced by the pointer is read-only?
Why might the rcu_dereference_sched() be necessary on Alpha, given The caller to set_nmi_callback() might well have
that the code referenced by the pointer is read-only? initialized some data that is to be used by the new NMI
handler. In this case, the rcu_dereference_sched() would
be needed, because otherwise a CPU that received an NMI
just after the new handler was set might see the pointer
to the new NMI handler, but the old pre-initialized
version of the handler's data.
Answer: The caller to set_nmi_callback() might well have This same sad story can happen on other CPUs when using
initialized some data that is to be used by the new NMI a compiler with aggressive pointer-value speculation
handler. In this case, the rcu_dereference_sched() would optimizations.
be needed, because otherwise a CPU that received an NMI
just after the new handler was set might see the pointer
to the new NMI handler, but the old pre-initialized
version of the handler's data.
This same sad story can happen on other CPUs when using More important, the rcu_dereference_sched() makes it
a compiler with aggressive pointer-value speculation clear to someone reading the code that the pointer is
optimizations. being protected by RCU-sched.
More important, the rcu_dereference_sched() makes it
clear to someone reading the code that the pointer is
being protected by RCU-sched.

View File

@@ -1,19 +1,21 @@
Using RCU to Protect Read-Mostly Arrays .. _array_rcu_doc:
Using RCU to Protect Read-Mostly Arrays
=======================================
Although RCU is more commonly used to protect linked lists, it can Although RCU is more commonly used to protect linked lists, it can
also be used to protect arrays. Three situations are as follows: also be used to protect arrays. Three situations are as follows:
1. Hash Tables 1. :ref:`Hash Tables <hash_tables>`
2. Static Arrays 2. :ref:`Static Arrays <static_arrays>`
3. Resizeable Arrays 3. :ref:`Resizable Arrays <resizable_arrays>`
Each of these three situations involves an RCU-protected pointer to an Each of these three situations involves an RCU-protected pointer to an
array that is separately indexed. It might be tempting to consider use array that is separately indexed. It might be tempting to consider use
of RCU to instead protect the index into an array, however, this use of RCU to instead protect the index into an array, however, this use
case is -not- supported. The problem with RCU-protected indexes into case is **not** supported. The problem with RCU-protected indexes into
arrays is that compilers can play way too many optimization games with arrays is that compilers can play way too many optimization games with
integers, which means that the rules governing handling of these indexes integers, which means that the rules governing handling of these indexes
are far more trouble than they are worth. If RCU-protected indexes into are far more trouble than they are worth. If RCU-protected indexes into
@@ -24,16 +26,20 @@ to be safely used.
That aside, each of the three RCU-protected pointer situations are That aside, each of the three RCU-protected pointer situations are
described in the following sections. described in the following sections.
.. _hash_tables:
Situation 1: Hash Tables Situation 1: Hash Tables
------------------------
Hash tables are often implemented as an array, where each array entry Hash tables are often implemented as an array, where each array entry
has a linked-list hash chain. Each hash chain can be protected by RCU has a linked-list hash chain. Each hash chain can be protected by RCU
as described in the listRCU.txt document. This approach also applies as described in the listRCU.txt document. This approach also applies
to other array-of-list situations, such as radix trees. to other array-of-list situations, such as radix trees.
.. _static_arrays:
Situation 2: Static Arrays Situation 2: Static Arrays
--------------------------
Static arrays, where the data (rather than a pointer to the data) is Static arrays, where the data (rather than a pointer to the data) is
located in each array element, and where the array is never resized, located in each array element, and where the array is never resized,
@@ -41,13 +47,17 @@ have not been used with RCU. Rik van Riel recommends using seqlock in
this situation, which would also have minimal read-side overhead as long this situation, which would also have minimal read-side overhead as long
as updates are rare. as updates are rare.
Quick Quiz: Why is it so important that updates be rare when Quick Quiz:
using seqlock? Why is it so important that updates be rare when using seqlock?
:ref:`Answer to Quick Quiz <answer_quick_quiz_seqlock>`
Situation 3: Resizeable Arrays .. _resizable_arrays:
Use of RCU for resizeable arrays is demonstrated by the grow_ary() Situation 3: Resizable Arrays
------------------------------
Use of RCU for resizable arrays is demonstrated by the grow_ary()
function formerly used by the System V IPC code. The array is used function formerly used by the System V IPC code. The array is used
to map from semaphore, message-queue, and shared-memory IDs to the data to map from semaphore, message-queue, and shared-memory IDs to the data
structure that represents the corresponding IPC construct. The grow_ary() structure that represents the corresponding IPC construct. The grow_ary()
@@ -60,7 +70,7 @@ the remainder of the new, updates the ids->entries pointer to point to
the new array, and invokes ipc_rcu_putref() to free up the old array. the new array, and invokes ipc_rcu_putref() to free up the old array.
Note that rcu_assign_pointer() is used to update the ids->entries pointer, Note that rcu_assign_pointer() is used to update the ids->entries pointer,
which includes any memory barriers required on whatever architecture which includes any memory barriers required on whatever architecture
you are running on. you are running on::
static int grow_ary(struct ipc_ids* ids, int newsize) static int grow_ary(struct ipc_ids* ids, int newsize)
{ {
@@ -112,7 +122,7 @@ a simple check suffices. The pointer to the structure corresponding
to the desired IPC object is placed in "out", with NULL indicating to the desired IPC object is placed in "out", with NULL indicating
a non-existent entry. After acquiring "out->lock", the "out->deleted" a non-existent entry. After acquiring "out->lock", the "out->deleted"
flag indicates whether the IPC object is in the process of being flag indicates whether the IPC object is in the process of being
deleted, and, if not, the pointer is returned. deleted, and, if not, the pointer is returned::
struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id) struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id)
{ {
@@ -144,8 +154,10 @@ deleted, and, if not, the pointer is returned.
return out; return out;
} }
.. _answer_quick_quiz_seqlock:
Answer to Quick Quiz: Answer to Quick Quiz:
Why is it so important that updates be rare when using seqlock?
The reason that it is important that updates be rare when The reason that it is important that updates be rare when
using seqlock is that frequent updates can livelock readers. using seqlock is that frequent updates can livelock readers.

View File

@@ -7,8 +7,13 @@ RCU concepts
.. toctree:: .. toctree::
:maxdepth: 3 :maxdepth: 3
arrayRCU
rcubarrier
rcu_dereference
whatisRCU
rcu rcu
listRCU listRCU
NMI-RCU
UP UP
Design/Memory-Ordering/Tree-RCU-Memory-Ordering Design/Memory-Ordering/Tree-RCU-Memory-Ordering

View File

@@ -99,7 +99,7 @@ With this change, the rcu_dereference() is always within an RCU
read-side critical section, which again would have suppressed the read-side critical section, which again would have suppressed the
above lockdep-RCU splat. above lockdep-RCU splat.
But in this particular case, we don't actually deference the pointer But in this particular case, we don't actually dereference the pointer
returned from rcu_dereference(). Instead, that pointer is just compared returned from rcu_dereference(). Instead, that pointer is just compared
to the cic pointer, which means that the rcu_dereference() can be replaced to the cic pointer, which means that the rcu_dereference() can be replaced
by rcu_access_pointer() as follows: by rcu_access_pointer() as follows:

View File

@@ -1,4 +1,7 @@
.. _rcu_dereference_doc:
PROPER CARE AND FEEDING OF RETURN VALUES FROM rcu_dereference() PROPER CARE AND FEEDING OF RETURN VALUES FROM rcu_dereference()
===============================================================
Most of the time, you can use values from rcu_dereference() or one of Most of the time, you can use values from rcu_dereference() or one of
the similar primitives without worries. Dereferencing (prefix "*"), the similar primitives without worries. Dereferencing (prefix "*"),
@@ -8,7 +11,7 @@ subtraction of constants, and casts all work quite naturally and safely.
It is nevertheless possible to get into trouble with other operations. It is nevertheless possible to get into trouble with other operations.
Follow these rules to keep your RCU code working properly: Follow these rules to keep your RCU code working properly:
o You must use one of the rcu_dereference() family of primitives - You must use one of the rcu_dereference() family of primitives
to load an RCU-protected pointer, otherwise CONFIG_PROVE_RCU to load an RCU-protected pointer, otherwise CONFIG_PROVE_RCU
will complain. Worse yet, your code can see random memory-corruption will complain. Worse yet, your code can see random memory-corruption
bugs due to games that compilers and DEC Alpha can play. bugs due to games that compilers and DEC Alpha can play.
@@ -25,24 +28,24 @@ o You must use one of the rcu_dereference() family of primitives
for an example where the compiler can in fact deduce the exact for an example where the compiler can in fact deduce the exact
value of the pointer, and thus cause misordering. value of the pointer, and thus cause misordering.
o You are only permitted to use rcu_dereference on pointer values. - You are only permitted to use rcu_dereference on pointer values.
The compiler simply knows too much about integral values to The compiler simply knows too much about integral values to
trust it to carry dependencies through integer operations. trust it to carry dependencies through integer operations.
There are a very few exceptions, namely that you can temporarily There are a very few exceptions, namely that you can temporarily
cast the pointer to uintptr_t in order to: cast the pointer to uintptr_t in order to:
o Set bits and clear bits down in the must-be-zero low-order - Set bits and clear bits down in the must-be-zero low-order
bits of that pointer. This clearly means that the pointer bits of that pointer. This clearly means that the pointer
must have alignment constraints, for example, this does must have alignment constraints, for example, this does
-not- work in general for char* pointers. -not- work in general for char* pointers.
o XOR bits to translate pointers, as is done in some - XOR bits to translate pointers, as is done in some
classic buddy-allocator algorithms. classic buddy-allocator algorithms.
It is important to cast the value back to pointer before It is important to cast the value back to pointer before
doing much of anything else with it. doing much of anything else with it.
o Avoid cancellation when using the "+" and "-" infix arithmetic - Avoid cancellation when using the "+" and "-" infix arithmetic
operators. For example, for a given variable "x", avoid operators. For example, for a given variable "x", avoid
"(x-(uintptr_t)x)" for char* pointers. The compiler is within its "(x-(uintptr_t)x)" for char* pointers. The compiler is within its
rights to substitute zero for this sort of expression, so that rights to substitute zero for this sort of expression, so that
@@ -54,16 +57,16 @@ o Avoid cancellation when using the "+" and "-" infix arithmetic
"p+a-b" is safe because its value still necessarily depends on "p+a-b" is safe because its value still necessarily depends on
the rcu_dereference(), thus maintaining proper ordering. the rcu_dereference(), thus maintaining proper ordering.
o If you are using RCU to protect JITed functions, so that the - If you are using RCU to protect JITed functions, so that the
"()" function-invocation operator is applied to a value obtained "()" function-invocation operator is applied to a value obtained
(directly or indirectly) from rcu_dereference(), you may need to (directly or indirectly) from rcu_dereference(), you may need to
interact directly with the hardware to flush instruction caches. interact directly with the hardware to flush instruction caches.
This issue arises on some systems when a newly JITed function is This issue arises on some systems when a newly JITed function is
using the same memory that was used by an earlier JITed function. using the same memory that was used by an earlier JITed function.
o Do not use the results from relational operators ("==", "!=", - Do not use the results from relational operators ("==", "!=",
">", ">=", "<", or "<=") when dereferencing. For example, ">", ">=", "<", or "<=") when dereferencing. For example,
the following (quite strange) code is buggy: the following (quite strange) code is buggy::
int *p; int *p;
int *q; int *q;
@@ -81,11 +84,11 @@ o Do not use the results from relational operators ("==", "!=",
after such branches, but can speculate loads, which can again after such branches, but can speculate loads, which can again
result in misordering bugs. result in misordering bugs.
o Be very careful about comparing pointers obtained from - Be very careful about comparing pointers obtained from
rcu_dereference() against non-NULL values. As Linus Torvalds rcu_dereference() against non-NULL values. As Linus Torvalds
explained, if the two pointers are equal, the compiler could explained, if the two pointers are equal, the compiler could
substitute the pointer you are comparing against for the pointer substitute the pointer you are comparing against for the pointer
obtained from rcu_dereference(). For example: obtained from rcu_dereference(). For example::
p = rcu_dereference(gp); p = rcu_dereference(gp);
if (p == &default_struct) if (p == &default_struct)
@@ -93,7 +96,7 @@ o Be very careful about comparing pointers obtained from
Because the compiler now knows that the value of "p" is exactly Because the compiler now knows that the value of "p" is exactly
the address of the variable "default_struct", it is free to the address of the variable "default_struct", it is free to
transform this code into the following: transform this code into the following::
p = rcu_dereference(gp); p = rcu_dereference(gp);
if (p == &default_struct) if (p == &default_struct)
@@ -105,14 +108,14 @@ o Be very careful about comparing pointers obtained from
However, comparisons are OK in the following cases: However, comparisons are OK in the following cases:
o The comparison was against the NULL pointer. If the - The comparison was against the NULL pointer. If the
compiler knows that the pointer is NULL, you had better compiler knows that the pointer is NULL, you had better
not be dereferencing it anyway. If the comparison is not be dereferencing it anyway. If the comparison is
non-equal, the compiler is none the wiser. Therefore, non-equal, the compiler is none the wiser. Therefore,
it is safe to compare pointers from rcu_dereference() it is safe to compare pointers from rcu_dereference()
against NULL pointers. against NULL pointers.
o The pointer is never dereferenced after being compared. - The pointer is never dereferenced after being compared.
Since there are no subsequent dereferences, the compiler Since there are no subsequent dereferences, the compiler
cannot use anything it learned from the comparison cannot use anything it learned from the comparison
to reorder the non-existent subsequent dereferences. to reorder the non-existent subsequent dereferences.
@@ -124,31 +127,31 @@ o Be very careful about comparing pointers obtained from
dereferenced, rcu_access_pointer() should be used in place dereferenced, rcu_access_pointer() should be used in place
of rcu_dereference(). of rcu_dereference().
o The comparison is against a pointer that references memory - The comparison is against a pointer that references memory
that was initialized "a long time ago." The reason that was initialized "a long time ago." The reason
this is safe is that even if misordering occurs, the this is safe is that even if misordering occurs, the
misordering will not affect the accesses that follow misordering will not affect the accesses that follow
the comparison. So exactly how long ago is "a long the comparison. So exactly how long ago is "a long
time ago"? Here are some possibilities: time ago"? Here are some possibilities:
o Compile time. - Compile time.
o Boot time. - Boot time.
o Module-init time for module code. - Module-init time for module code.
o Prior to kthread creation for kthread code. - Prior to kthread creation for kthread code.
o During some prior acquisition of the lock that - During some prior acquisition of the lock that
we now hold. we now hold.
o Before mod_timer() time for a timer handler. - Before mod_timer() time for a timer handler.
There are many other possibilities involving the Linux There are many other possibilities involving the Linux
kernel's wide array of primitives that cause code to kernel's wide array of primitives that cause code to
be invoked at a later time. be invoked at a later time.
o The pointer being compared against also came from - The pointer being compared against also came from
rcu_dereference(). In this case, both pointers depend rcu_dereference(). In this case, both pointers depend
on one rcu_dereference() or another, so you get proper on one rcu_dereference() or another, so you get proper
ordering either way. ordering either way.
@@ -159,13 +162,13 @@ o Be very careful about comparing pointers obtained from
of such an RCU usage bug is shown in the section titled of such an RCU usage bug is shown in the section titled
"EXAMPLE OF AMPLIFIED RCU-USAGE BUG". "EXAMPLE OF AMPLIFIED RCU-USAGE BUG".
o All of the accesses following the comparison are stores, - All of the accesses following the comparison are stores,
so that a control dependency preserves the needed ordering. so that a control dependency preserves the needed ordering.
That said, it is easy to get control dependencies wrong. That said, it is easy to get control dependencies wrong.
Please see the "CONTROL DEPENDENCIES" section of Please see the "CONTROL DEPENDENCIES" section of
Documentation/memory-barriers.txt for more details. Documentation/memory-barriers.txt for more details.
o The pointers are not equal -and- the compiler does - The pointers are not equal -and- the compiler does
not have enough information to deduce the value of the not have enough information to deduce the value of the
pointer. Note that the volatile cast in rcu_dereference() pointer. Note that the volatile cast in rcu_dereference()
will normally prevent the compiler from knowing too much. will normally prevent the compiler from knowing too much.
@@ -175,7 +178,7 @@ o Be very careful about comparing pointers obtained from
comparison will provide exactly the information that the comparison will provide exactly the information that the
compiler needs to deduce the value of the pointer. compiler needs to deduce the value of the pointer.
o Disable any value-speculation optimizations that your compiler - Disable any value-speculation optimizations that your compiler
might provide, especially if you are making use of feedback-based might provide, especially if you are making use of feedback-based
optimizations that take data collected from prior runs. Such optimizations that take data collected from prior runs. Such
value-speculation optimizations reorder operations by design. value-speculation optimizations reorder operations by design.
@@ -188,11 +191,12 @@ o Disable any value-speculation optimizations that your compiler
EXAMPLE OF AMPLIFIED RCU-USAGE BUG EXAMPLE OF AMPLIFIED RCU-USAGE BUG
----------------------------------
Because updaters can run concurrently with RCU readers, RCU readers can Because updaters can run concurrently with RCU readers, RCU readers can
see stale and/or inconsistent values. If RCU readers need fresh or see stale and/or inconsistent values. If RCU readers need fresh or
consistent values, which they sometimes do, they need to take proper consistent values, which they sometimes do, they need to take proper
precautions. To see this, consider the following code fragment: precautions. To see this, consider the following code fragment::
struct foo { struct foo {
int a; int a;
@@ -244,7 +248,7 @@ to some reordering from the compiler and CPUs is beside the point.
But suppose that the reader needs a consistent view? But suppose that the reader needs a consistent view?
Then one approach is to use locking, for example, as follows: Then one approach is to use locking, for example, as follows::
struct foo { struct foo {
int a; int a;
@@ -299,6 +303,7 @@ As always, use the right tool for the job!
EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH
-----------------------------------------
If a pointer obtained from rcu_dereference() compares not-equal to some If a pointer obtained from rcu_dereference() compares not-equal to some
other pointer, the compiler normally has no clue what the value of the other pointer, the compiler normally has no clue what the value of the
@@ -308,7 +313,7 @@ guarantees that RCU depends on. And the volatile cast in rcu_dereference()
should prevent the compiler from guessing the value. should prevent the compiler from guessing the value.
But without rcu_dereference(), the compiler knows more than you might But without rcu_dereference(), the compiler knows more than you might
expect. Consider the following code fragment: expect. Consider the following code fragment::
struct foo { struct foo {
int a; int a;
@@ -354,6 +359,7 @@ dereference the resulting pointer.
WHICH MEMBER OF THE rcu_dereference() FAMILY SHOULD YOU USE? WHICH MEMBER OF THE rcu_dereference() FAMILY SHOULD YOU USE?
------------------------------------------------------------
First, please avoid using rcu_dereference_raw() and also please avoid First, please avoid using rcu_dereference_raw() and also please avoid
using rcu_dereference_check() and rcu_dereference_protected() with a using rcu_dereference_check() and rcu_dereference_protected() with a
@@ -370,7 +376,7 @@ member of the rcu_dereference() to use in various situations:
2. If the access might be within an RCU read-side critical section 2. If the access might be within an RCU read-side critical section
on the one hand, or protected by (say) my_lock on the other, on the one hand, or protected by (say) my_lock on the other,
use rcu_dereference_check(), for example: use rcu_dereference_check(), for example::
p1 = rcu_dereference_check(p->rcu_protected_pointer, p1 = rcu_dereference_check(p->rcu_protected_pointer,
lockdep_is_held(&my_lock)); lockdep_is_held(&my_lock));
@@ -378,14 +384,14 @@ member of the rcu_dereference() to use in various situations:
3. If the access might be within an RCU read-side critical section 3. If the access might be within an RCU read-side critical section
on the one hand, or protected by either my_lock or your_lock on on the one hand, or protected by either my_lock or your_lock on
the other, again use rcu_dereference_check(), for example: the other, again use rcu_dereference_check(), for example::
p1 = rcu_dereference_check(p->rcu_protected_pointer, p1 = rcu_dereference_check(p->rcu_protected_pointer,
lockdep_is_held(&my_lock) || lockdep_is_held(&my_lock) ||
lockdep_is_held(&your_lock)); lockdep_is_held(&your_lock));
4. If the access is on the update side, so that it is always protected 4. If the access is on the update side, so that it is always protected
by my_lock, use rcu_dereference_protected(): by my_lock, use rcu_dereference_protected()::
p1 = rcu_dereference_protected(p->rcu_protected_pointer, p1 = rcu_dereference_protected(p->rcu_protected_pointer,
lockdep_is_held(&my_lock)); lockdep_is_held(&my_lock));
@@ -410,18 +416,19 @@ member of the rcu_dereference() to use in various situations:
SPARSE CHECKING OF RCU-PROTECTED POINTERS SPARSE CHECKING OF RCU-PROTECTED POINTERS
-----------------------------------------
The sparse static-analysis tool checks for direct access to RCU-protected The sparse static-analysis tool checks for direct access to RCU-protected
pointers, which can result in "interesting" bugs due to compiler pointers, which can result in "interesting" bugs due to compiler
optimizations involving invented loads and perhaps also load tearing. optimizations involving invented loads and perhaps also load tearing.
For example, suppose someone mistakenly does something like this: For example, suppose someone mistakenly does something like this::
p = q->rcu_protected_pointer; p = q->rcu_protected_pointer;
do_something_with(p->a); do_something_with(p->a);
do_something_else_with(p->b); do_something_else_with(p->b);
If register pressure is high, the compiler might optimize "p" out If register pressure is high, the compiler might optimize "p" out
of existence, transforming the code to something like this: of existence, transforming the code to something like this::
do_something_with(q->rcu_protected_pointer->a); do_something_with(q->rcu_protected_pointer->a);
do_something_else_with(q->rcu_protected_pointer->b); do_something_else_with(q->rcu_protected_pointer->b);
@@ -435,7 +442,7 @@ Load tearing could of course result in dereferencing a mashup of a pair
of pointers, which also might fatally disappoint your code. of pointers, which also might fatally disappoint your code.
These problems could have been avoided simply by making the code instead These problems could have been avoided simply by making the code instead
read as follows: read as follows::
p = rcu_dereference(q->rcu_protected_pointer); p = rcu_dereference(q->rcu_protected_pointer);
do_something_with(p->a); do_something_with(p->a);
@@ -448,7 +455,7 @@ or as a formal parameter, with "__rcu", which tells sparse to complain if
this pointer is accessed directly. It will also cause sparse to complain this pointer is accessed directly. It will also cause sparse to complain
if a pointer not marked with "__rcu" is accessed using rcu_dereference() if a pointer not marked with "__rcu" is accessed using rcu_dereference()
and friends. For example, ->rcu_protected_pointer might be declared as and friends. For example, ->rcu_protected_pointer might be declared as
follows: follows::
struct foo __rcu *rcu_protected_pointer; struct foo __rcu *rcu_protected_pointer;

View File

@@ -1,4 +1,7 @@
.. _rcu_barrier:
RCU and Unloadable Modules RCU and Unloadable Modules
==========================
[Originally published in LWN Jan. 14, 2007: http://lwn.net/Articles/217484/] [Originally published in LWN Jan. 14, 2007: http://lwn.net/Articles/217484/]
@@ -21,7 +24,7 @@ given that readers might well leave absolutely no trace of their
presence? There is a synchronize_rcu() primitive that blocks until all presence? There is a synchronize_rcu() primitive that blocks until all
pre-existing readers have completed. An updater wishing to delete an pre-existing readers have completed. An updater wishing to delete an
element p from a linked list might do the following, while holding an element p from a linked list might do the following, while holding an
appropriate lock, of course: appropriate lock, of course::
list_del_rcu(p); list_del_rcu(p);
synchronize_rcu(); synchronize_rcu();
@@ -32,13 +35,13 @@ primitive must be used instead. This primitive takes a pointer to an
rcu_head struct placed within the RCU-protected data structure and rcu_head struct placed within the RCU-protected data structure and
another pointer to a function that may be invoked later to free that another pointer to a function that may be invoked later to free that
structure. Code to delete an element p from the linked list from IRQ structure. Code to delete an element p from the linked list from IRQ
context might then be as follows: context might then be as follows::
list_del_rcu(p); list_del_rcu(p);
call_rcu(&p->rcu, p_callback); call_rcu(&p->rcu, p_callback);
Since call_rcu() never blocks, this code can safely be used from within Since call_rcu() never blocks, this code can safely be used from within
IRQ context. The function p_callback() might be defined as follows: IRQ context. The function p_callback() might be defined as follows::
static void p_callback(struct rcu_head *rp) static void p_callback(struct rcu_head *rp)
{ {
@@ -49,6 +52,7 @@ IRQ context. The function p_callback() might be defined as follows:
Unloading Modules That Use call_rcu() Unloading Modules That Use call_rcu()
-------------------------------------
But what if p_callback is defined in an unloadable module? But what if p_callback is defined in an unloadable module?
@@ -69,10 +73,11 @@ in realtime kernels in order to avoid excessive scheduling latencies.
rcu_barrier() rcu_barrier()
-------------
We instead need the rcu_barrier() primitive. Rather than waiting for We instead need the rcu_barrier() primitive. Rather than waiting for
a grace period to elapse, rcu_barrier() waits for all outstanding RCU a grace period to elapse, rcu_barrier() waits for all outstanding RCU
callbacks to complete. Please note that rcu_barrier() does -not- imply callbacks to complete. Please note that rcu_barrier() does **not** imply
synchronize_rcu(), in particular, if there are no RCU callbacks queued synchronize_rcu(), in particular, if there are no RCU callbacks queued
anywhere, rcu_barrier() is within its rights to return immediately, anywhere, rcu_barrier() is within its rights to return immediately,
without waiting for a grace period to elapse. without waiting for a grace period to elapse.
@@ -88,79 +93,79 @@ must match the flavor of rcu_barrier() with that of call_rcu(). If your
module uses multiple flavors of call_rcu(), then it must also use multiple module uses multiple flavors of call_rcu(), then it must also use multiple
flavors of rcu_barrier() when unloading that module. For example, if flavors of rcu_barrier() when unloading that module. For example, if
it uses call_rcu(), call_srcu() on srcu_struct_1, and call_srcu() on it uses call_rcu(), call_srcu() on srcu_struct_1, and call_srcu() on
srcu_struct_2(), then the following three lines of code will be required srcu_struct_2, then the following three lines of code will be required
when unloading: when unloading::
1 rcu_barrier(); 1 rcu_barrier();
2 srcu_barrier(&srcu_struct_1); 2 srcu_barrier(&srcu_struct_1);
3 srcu_barrier(&srcu_struct_2); 3 srcu_barrier(&srcu_struct_2);
The rcutorture module makes use of rcu_barrier() in its exit function The rcutorture module makes use of rcu_barrier() in its exit function
as follows: as follows::
1 static void 1 static void
2 rcu_torture_cleanup(void) 2 rcu_torture_cleanup(void)
3 { 3 {
4 int i; 4 int i;
5 5
6 fullstop = 1; 6 fullstop = 1;
7 if (shuffler_task != NULL) { 7 if (shuffler_task != NULL) {
8 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); 8 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
9 kthread_stop(shuffler_task); 9 kthread_stop(shuffler_task);
10 } 10 }
11 shuffler_task = NULL; 11 shuffler_task = NULL;
12 12
13 if (writer_task != NULL) { 13 if (writer_task != NULL) {
14 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); 14 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
15 kthread_stop(writer_task); 15 kthread_stop(writer_task);
16 } 16 }
17 writer_task = NULL; 17 writer_task = NULL;
18 18
19 if (reader_tasks != NULL) { 19 if (reader_tasks != NULL) {
20 for (i = 0; i < nrealreaders; i++) { 20 for (i = 0; i < nrealreaders; i++) {
21 if (reader_tasks[i] != NULL) { 21 if (reader_tasks[i] != NULL) {
22 VERBOSE_PRINTK_STRING( 22 VERBOSE_PRINTK_STRING(
23 "Stopping rcu_torture_reader task"); 23 "Stopping rcu_torture_reader task");
24 kthread_stop(reader_tasks[i]); 24 kthread_stop(reader_tasks[i]);
25 } 25 }
26 reader_tasks[i] = NULL; 26 reader_tasks[i] = NULL;
27 } 27 }
28 kfree(reader_tasks); 28 kfree(reader_tasks);
29 reader_tasks = NULL; 29 reader_tasks = NULL;
30 } 30 }
31 rcu_torture_current = NULL; 31 rcu_torture_current = NULL;
32 32
33 if (fakewriter_tasks != NULL) { 33 if (fakewriter_tasks != NULL) {
34 for (i = 0; i < nfakewriters; i++) { 34 for (i = 0; i < nfakewriters; i++) {
35 if (fakewriter_tasks[i] != NULL) { 35 if (fakewriter_tasks[i] != NULL) {
36 VERBOSE_PRINTK_STRING( 36 VERBOSE_PRINTK_STRING(
37 "Stopping rcu_torture_fakewriter task"); 37 "Stopping rcu_torture_fakewriter task");
38 kthread_stop(fakewriter_tasks[i]); 38 kthread_stop(fakewriter_tasks[i]);
39 } 39 }
40 fakewriter_tasks[i] = NULL; 40 fakewriter_tasks[i] = NULL;
41 } 41 }
42 kfree(fakewriter_tasks); 42 kfree(fakewriter_tasks);
43 fakewriter_tasks = NULL; 43 fakewriter_tasks = NULL;
44 } 44 }
45 45
46 if (stats_task != NULL) { 46 if (stats_task != NULL) {
47 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); 47 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
48 kthread_stop(stats_task); 48 kthread_stop(stats_task);
49 } 49 }
50 stats_task = NULL; 50 stats_task = NULL;
51 51
52 /* Wait for all RCU callbacks to fire. */ 52 /* Wait for all RCU callbacks to fire. */
53 rcu_barrier(); 53 rcu_barrier();
54 54
55 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 55 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
56 56
57 if (cur_ops->cleanup != NULL) 57 if (cur_ops->cleanup != NULL)
58 cur_ops->cleanup(); 58 cur_ops->cleanup();
59 if (atomic_read(&n_rcu_torture_error)) 59 if (atomic_read(&n_rcu_torture_error))
60 rcu_torture_print_module_parms("End of test: FAILURE"); 60 rcu_torture_print_module_parms("End of test: FAILURE");
61 else 61 else
62 rcu_torture_print_module_parms("End of test: SUCCESS"); 62 rcu_torture_print_module_parms("End of test: SUCCESS");
63 } 63 }
Line 6 sets a global variable that prevents any RCU callbacks from Line 6 sets a global variable that prevents any RCU callbacks from
re-posting themselves. This will not be necessary in most cases, since re-posting themselves. This will not be necessary in most cases, since
@@ -176,9 +181,14 @@ for any pre-existing callbacks to complete.
Then lines 55-62 print status and do operation-specific cleanup, and Then lines 55-62 print status and do operation-specific cleanup, and
then return, permitting the module-unload operation to be completed. then return, permitting the module-unload operation to be completed.
Quick Quiz #1: Is there any other situation where rcu_barrier() might .. _rcubarrier_quiz_1:
Quick Quiz #1:
Is there any other situation where rcu_barrier() might
be required? be required?
:ref:`Answer to Quick Quiz #1 <answer_rcubarrier_quiz_1>`
Your module might have additional complications. For example, if your Your module might have additional complications. For example, if your
module invokes call_rcu() from timers, you will need to first cancel all module invokes call_rcu() from timers, you will need to first cancel all
the timers, and only then invoke rcu_barrier() to wait for any remaining the timers, and only then invoke rcu_barrier() to wait for any remaining
@@ -188,11 +198,12 @@ Of course, if you module uses call_rcu(), you will need to invoke
rcu_barrier() before unloading. Similarly, if your module uses rcu_barrier() before unloading. Similarly, if your module uses
call_srcu(), you will need to invoke srcu_barrier() before unloading, call_srcu(), you will need to invoke srcu_barrier() before unloading,
and on the same srcu_struct structure. If your module uses call_rcu() and on the same srcu_struct structure. If your module uses call_rcu()
-and- call_srcu(), then you will need to invoke rcu_barrier() -and- **and** call_srcu(), then you will need to invoke rcu_barrier() **and**
srcu_barrier(). srcu_barrier().
Implementing rcu_barrier() Implementing rcu_barrier()
--------------------------
Dipankar Sarma's implementation of rcu_barrier() makes use of the fact Dipankar Sarma's implementation of rcu_barrier() makes use of the fact
that RCU callbacks are never reordered once queued on one of the per-CPU that RCU callbacks are never reordered once queued on one of the per-CPU
@@ -200,19 +211,19 @@ queues. His implementation queues an RCU callback on each of the per-CPU
callback queues, and then waits until they have all started executing, at callback queues, and then waits until they have all started executing, at
which point, all earlier RCU callbacks are guaranteed to have completed. which point, all earlier RCU callbacks are guaranteed to have completed.
The original code for rcu_barrier() was as follows: The original code for rcu_barrier() was as follows::
1 void rcu_barrier(void) 1 void rcu_barrier(void)
2 { 2 {
3 BUG_ON(in_interrupt()); 3 BUG_ON(in_interrupt());
4 /* Take cpucontrol mutex to protect against CPU hotplug */ 4 /* Take cpucontrol mutex to protect against CPU hotplug */
5 mutex_lock(&rcu_barrier_mutex); 5 mutex_lock(&rcu_barrier_mutex);
6 init_completion(&rcu_barrier_completion); 6 init_completion(&rcu_barrier_completion);
7 atomic_set(&rcu_barrier_cpu_count, 0); 7 atomic_set(&rcu_barrier_cpu_count, 0);
8 on_each_cpu(rcu_barrier_func, NULL, 0, 1); 8 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
9 wait_for_completion(&rcu_barrier_completion); 9 wait_for_completion(&rcu_barrier_completion);
10 mutex_unlock(&rcu_barrier_mutex); 10 mutex_unlock(&rcu_barrier_mutex);
11 } 11 }
Line 3 verifies that the caller is in process context, and lines 5 and 10 Line 3 verifies that the caller is in process context, and lines 5 and 10
use rcu_barrier_mutex to ensure that only one rcu_barrier() is using the use rcu_barrier_mutex to ensure that only one rcu_barrier() is using the
@@ -226,18 +237,18 @@ This code was rewritten in 2008 and several times thereafter, but this
still gives the general idea. still gives the general idea.
The rcu_barrier_func() runs on each CPU, where it invokes call_rcu() The rcu_barrier_func() runs on each CPU, where it invokes call_rcu()
to post an RCU callback, as follows: to post an RCU callback, as follows::
1 static void rcu_barrier_func(void *notused) 1 static void rcu_barrier_func(void *notused)
2 { 2 {
3 int cpu = smp_processor_id(); 3 int cpu = smp_processor_id();
4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); 4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
5 struct rcu_head *head; 5 struct rcu_head *head;
6 6
7 head = &rdp->barrier; 7 head = &rdp->barrier;
8 atomic_inc(&rcu_barrier_cpu_count); 8 atomic_inc(&rcu_barrier_cpu_count);
9 call_rcu(head, rcu_barrier_callback); 9 call_rcu(head, rcu_barrier_callback);
10 } 10 }
Lines 3 and 4 locate RCU's internal per-CPU rcu_data structure, Lines 3 and 4 locate RCU's internal per-CPU rcu_data structure,
which contains the struct rcu_head that is needed for the later call to which contains the struct rcu_head that is needed for the later call to
@@ -248,20 +259,25 @@ the current CPU's queue.
The rcu_barrier_callback() function simply atomically decrements the The rcu_barrier_callback() function simply atomically decrements the
rcu_barrier_cpu_count variable and finalizes the completion when it rcu_barrier_cpu_count variable and finalizes the completion when it
reaches zero, as follows: reaches zero, as follows::
1 static void rcu_barrier_callback(struct rcu_head *notused) 1 static void rcu_barrier_callback(struct rcu_head *notused)
2 { 2 {
3 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 3 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
4 complete(&rcu_barrier_completion); 4 complete(&rcu_barrier_completion);
5 } 5 }
Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes .. _rcubarrier_quiz_2:
Quick Quiz #2:
What happens if CPU 0's rcu_barrier_func() executes
immediately (thus incrementing rcu_barrier_cpu_count to the immediately (thus incrementing rcu_barrier_cpu_count to the
value one), but the other CPUs' rcu_barrier_func() invocations value one), but the other CPUs' rcu_barrier_func() invocations
are delayed for a full grace period? Couldn't this result in are delayed for a full grace period? Couldn't this result in
rcu_barrier() returning prematurely? rcu_barrier() returning prematurely?
:ref:`Answer to Quick Quiz #2 <answer_rcubarrier_quiz_2>`
The current rcu_barrier() implementation is more complex, due to the need The current rcu_barrier() implementation is more complex, due to the need
to avoid disturbing idle CPUs (especially on battery-powered systems) to avoid disturbing idle CPUs (especially on battery-powered systems)
and the need to minimally disturb non-idle CPUs in real-time systems. and the need to minimally disturb non-idle CPUs in real-time systems.
@@ -269,6 +285,7 @@ However, the code above illustrates the concepts.
rcu_barrier() Summary rcu_barrier() Summary
---------------------
The rcu_barrier() primitive has seen relatively little use, since most The rcu_barrier() primitive has seen relatively little use, since most
code using RCU is in the core kernel rather than in modules. However, if code using RCU is in the core kernel rather than in modules. However, if
@@ -277,8 +294,12 @@ so that your module may be safely unloaded.
Answers to Quick Quizzes Answers to Quick Quizzes
------------------------
Quick Quiz #1: Is there any other situation where rcu_barrier() might .. _answer_rcubarrier_quiz_1:
Quick Quiz #1:
Is there any other situation where rcu_barrier() might
be required? be required?
Answer: Interestingly enough, rcu_barrier() was not originally Answer: Interestingly enough, rcu_barrier() was not originally
@@ -292,7 +313,12 @@ Answer: Interestingly enough, rcu_barrier() was not originally
implementing rcutorture, and found that rcu_barrier() solves implementing rcutorture, and found that rcu_barrier() solves
this problem as well. this problem as well.
Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes :ref:`Back to Quick Quiz #1 <rcubarrier_quiz_1>`
.. _answer_rcubarrier_quiz_2:
Quick Quiz #2:
What happens if CPU 0's rcu_barrier_func() executes
immediately (thus incrementing rcu_barrier_cpu_count to the immediately (thus incrementing rcu_barrier_cpu_count to the
value one), but the other CPU's rcu_barrier_func() invocations value one), but the other CPU's rcu_barrier_func() invocations
are delayed for a full grace period? Couldn't this result in are delayed for a full grace period? Couldn't this result in
@@ -323,3 +349,5 @@ Answer: This cannot happen. The reason is that on_each_cpu() has its last
is to add an rcu_read_lock() before line 8 of rcu_barrier() is to add an rcu_read_lock() before line 8 of rcu_barrier()
and an rcu_read_unlock() after line 8 of this same function. If and an rcu_read_unlock() after line 8 of this same function. If
you can think of a better change, please let me know! you can think of a better change, please let me know!
:ref:`Back to Quick Quiz #2 <rcubarrier_quiz_2>`

View File

@@ -225,18 +225,13 @@ an estimate of the total number of RCU callbacks queued across all CPUs
In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed
for each CPU: for each CPU:
0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 Nonlazy posted: ..D 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 dyntick_enabled: 1
The "last_accelerate:" prints the low-order 16 bits (in hex) of the The "last_accelerate:" prints the low-order 16 bits (in hex) of the
jiffies counter when this CPU last invoked rcu_try_advance_all_cbs() jiffies counter when this CPU last invoked rcu_try_advance_all_cbs()
from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from
rcu_prepare_for_idle(). The "Nonlazy posted:" indicates lazy-callback rcu_prepare_for_idle(). "dyntick_enabled: 1" indicates that dyntick-idle
status, so that an "l" indicates that all callbacks were lazy at the start processing is enabled.
of the last idle period and an "L" indicates that there are currently
no non-lazy callbacks (in both cases, "." is printed otherwise, as
shown above) and "D" indicates that dyntick-idle processing is enabled
("." is printed otherwise, for example, if disabled via the "nohz="
kernel boot parameter).
If the grace period ends just as the stall warning starts printing, If the grace period ends just as the stall warning starts printing,
there will be a spurious stall-warning message, which will include there will be a spurious stall-warning message, which will include

View File

@@ -1,15 +1,18 @@
.. _whatisrcu_doc:
What is RCU? -- "Read, Copy, Update" What is RCU? -- "Read, Copy, Update"
======================================
Please note that the "What is RCU?" LWN series is an excellent place Please note that the "What is RCU?" LWN series is an excellent place
to start learning about RCU: to start learning about RCU:
1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/ | 1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/
2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/ | 2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/
3. RCU part 3: the RCU API http://lwn.net/Articles/264090/ | 3. RCU part 3: the RCU API http://lwn.net/Articles/264090/
4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/ | 4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/
2010 Big API Table http://lwn.net/Articles/419086/ | 2010 Big API Table http://lwn.net/Articles/419086/
5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/ | 5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/
2014 Big API Table http://lwn.net/Articles/609973/ | 2014 Big API Table http://lwn.net/Articles/609973/
What is RCU? What is RCU?
@@ -24,14 +27,21 @@ the experience has been that different people must take different paths
to arrive at an understanding of RCU. This document provides several to arrive at an understanding of RCU. This document provides several
different paths, as follows: different paths, as follows:
1. RCU OVERVIEW :ref:`1. RCU OVERVIEW <1_whatisRCU>`
2. WHAT IS RCU'S CORE API?
3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API? :ref:`2. WHAT IS RCU'S CORE API? <2_whatisRCU>`
4. WHAT IF MY UPDATING THREAD CANNOT BLOCK?
5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU? :ref:`3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API? <3_whatisRCU>`
6. ANALOGY WITH READER-WRITER LOCKING
7. FULL LIST OF RCU APIs :ref:`4. WHAT IF MY UPDATING THREAD CANNOT BLOCK? <4_whatisRCU>`
8. ANSWERS TO QUICK QUIZZES
:ref:`5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU? <5_whatisRCU>`
:ref:`6. ANALOGY WITH READER-WRITER LOCKING <6_whatisRCU>`
:ref:`7. FULL LIST OF RCU APIs <7_whatisRCU>`
:ref:`8. ANSWERS TO QUICK QUIZZES <8_whatisRCU>`
People who prefer starting with a conceptual overview should focus on People who prefer starting with a conceptual overview should focus on
Section 1, though most readers will profit by reading this section at Section 1, though most readers will profit by reading this section at
@@ -49,8 +59,10 @@ everything, feel free to read the whole thing -- but if you are really
that type of person, you have perused the source code and will therefore that type of person, you have perused the source code and will therefore
never need this document anyway. ;-) never need this document anyway. ;-)
.. _1_whatisRCU:
1. RCU OVERVIEW 1. RCU OVERVIEW
----------------
The basic idea behind RCU is to split updates into "removal" and The basic idea behind RCU is to split updates into "removal" and
"reclamation" phases. The removal phase removes references to data items "reclamation" phases. The removal phase removes references to data items
@@ -116,8 +128,10 @@ So how the heck can a reclaimer tell when a reader is done, given
that readers are not doing any sort of synchronization operations??? that readers are not doing any sort of synchronization operations???
Read on to learn about how RCU's API makes this easy. Read on to learn about how RCU's API makes this easy.
.. _2_whatisRCU:
2. WHAT IS RCU'S CORE API? 2. WHAT IS RCU'S CORE API?
---------------------------
The core RCU API is quite small: The core RCU API is quite small:
@@ -136,7 +150,7 @@ later. See the kernel docbook documentation for more info, or look directly
at the function header comments. at the function header comments.
rcu_read_lock() rcu_read_lock()
^^^^^^^^^^^^^^^
void rcu_read_lock(void); void rcu_read_lock(void);
Used by a reader to inform the reclaimer that the reader is Used by a reader to inform the reclaimer that the reader is
@@ -150,7 +164,7 @@ rcu_read_lock()
longer-term references to data structures. longer-term references to data structures.
rcu_read_unlock() rcu_read_unlock()
^^^^^^^^^^^^^^^^^
void rcu_read_unlock(void); void rcu_read_unlock(void);
Used by a reader to inform the reclaimer that the reader is Used by a reader to inform the reclaimer that the reader is
@@ -158,15 +172,15 @@ rcu_read_unlock()
read-side critical sections may be nested and/or overlapping. read-side critical sections may be nested and/or overlapping.
synchronize_rcu() synchronize_rcu()
^^^^^^^^^^^^^^^^^
void synchronize_rcu(void); void synchronize_rcu(void);
Marks the end of updater code and the beginning of reclaimer Marks the end of updater code and the beginning of reclaimer
code. It does this by blocking until all pre-existing RCU code. It does this by blocking until all pre-existing RCU
read-side critical sections on all CPUs have completed. read-side critical sections on all CPUs have completed.
Note that synchronize_rcu() will -not- necessarily wait for Note that synchronize_rcu() will **not** necessarily wait for
any subsequent RCU read-side critical sections to complete. any subsequent RCU read-side critical sections to complete.
For example, consider the following sequence of events: For example, consider the following sequence of events::
CPU 0 CPU 1 CPU 2 CPU 0 CPU 1 CPU 2
----------------- ------------------------- --------------- ----------------- ------------------------- ---------------
@@ -182,7 +196,7 @@ synchronize_rcu()
any that begin after synchronize_rcu() is invoked. any that begin after synchronize_rcu() is invoked.
Of course, synchronize_rcu() does not necessarily return Of course, synchronize_rcu() does not necessarily return
-immediately- after the last pre-existing RCU read-side critical **immediately** after the last pre-existing RCU read-side critical
section completes. For one thing, there might well be scheduling section completes. For one thing, there might well be scheduling
delays. For another thing, many RCU implementations process delays. For another thing, many RCU implementations process
requests in batches in order to improve efficiencies, which can requests in batches in order to improve efficiencies, which can
@@ -211,10 +225,10 @@ synchronize_rcu()
checklist.txt for some approaches to limiting the update rate. checklist.txt for some approaches to limiting the update rate.
rcu_assign_pointer() rcu_assign_pointer()
^^^^^^^^^^^^^^^^^^^^
void rcu_assign_pointer(p, typeof(p) v); void rcu_assign_pointer(p, typeof(p) v);
Yes, rcu_assign_pointer() -is- implemented as a macro, though it Yes, rcu_assign_pointer() **is** implemented as a macro, though it
would be cool to be able to declare a function in this manner. would be cool to be able to declare a function in this manner.
(Compiler experts will no doubt disagree.) (Compiler experts will no doubt disagree.)
@@ -231,7 +245,7 @@ rcu_assign_pointer()
the _rcu list-manipulation primitives such as list_add_rcu(). the _rcu list-manipulation primitives such as list_add_rcu().
rcu_dereference() rcu_dereference()
^^^^^^^^^^^^^^^^^
typeof(p) rcu_dereference(p); typeof(p) rcu_dereference(p);
Like rcu_assign_pointer(), rcu_dereference() must be implemented Like rcu_assign_pointer(), rcu_dereference() must be implemented
@@ -248,13 +262,13 @@ rcu_dereference()
Common coding practice uses rcu_dereference() to copy an Common coding practice uses rcu_dereference() to copy an
RCU-protected pointer to a local variable, then dereferences RCU-protected pointer to a local variable, then dereferences
this local variable, for example as follows: this local variable, for example as follows::
p = rcu_dereference(head.next); p = rcu_dereference(head.next);
return p->data; return p->data;
However, in this case, one could just as easily combine these However, in this case, one could just as easily combine these
into one statement: into one statement::
return rcu_dereference(head.next)->data; return rcu_dereference(head.next)->data;
@@ -266,8 +280,8 @@ rcu_dereference()
unnecessary overhead on Alpha CPUs. unnecessary overhead on Alpha CPUs.
Note that the value returned by rcu_dereference() is valid Note that the value returned by rcu_dereference() is valid
only within the enclosing RCU read-side critical section [1]. only within the enclosing RCU read-side critical section [1]_.
For example, the following is -not- legal: For example, the following is **not** legal::
rcu_read_lock(); rcu_read_lock();
p = rcu_dereference(head.next); p = rcu_dereference(head.next);
@@ -290,9 +304,9 @@ rcu_dereference()
at any time, including immediately after the rcu_dereference(). at any time, including immediately after the rcu_dereference().
And, again like rcu_assign_pointer(), rcu_dereference() is And, again like rcu_assign_pointer(), rcu_dereference() is
typically used indirectly, via the _rcu list-manipulation typically used indirectly, via the _rcu list-manipulation
primitives, such as list_for_each_entry_rcu() [2]. primitives, such as list_for_each_entry_rcu() [2]_.
[1] The variant rcu_dereference_protected() can be used outside .. [1] The variant rcu_dereference_protected() can be used outside
of an RCU read-side critical section as long as the usage is of an RCU read-side critical section as long as the usage is
protected by locks acquired by the update-side code. This variant protected by locks acquired by the update-side code. This variant
avoids the lockdep warning that would happen when using (for avoids the lockdep warning that would happen when using (for
@@ -305,7 +319,7 @@ rcu_dereference()
a lockdep splat is emitted. See Documentation/RCU/Design/Requirements/Requirements.rst a lockdep splat is emitted. See Documentation/RCU/Design/Requirements/Requirements.rst
and the API's code comments for more details and example usage. and the API's code comments for more details and example usage.
[2] If the list_for_each_entry_rcu() instance might be used by .. [2] If the list_for_each_entry_rcu() instance might be used by
update-side code as well as by RCU readers, then an additional update-side code as well as by RCU readers, then an additional
lockdep expression can be added to its list of arguments. lockdep expression can be added to its list of arguments.
For example, given an additional "lock_is_held(&mylock)" argument, For example, given an additional "lock_is_held(&mylock)" argument,
@@ -315,6 +329,7 @@ rcu_dereference()
The following diagram shows how each API communicates among the The following diagram shows how each API communicates among the
reader, updater, and reclaimer. reader, updater, and reclaimer.
::
rcu_assign_pointer() rcu_assign_pointer()
@@ -375,12 +390,16 @@ c. RCU applied to scheduler and interrupt/NMI-handler tasks.
Again, most uses will be of (a). The (b) and (c) cases are important Again, most uses will be of (a). The (b) and (c) cases are important
for specialized uses, but are relatively uncommon. for specialized uses, but are relatively uncommon.
.. _3_whatisRCU:
3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API? 3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API?
-----------------------------------------------
This section shows a simple use of the core RCU API to protect a This section shows a simple use of the core RCU API to protect a
global pointer to a dynamically allocated structure. More-typical global pointer to a dynamically allocated structure. More-typical
uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt. uses of RCU may be found in :ref:`listRCU.rst <list_rcu_doc>`,
:ref:`arrayRCU.rst <array_rcu_doc>`, and :ref:`NMI-RCU.rst <NMI_rcu_doc>`.
::
struct foo { struct foo {
int a; int a;
@@ -440,40 +459,43 @@ uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt.
So, to sum up: So, to sum up:
o Use rcu_read_lock() and rcu_read_unlock() to guard RCU - Use rcu_read_lock() and rcu_read_unlock() to guard RCU
read-side critical sections. read-side critical sections.
o Within an RCU read-side critical section, use rcu_dereference() - Within an RCU read-side critical section, use rcu_dereference()
to dereference RCU-protected pointers. to dereference RCU-protected pointers.
o Use some solid scheme (such as locks or semaphores) to - Use some solid scheme (such as locks or semaphores) to
keep concurrent updates from interfering with each other. keep concurrent updates from interfering with each other.
o Use rcu_assign_pointer() to update an RCU-protected pointer. - Use rcu_assign_pointer() to update an RCU-protected pointer.
This primitive protects concurrent readers from the updater, This primitive protects concurrent readers from the updater,
-not- concurrent updates from each other! You therefore still **not** concurrent updates from each other! You therefore still
need to use locking (or something similar) to keep concurrent need to use locking (or something similar) to keep concurrent
rcu_assign_pointer() primitives from interfering with each other. rcu_assign_pointer() primitives from interfering with each other.
o Use synchronize_rcu() -after- removing a data element from an - Use synchronize_rcu() **after** removing a data element from an
RCU-protected data structure, but -before- reclaiming/freeing RCU-protected data structure, but **before** reclaiming/freeing
the data element, in order to wait for the completion of all the data element, in order to wait for the completion of all
RCU read-side critical sections that might be referencing that RCU read-side critical sections that might be referencing that
data item. data item.
See checklist.txt for additional rules to follow when using RCU. See checklist.txt for additional rules to follow when using RCU.
And again, more-typical uses of RCU may be found in listRCU.txt, And again, more-typical uses of RCU may be found in :ref:`listRCU.rst
arrayRCU.txt, and NMI-RCU.txt. <list_rcu_doc>`, :ref:`arrayRCU.rst <array_rcu_doc>`, and :ref:`NMI-RCU.rst
<NMI_rcu_doc>`.
.. _4_whatisRCU:
4. WHAT IF MY UPDATING THREAD CANNOT BLOCK? 4. WHAT IF MY UPDATING THREAD CANNOT BLOCK?
--------------------------------------------
In the example above, foo_update_a() blocks until a grace period elapses. In the example above, foo_update_a() blocks until a grace period elapses.
This is quite simple, but in some cases one cannot afford to wait so This is quite simple, but in some cases one cannot afford to wait so
long -- there might be other high-priority work to be done. long -- there might be other high-priority work to be done.
In such cases, one uses call_rcu() rather than synchronize_rcu(). In such cases, one uses call_rcu() rather than synchronize_rcu().
The call_rcu() API is as follows: The call_rcu() API is as follows::
void call_rcu(struct rcu_head * head, void call_rcu(struct rcu_head * head,
void (*func)(struct rcu_head *head)); void (*func)(struct rcu_head *head));
@@ -481,7 +503,7 @@ The call_rcu() API is as follows:
This function invokes func(head) after a grace period has elapsed. This function invokes func(head) after a grace period has elapsed.
This invocation might happen from either softirq or process context, This invocation might happen from either softirq or process context,
so the function is not permitted to block. The foo struct needs to so the function is not permitted to block. The foo struct needs to
have an rcu_head structure added, perhaps as follows: have an rcu_head structure added, perhaps as follows::
struct foo { struct foo {
int a; int a;
@@ -490,7 +512,7 @@ have an rcu_head structure added, perhaps as follows:
struct rcu_head rcu; struct rcu_head rcu;
}; };
The foo_update_a() function might then be written as follows: The foo_update_a() function might then be written as follows::
/* /*
* Create a new struct foo that is the same as the one currently * Create a new struct foo that is the same as the one currently
@@ -520,7 +542,7 @@ The foo_update_a() function might then be written as follows:
call_rcu(&old_fp->rcu, foo_reclaim); call_rcu(&old_fp->rcu, foo_reclaim);
} }
The foo_reclaim() function might appear as follows: The foo_reclaim() function might appear as follows::
void foo_reclaim(struct rcu_head *rp) void foo_reclaim(struct rcu_head *rp)
{ {
@@ -544,7 +566,7 @@ namely foo_reclaim().
The summary of advice is the same as for the previous section, except The summary of advice is the same as for the previous section, except
that we are now using call_rcu() rather than synchronize_rcu(): that we are now using call_rcu() rather than synchronize_rcu():
o Use call_rcu() -after- removing a data element from an - Use call_rcu() **after** removing a data element from an
RCU-protected data structure in order to register a callback RCU-protected data structure in order to register a callback
function that will be invoked after the completion of all RCU function that will be invoked after the completion of all RCU
read-side critical sections that might be referencing that read-side critical sections that might be referencing that
@@ -552,14 +574,16 @@ o Use call_rcu() -after- removing a data element from an
If the callback for call_rcu() is not doing anything more than calling If the callback for call_rcu() is not doing anything more than calling
kfree() on the structure, you can use kfree_rcu() instead of call_rcu() kfree() on the structure, you can use kfree_rcu() instead of call_rcu()
to avoid having to write your own callback: to avoid having to write your own callback::
kfree_rcu(old_fp, rcu); kfree_rcu(old_fp, rcu);
Again, see checklist.txt for additional rules governing the use of RCU. Again, see checklist.txt for additional rules governing the use of RCU.
.. _5_whatisRCU:
5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU? 5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU?
------------------------------------------------
One of the nice things about RCU is that it has extremely simple "toy" One of the nice things about RCU is that it has extremely simple "toy"
implementations that are a good first step towards understanding the implementations that are a good first step towards understanding the
@@ -579,7 +603,7 @@ more details on the current implementation as of early 2004.
5A. "TOY" IMPLEMENTATION #1: LOCKING 5A. "TOY" IMPLEMENTATION #1: LOCKING
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This section presents a "toy" RCU implementation that is based on This section presents a "toy" RCU implementation that is based on
familiar locking primitives. Its overhead makes it a non-starter for familiar locking primitives. Its overhead makes it a non-starter for
real-life use, as does its lack of scalability. It is also unsuitable real-life use, as does its lack of scalability. It is also unsuitable
@@ -591,7 +615,7 @@ you allow nested rcu_read_lock() calls, you can deadlock.
However, it is probably the easiest implementation to relate to, so is However, it is probably the easiest implementation to relate to, so is
a good starting point. a good starting point.
It is extremely simple: It is extremely simple::
static DEFINE_RWLOCK(rcu_gp_mutex); static DEFINE_RWLOCK(rcu_gp_mutex);
@@ -614,7 +638,7 @@ It is extremely simple:
[You can ignore rcu_assign_pointer() and rcu_dereference() without missing [You can ignore rcu_assign_pointer() and rcu_dereference() without missing
much. But here are simplified versions anyway. And whatever you do, much. But here are simplified versions anyway. And whatever you do,
don't forget about them when submitting patches making use of RCU!] don't forget about them when submitting patches making use of RCU!]::
#define rcu_assign_pointer(p, v) \ #define rcu_assign_pointer(p, v) \
({ \ ({ \
@@ -647,18 +671,23 @@ that the only thing that can block rcu_read_lock() is a synchronize_rcu().
But synchronize_rcu() does not acquire any locks while holding rcu_gp_mutex, But synchronize_rcu() does not acquire any locks while holding rcu_gp_mutex,
so there can be no deadlock cycle. so there can be no deadlock cycle.
Quick Quiz #1: Why is this argument naive? How could a deadlock .. _quiz_1:
Quick Quiz #1:
Why is this argument naive? How could a deadlock
occur when using this algorithm in a real-world Linux occur when using this algorithm in a real-world Linux
kernel? How could this deadlock be avoided? kernel? How could this deadlock be avoided?
:ref:`Answers to Quick Quiz <8_whatisRCU>`
5B. "TOY" EXAMPLE #2: CLASSIC RCU 5B. "TOY" EXAMPLE #2: CLASSIC RCU
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This section presents a "toy" RCU implementation that is based on This section presents a "toy" RCU implementation that is based on
"classic RCU". It is also short on performance (but only for updates) and "classic RCU". It is also short on performance (but only for updates) and
on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT
kernels. The definitions of rcu_dereference() and rcu_assign_pointer() kernels. The definitions of rcu_dereference() and rcu_assign_pointer()
are the same as those shown in the preceding section, so they are omitted. are the same as those shown in the preceding section, so they are omitted.
::
void rcu_read_lock(void) { } void rcu_read_lock(void) { }
@@ -683,14 +712,14 @@ CPU in turn. The run_on() primitive can be implemented straightforwardly
in terms of the sched_setaffinity() primitive. Of course, a somewhat less in terms of the sched_setaffinity() primitive. Of course, a somewhat less
"toy" implementation would restore the affinity upon completion rather "toy" implementation would restore the affinity upon completion rather
than just leaving all tasks running on the last CPU, but when I said than just leaving all tasks running on the last CPU, but when I said
"toy", I meant -toy-! "toy", I meant **toy**!
So how the heck is this supposed to work??? So how the heck is this supposed to work???
Remember that it is illegal to block while in an RCU read-side critical Remember that it is illegal to block while in an RCU read-side critical
section. Therefore, if a given CPU executes a context switch, we know section. Therefore, if a given CPU executes a context switch, we know
that it must have completed all preceding RCU read-side critical sections. that it must have completed all preceding RCU read-side critical sections.
Once -all- CPUs have executed a context switch, then -all- preceding Once **all** CPUs have executed a context switch, then **all** preceding
RCU read-side critical sections will have completed. RCU read-side critical sections will have completed.
So, suppose that we remove a data item from its structure and then invoke So, suppose that we remove a data item from its structure and then invoke
@@ -698,19 +727,32 @@ synchronize_rcu(). Once synchronize_rcu() returns, we are guaranteed
that there are no RCU read-side critical sections holding a reference that there are no RCU read-side critical sections holding a reference
to that data item, so we can safely reclaim it. to that data item, so we can safely reclaim it.
Quick Quiz #2: Give an example where Classic RCU's read-side .. _quiz_2:
overhead is -negative-.
Quick Quiz #3: If it is illegal to block in an RCU read-side Quick Quiz #2:
Give an example where Classic RCU's read-side
overhead is **negative**.
:ref:`Answers to Quick Quiz <8_whatisRCU>`
.. _quiz_3:
Quick Quiz #3:
If it is illegal to block in an RCU read-side
critical section, what the heck do you do in critical section, what the heck do you do in
PREEMPT_RT, where normal spinlocks can block??? PREEMPT_RT, where normal spinlocks can block???
:ref:`Answers to Quick Quiz <8_whatisRCU>`
.. _6_whatisRCU:
6. ANALOGY WITH READER-WRITER LOCKING 6. ANALOGY WITH READER-WRITER LOCKING
--------------------------------------
Although RCU can be used in many different ways, a very common use of Although RCU can be used in many different ways, a very common use of
RCU is analogous to reader-writer locking. The following unified RCU is analogous to reader-writer locking. The following unified
diff shows how closely related RCU and reader-writer locking can be. diff shows how closely related RCU and reader-writer locking can be.
::
@@ -5,5 +5,5 @@ struct el { @@ -5,5 +5,5 @@ struct el {
int data; int data;
@@ -762,7 +804,7 @@ diff shows how closely related RCU and reader-writer locking can be.
return 0; return 0;
} }
Or, for those who prefer a side-by-side listing: Or, for those who prefer a side-by-side listing::
1 struct el { 1 struct el { 1 struct el { 1 struct el {
2 struct list_head list; 2 struct list_head list; 2 struct list_head list; 2 struct list_head list;
@@ -774,40 +816,44 @@ Or, for those who prefer a side-by-side listing:
8 rwlock_t listmutex; 8 spinlock_t listmutex; 8 rwlock_t listmutex; 8 spinlock_t listmutex;
9 struct el head; 9 struct el head; 9 struct el head; 9 struct el head;
1 int search(long key, int *result) 1 int search(long key, int *result) ::
2 { 2 {
3 struct list_head *lp; 3 struct list_head *lp;
4 struct el *p; 4 struct el *p;
5 5
6 read_lock(&listmutex); 6 rcu_read_lock();
7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) {
8 if (p->key == key) { 8 if (p->key == key) {
9 *result = p->data; 9 *result = p->data;
10 read_unlock(&listmutex); 10 rcu_read_unlock();
11 return 1; 11 return 1;
12 } 12 }
13 } 13 }
14 read_unlock(&listmutex); 14 rcu_read_unlock();
15 return 0; 15 return 0;
16 } 16 }
1 int delete(long key) 1 int delete(long key) 1 int search(long key, int *result) 1 int search(long key, int *result)
2 { 2 { 2 { 2 {
3 struct el *p; 3 struct el *p; 3 struct list_head *lp; 3 struct list_head *lp;
4 4 4 struct el *p; 4 struct el *p;
5 write_lock(&listmutex); 5 spin_lock(&listmutex); 5 5
6 list_for_each_entry(p, head, lp) { 6 list_for_each_entry(p, head, lp) { 6 read_lock(&listmutex); 6 rcu_read_lock();
7 if (p->key == key) { 7 if (p->key == key) { 7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) {
8 list_del(&p->list); 8 list_del_rcu(&p->list); 8 if (p->key == key) { 8 if (p->key == key) {
9 write_unlock(&listmutex); 9 spin_unlock(&listmutex); 9 *result = p->data; 9 *result = p->data;
10 synchronize_rcu(); 10 read_unlock(&listmutex); 10 rcu_read_unlock();
10 kfree(p); 11 kfree(p); 11 return 1; 11 return 1;
11 return 1; 12 return 1; 12 } 12 }
12 } 13 } 13 } 13 }
13 } 14 } 14 read_unlock(&listmutex); 14 rcu_read_unlock();
14 write_unlock(&listmutex); 15 spin_unlock(&listmutex); 15 return 0; 15 return 0;
15 return 0; 16 return 0; 16 } 16 }
16 } 17 }
::
1 int delete(long key) 1 int delete(long key)
2 { 2 {
3 struct el *p; 3 struct el *p;
4 4
5 write_lock(&listmutex); 5 spin_lock(&listmutex);
6 list_for_each_entry(p, head, lp) { 6 list_for_each_entry(p, head, lp) {
7 if (p->key == key) { 7 if (p->key == key) {
8 list_del(&p->list); 8 list_del_rcu(&p->list);
9 write_unlock(&listmutex); 9 spin_unlock(&listmutex);
10 synchronize_rcu();
10 kfree(p); 11 kfree(p);
11 return 1; 12 return 1;
12 } 13 }
13 } 14 }
14 write_unlock(&listmutex); 15 spin_unlock(&listmutex);
15 return 0; 16 return 0;
16 } 17 }
Either way, the differences are quite small. Read-side locking moves Either way, the differences are quite small. Read-side locking moves
to rcu_read_lock() and rcu_read_unlock(), update-side locking moves from to rcu_read_lock() and rcu_read_unlock(), update-side locking moves from
@@ -825,22 +871,27 @@ delete() can now block. If this is a problem, there is a callback-based
mechanism that never blocks, namely call_rcu() or kfree_rcu(), that can mechanism that never blocks, namely call_rcu() or kfree_rcu(), that can
be used in place of synchronize_rcu(). be used in place of synchronize_rcu().
.. _7_whatisRCU:
7. FULL LIST OF RCU APIs 7. FULL LIST OF RCU APIs
-------------------------
The RCU APIs are documented in docbook-format header comments in the The RCU APIs are documented in docbook-format header comments in the
Linux-kernel source code, but it helps to have a full list of the Linux-kernel source code, but it helps to have a full list of the
APIs, since there does not appear to be a way to categorize them APIs, since there does not appear to be a way to categorize them
in docbook. Here is the list, by category. in docbook. Here is the list, by category.
RCU list traversal: RCU list traversal::
list_entry_rcu list_entry_rcu
list_entry_lockless
list_first_entry_rcu list_first_entry_rcu
list_next_rcu list_next_rcu
list_for_each_entry_rcu list_for_each_entry_rcu
list_for_each_entry_continue_rcu list_for_each_entry_continue_rcu
list_for_each_entry_from_rcu list_for_each_entry_from_rcu
list_first_or_null_rcu
list_next_or_null_rcu
hlist_first_rcu hlist_first_rcu
hlist_next_rcu hlist_next_rcu
hlist_pprev_rcu hlist_pprev_rcu
@@ -854,7 +905,7 @@ RCU list traversal:
hlist_bl_first_rcu hlist_bl_first_rcu
hlist_bl_for_each_entry_rcu hlist_bl_for_each_entry_rcu
RCU pointer/list update: RCU pointer/list update::
rcu_assign_pointer rcu_assign_pointer
list_add_rcu list_add_rcu
@@ -864,10 +915,12 @@ RCU pointer/list update:
hlist_add_behind_rcu hlist_add_behind_rcu
hlist_add_before_rcu hlist_add_before_rcu
hlist_add_head_rcu hlist_add_head_rcu
hlist_add_tail_rcu
hlist_del_rcu hlist_del_rcu
hlist_del_init_rcu hlist_del_init_rcu
hlist_replace_rcu hlist_replace_rcu
list_splice_init_rcu() list_splice_init_rcu
list_splice_tail_init_rcu
hlist_nulls_del_init_rcu hlist_nulls_del_init_rcu
hlist_nulls_del_rcu hlist_nulls_del_rcu
hlist_nulls_add_head_rcu hlist_nulls_add_head_rcu
@@ -876,7 +929,9 @@ RCU pointer/list update:
hlist_bl_del_rcu hlist_bl_del_rcu
hlist_bl_set_first_rcu hlist_bl_set_first_rcu
RCU: Critical sections Grace period Barrier RCU::
Critical sections Grace period Barrier
rcu_read_lock synchronize_net rcu_barrier rcu_read_lock synchronize_net rcu_barrier
rcu_read_unlock synchronize_rcu rcu_read_unlock synchronize_rcu
@@ -885,7 +940,9 @@ RCU: Critical sections Grace period Barrier
rcu_dereference_check kfree_rcu rcu_dereference_check kfree_rcu
rcu_dereference_protected rcu_dereference_protected
bh: Critical sections Grace period Barrier bh::
Critical sections Grace period Barrier
rcu_read_lock_bh call_rcu rcu_barrier rcu_read_lock_bh call_rcu rcu_barrier
rcu_read_unlock_bh synchronize_rcu rcu_read_unlock_bh synchronize_rcu
@@ -896,7 +953,9 @@ bh: Critical sections Grace period Barrier
rcu_dereference_bh_protected rcu_dereference_bh_protected
rcu_read_lock_bh_held rcu_read_lock_bh_held
sched: Critical sections Grace period Barrier sched::
Critical sections Grace period Barrier
rcu_read_lock_sched call_rcu rcu_barrier rcu_read_lock_sched call_rcu rcu_barrier
rcu_read_unlock_sched synchronize_rcu rcu_read_unlock_sched synchronize_rcu
@@ -910,7 +969,9 @@ sched: Critical sections Grace period Barrier
rcu_read_lock_sched_held rcu_read_lock_sched_held
SRCU: Critical sections Grace period Barrier SRCU::
Critical sections Grace period Barrier
srcu_read_lock call_srcu srcu_barrier srcu_read_lock call_srcu srcu_barrier
srcu_read_unlock synchronize_srcu srcu_read_unlock synchronize_srcu
@@ -918,13 +979,14 @@ SRCU: Critical sections Grace period Barrier
srcu_dereference_check srcu_dereference_check
srcu_read_lock_held srcu_read_lock_held
SRCU: Initialization/cleanup SRCU: Initialization/cleanup::
DEFINE_SRCU DEFINE_SRCU
DEFINE_STATIC_SRCU DEFINE_STATIC_SRCU
init_srcu_struct init_srcu_struct
cleanup_srcu_struct cleanup_srcu_struct
All: lockdep-checked RCU-protected pointer access All: lockdep-checked RCU-protected pointer access::
rcu_access_pointer rcu_access_pointer
rcu_dereference_raw rcu_dereference_raw
@@ -974,15 +1036,19 @@ g. Otherwise, use RCU.
Of course, this all assumes that you have determined that RCU is in fact Of course, this all assumes that you have determined that RCU is in fact
the right tool for your job. the right tool for your job.
.. _8_whatisRCU:
8. ANSWERS TO QUICK QUIZZES 8. ANSWERS TO QUICK QUIZZES
----------------------------
Quick Quiz #1: Why is this argument naive? How could a deadlock Quick Quiz #1:
Why is this argument naive? How could a deadlock
occur when using this algorithm in a real-world Linux occur when using this algorithm in a real-world Linux
kernel? [Referring to the lock-based "toy" RCU kernel? [Referring to the lock-based "toy" RCU
algorithm.] algorithm.]
Answer: Consider the following sequence of events: Answer:
Consider the following sequence of events:
1. CPU 0 acquires some unrelated lock, call it 1. CPU 0 acquires some unrelated lock, call it
"problematic_lock", disabling irq via "problematic_lock", disabling irq via
@@ -1021,10 +1087,14 @@ Answer: Consider the following sequence of events:
approach where tasks in RCU read-side critical sections approach where tasks in RCU read-side critical sections
cannot be blocked by tasks executing synchronize_rcu(). cannot be blocked by tasks executing synchronize_rcu().
Quick Quiz #2: Give an example where Classic RCU's read-side :ref:`Back to Quick Quiz #1 <quiz_1>`
overhead is -negative-.
Answer: Imagine a single-CPU system with a non-CONFIG_PREEMPT Quick Quiz #2:
Give an example where Classic RCU's read-side
overhead is **negative**.
Answer:
Imagine a single-CPU system with a non-CONFIG_PREEMPT
kernel where a routing table is used by process-context kernel where a routing table is used by process-context
code, but can be updated by irq-context code (for example, code, but can be updated by irq-context code (for example,
by an "ICMP REDIRECT" packet). The usual way of handling by an "ICMP REDIRECT" packet). The usual way of handling
@@ -1046,11 +1116,15 @@ Answer: Imagine a single-CPU system with a non-CONFIG_PREEMPT
even the theoretical possibility of negative overhead for even the theoretical possibility of negative overhead for
a synchronization primitive is a bit unexpected. ;-) a synchronization primitive is a bit unexpected. ;-)
Quick Quiz #3: If it is illegal to block in an RCU read-side :ref:`Back to Quick Quiz #2 <quiz_2>`
Quick Quiz #3:
If it is illegal to block in an RCU read-side
critical section, what the heck do you do in critical section, what the heck do you do in
PREEMPT_RT, where normal spinlocks can block??? PREEMPT_RT, where normal spinlocks can block???
Answer: Just as PREEMPT_RT permits preemption of spinlock Answer:
Just as PREEMPT_RT permits preemption of spinlock
critical sections, it permits preemption of RCU critical sections, it permits preemption of RCU
read-side critical sections. It also permits read-side critical sections. It also permits
spinlocks blocking while in RCU read-side critical spinlocks blocking while in RCU read-side critical
@@ -1069,6 +1143,7 @@ Answer: Just as PREEMPT_RT permits preemption of spinlock
Besides, how does the computer know what pizza parlor Besides, how does the computer know what pizza parlor
the human being went to??? the human being went to???
:ref:`Back to Quick Quiz #3 <quiz_3>`
ACKNOWLEDGEMENTS ACKNOWLEDGEMENTS

View File

@@ -0,0 +1,62 @@
.. SPDX-License-Identifier: GPL-2.0
===========================
ACPI Fan Performance States
===========================
When the optional _FPS object is present under an ACPI device representing a
fan (for example, PNP0C0B or INT3404), the ACPI fan driver creates additional
"state*" attributes in the sysfs directory of the ACPI device in question.
These attributes list properties of fan performance states.
For more information on _FPS refer to the ACPI specification at:
http://uefi.org/specifications
For instance, the contents of the INT3404 ACPI device sysfs directory
may look as follows::
$ ls -l /sys/bus/acpi/devices/INT3404:00/
total 0
...
-r--r--r-- 1 root root 4096 Dec 13 20:38 state0
-r--r--r-- 1 root root 4096 Dec 13 20:38 state1
-r--r--r-- 1 root root 4096 Dec 13 20:38 state10
-r--r--r-- 1 root root 4096 Dec 13 20:38 state11
-r--r--r-- 1 root root 4096 Dec 13 20:38 state2
-r--r--r-- 1 root root 4096 Dec 13 20:38 state3
-r--r--r-- 1 root root 4096 Dec 13 20:38 state4
-r--r--r-- 1 root root 4096 Dec 13 20:38 state5
-r--r--r-- 1 root root 4096 Dec 13 20:38 state6
-r--r--r-- 1 root root 4096 Dec 13 20:38 state7
-r--r--r-- 1 root root 4096 Dec 13 20:38 state8
-r--r--r-- 1 root root 4096 Dec 13 20:38 state9
-r--r--r-- 1 root root 4096 Dec 13 01:00 status
...
where each of the "state*" files represents one performance state of the fan
and contains a colon-separated list of 5 integer numbers (fields) with the
following interpretation::
control_percent:trip_point_index:speed_rpm:noise_level_mdb:power_mw
* ``control_percent``: The percent value to be used to set the fan speed to a
specific level using the _FSL object (0-100).
* ``trip_point_index``: The active cooling trip point number that corresponds
to this performance state (0-9).
* ``speed_rpm``: Speed of the fan in rotations per minute.
* ``noise_level_mdb``: Audible noise emitted by the fan in this state in
millidecibels.
* ``power_mw``: Power draw of the fan in this state in milliwatts.
For example::
$ cat /sys/bus/acpi/devices/INT3404:00/state1
25:0:3200:12500:1250
When a given field is not populated or its value provided by the platform
firmware is invalid, the "not-defined" string is shown instead of the value.

View File

@@ -12,3 +12,4 @@ the Linux ACPI support.
dsdt-override dsdt-override
ssdt-overlays ssdt-overlays
cppc_sysfs cppc_sysfs
fan_performance_states

View File

@@ -1,15 +1,15 @@
======================================== ========================================
zram: Compressed RAM based block devices zram: Compressed RAM-based block devices
======================================== ========================================
Introduction Introduction
============ ============
The zram module creates RAM based block devices named /dev/zram<id> The zram module creates RAM-based block devices named /dev/zram<id>
(<id> = 0, 1, ...). Pages written to these disks are compressed and stored (<id> = 0, 1, ...). Pages written to these disks are compressed and stored
in memory itself. These disks allow very fast I/O and compression provides in memory itself. These disks allow very fast I/O and compression provides
good amounts of memory savings. Some of the usecases include /tmp storage, good amounts of memory savings. Some of the use cases include /tmp storage,
use as swap disks, various caches under /var and maybe many more :) use as swap disks, various caches under /var and maybe many more. :)
Statistics for individual zram devices are exported through sysfs nodes at Statistics for individual zram devices are exported through sysfs nodes at
/sys/block/zram<id>/ /sys/block/zram<id>/
@@ -43,17 +43,17 @@ The list of possible return codes:
======== ============================================================= ======== =============================================================
-EBUSY an attempt to modify an attribute that cannot be changed once -EBUSY an attempt to modify an attribute that cannot be changed once
the device has been initialised. Please reset device first; the device has been initialised. Please reset device first.
-ENOMEM zram was not able to allocate enough memory to fulfil your -ENOMEM zram was not able to allocate enough memory to fulfil your
needs; needs.
-EINVAL invalid input has been provided. -EINVAL invalid input has been provided.
======== ============================================================= ======== =============================================================
If you use 'echo', the returned value that is changed by 'echo' utility, If you use 'echo', the returned value is set by the 'echo' utility,
and, in general case, something like:: and, in general case, something like::
echo 3 > /sys/block/zram0/max_comp_streams echo 3 > /sys/block/zram0/max_comp_streams
if [ $? -ne 0 ]; if [ $? -ne 0 ]; then
handle_error handle_error
fi fi
@@ -65,7 +65,8 @@ should suffice.
:: ::
modprobe zram num_devices=4 modprobe zram num_devices=4
This creates 4 devices: /dev/zram{0,1,2,3}
This creates 4 devices: /dev/zram{0,1,2,3}
num_devices parameter is optional and tells zram how many devices should be num_devices parameter is optional and tells zram how many devices should be
pre-created. Default: 1. pre-created. Default: 1.
@@ -73,12 +74,12 @@ pre-created. Default: 1.
2) Set max number of compression streams 2) Set max number of compression streams
======================================== ========================================
Regardless the value passed to this attribute, ZRAM will always Regardless of the value passed to this attribute, ZRAM will always
allocate multiple compression streams - one per online CPUs - thus allocate multiple compression streams - one per online CPU - thus
allowing several concurrent compression operations. The number of allowing several concurrent compression operations. The number of
allocated compression streams goes down when some of the CPUs allocated compression streams goes down when some of the CPUs
become offline. There is no single-compression-stream mode anymore, become offline. There is no single-compression-stream mode anymore,
unless you are running a UP system or has only 1 CPU online. unless you are running a UP system or have only 1 CPU online.
To find out how many streams are currently available:: To find out how many streams are currently available::
@@ -89,7 +90,7 @@ To find out how many streams are currently available::
Using comp_algorithm device attribute one can see available and Using comp_algorithm device attribute one can see available and
currently selected (shown in square brackets) compression algorithms, currently selected (shown in square brackets) compression algorithms,
change selected compression algorithm (once the device is initialised or change the selected compression algorithm (once the device is initialised
there is no way to change compression algorithm). there is no way to change compression algorithm).
Examples:: Examples::
@@ -167,9 +168,9 @@ Examples::
zram provides a control interface, which enables dynamic (on-demand) device zram provides a control interface, which enables dynamic (on-demand) device
addition and removal. addition and removal.
In order to add a new /dev/zramX device, perform read operation on hot_add In order to add a new /dev/zramX device, perform a read operation on the hot_add
attribute. This will return either new device's device id (meaning that you attribute. This will return either the new device's device id (meaning that you
can use /dev/zram<id>) or error code. can use /dev/zram<id>) or an error code.
Example:: Example::
@@ -186,8 +187,8 @@ execute::
Per-device statistics are exported as various nodes under /sys/block/zram<id>/ Per-device statistics are exported as various nodes under /sys/block/zram<id>/
A brief description of exported device attributes. For more details please A brief description of exported device attributes follows. For more details
read Documentation/ABI/testing/sysfs-block-zram. please read Documentation/ABI/testing/sysfs-block-zram.
====================== ====== =============================================== ====================== ====== ===============================================
Name access description Name access description
@@ -245,7 +246,7 @@ whitespace:
File /sys/block/zram<id>/mm_stat File /sys/block/zram<id>/mm_stat
The stat file represents device's mm statistics. It consists of a single The mm_stat file represents the device's mm statistics. It consists of a single
line of text and contains the following stats separated by whitespace: line of text and contains the following stats separated by whitespace:
================ ============================================================= ================ =============================================================
@@ -261,7 +262,7 @@ line of text and contains the following stats separated by whitespace:
Unit: bytes Unit: bytes
mem_limit the maximum amount of memory ZRAM can use to store mem_limit the maximum amount of memory ZRAM can use to store
the compressed data the compressed data
mem_used_max the maximum amount of memory zram have consumed to mem_used_max the maximum amount of memory zram has consumed to
store the data store the data
same_pages the number of same element filled pages written to this disk. same_pages the number of same element filled pages written to this disk.
No memory is allocated for such pages. No memory is allocated for such pages.
@@ -271,7 +272,7 @@ line of text and contains the following stats separated by whitespace:
File /sys/block/zram<id>/bd_stat File /sys/block/zram<id>/bd_stat
The stat file represents device's backing device statistics. It consists of The bd_stat file represents a device's backing device statistics. It consists of
a single line of text and contains the following stats separated by whitespace: a single line of text and contains the following stats separated by whitespace:
============== ============================================================= ============== =============================================================
@@ -316,9 +317,9 @@ To use the feature, admin should set up backing device via::
echo /dev/sda5 > /sys/block/zramX/backing_dev echo /dev/sda5 > /sys/block/zramX/backing_dev
before disksize setting. It supports only partition at this moment. before disksize setting. It supports only partition at this moment.
If admin want to use incompressible page writeback, they could do via:: If admin wants to use incompressible page writeback, they could do via::
echo huge > /sys/block/zramX/write echo huge > /sys/block/zramX/writeback
To use idle page writeback, first, user need to declare zram pages To use idle page writeback, first, user need to declare zram pages
as idle:: as idle::
@@ -326,7 +327,7 @@ as idle::
echo all > /sys/block/zramX/idle echo all > /sys/block/zramX/idle
From now on, any pages on zram are idle pages. The idle mark From now on, any pages on zram are idle pages. The idle mark
will be removed until someone request access of the block. will be removed until someone requests access of the block.
IOW, unless there is access request, those pages are still idle pages. IOW, unless there is access request, those pages are still idle pages.
Admin can request writeback of those idle pages at right timing via:: Admin can request writeback of those idle pages at right timing via::
@@ -341,16 +342,16 @@ to guarantee storage health for entire product life.
To overcome the concern, zram supports "writeback_limit" feature. To overcome the concern, zram supports "writeback_limit" feature.
The "writeback_limit_enable"'s default value is 0 so that it doesn't limit The "writeback_limit_enable"'s default value is 0 so that it doesn't limit
any writeback. IOW, if admin want to apply writeback budget, he should any writeback. IOW, if admin wants to apply writeback budget, he should
enable writeback_limit_enable via:: enable writeback_limit_enable via::
$ echo 1 > /sys/block/zramX/writeback_limit_enable $ echo 1 > /sys/block/zramX/writeback_limit_enable
Once writeback_limit_enable is set, zram doesn't allow any writeback Once writeback_limit_enable is set, zram doesn't allow any writeback
until admin set the budget via /sys/block/zramX/writeback_limit. until admin sets the budget via /sys/block/zramX/writeback_limit.
(If admin doesn't enable writeback_limit_enable, writeback_limit's value (If admin doesn't enable writeback_limit_enable, writeback_limit's value
assigned via /sys/block/zramX/writeback_limit is meaninless.) assigned via /sys/block/zramX/writeback_limit is meaningless.)
If admin want to limit writeback as per-day 400M, he could do it If admin want to limit writeback as per-day 400M, he could do it
like below:: like below::
@@ -361,13 +362,13 @@ like below::
/sys/block/zram0/writeback_limit. /sys/block/zram0/writeback_limit.
$ echo 1 > /sys/block/zram0/writeback_limit_enable $ echo 1 > /sys/block/zram0/writeback_limit_enable
If admin want to allow further write again once the bugdet is exausted, If admin wants to allow further write again once the budget is exhausted,
he could do it like below:: he could do it like below::
$ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \ $ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
/sys/block/zram0/writeback_limit /sys/block/zram0/writeback_limit
If admin want to see remaining writeback budget since he set:: If admin wants to see remaining writeback budget since last set::
$ cat /sys/block/zramX/writeback_limit $ cat /sys/block/zramX/writeback_limit
@@ -375,12 +376,12 @@ If admin want to disable writeback limit, he could do::
$ echo 0 > /sys/block/zramX/writeback_limit_enable $ echo 0 > /sys/block/zramX/writeback_limit_enable
The writeback_limit count will reset whenever you reset zram(e.g., The writeback_limit count will reset whenever you reset zram (e.g.,
system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of
writeback happened until you reset the zram to allocate extra writeback writeback happened until you reset the zram to allocate extra writeback
budget in next setting is user's job. budget in next setting is user's job.
If admin want to measure writeback count in a certain period, he could If admin wants to measure writeback count in a certain period, he could
know it via /sys/block/zram0/bd_stat's 3rd column. know it via /sys/block/zram0/bd_stat's 3rd column.
memory tracking memory tracking

View File

@@ -0,0 +1,218 @@
.. SPDX-License-Identifier: GPL-2.0
.. _bootconfig:
==================
Boot Configuration
==================
:Author: Masami Hiramatsu <mhiramat@kernel.org>
Overview
========
The boot configuration expands the current kernel command line to support
additional key-value data when booting the kernel in an efficient way.
This allows administrators to pass a structured-Key config file.
Config File Syntax
==================
The boot config syntax is a simple structured key-value. Each key consists
of dot-connected-words, and key and value are connected by ``=``. The value
has to be terminated by semi-colon (``;``) or newline (``\n``).
For array value, array entries are separated by comma (``,``). ::
KEY[.WORD[...]] = VALUE[, VALUE2[...]][;]
Unlike the kernel command line syntax, spaces are OK around the comma and ``=``.
Each key word must contain only letters, digits, dashes (``-``) or underscores
(``_``). And each value only contains printable characters or spaces except
for delimiters such as semi-colon (``;``), new-line (``\n``), comma (``,``),
hash (``#``) and closing brace (``}``).
If you want to use those delimiters in a value, you can use either double-
quotes (``"VALUE"``) or single-quotes (``'VALUE'``) to quote it. Note that
you can not escape these quotes.
There can be a key which doesn't have value or has an empty value. Those keys
are used for checking if the key exists or not (like a boolean).
Key-Value Syntax
----------------
The boot config file syntax allows the user to merge keys that share the same
leading words by using braces. For example::
foo.bar.baz = value1
foo.bar.qux.quux = value2
These can also be written as::
foo.bar {
baz = value1
qux.quux = value2
}
Or, more concisely, as follows::
foo.bar { baz = value1; qux.quux = value2 }
In both styles, same key words are automatically merged when parsing it
at boot time. So you can append similar trees or key-values.
Same-key Values
---------------
Two or more values or arrays are prohibited from sharing the same key.
For example::
foo = bar, baz
foo = qux # !ERROR! we can not re-define same key
If you want to append the value to existing key as an array member,
you can use ``+=`` operator. For example::
foo = bar, baz
foo += qux
In this case, the key ``foo`` has ``bar``, ``baz`` and ``qux``.
However, a sub-key and a value can not co-exist under a parent key.
For example, the following config is NOT allowed::
foo = value1
foo.bar = value2 # !ERROR! subkey "bar" and value "value1" can NOT co-exist
Comments
--------
The config syntax accepts shell-script style comments. The comments starting
with hash ("#") until newline ("\n") will be ignored.
::
# comment line
foo = value # value is set to foo.
bar = 1, # 1st element
2, # 2nd element
3 # 3rd element
This is parsed as below::
foo = value
bar = 1, 2, 3
Note that you can not put a comment between a value and its delimiter (``,`` or
``;``). This means the following config has a syntax error ::
key = 1 # comment
,2
/proc/bootconfig
================
/proc/bootconfig is a user-space interface of the boot config.
Unlike /proc/cmdline, this file shows the key-value style list.
Each key-value pair is shown in each line with following style::
KEY[.WORDS...] = "[VALUE]"[,"VALUE2"...]
Boot Kernel With a Boot Config
==============================
Since the boot configuration file is loaded with initrd, it will be added
to the end of the initrd (initramfs) image file with size, checksum and
12-byte magic word as below::
[initrd][bootconfig][size(u32)][checksum(u32)][#BOOTCONFIG\n]
The Linux kernel decodes the last part of the initrd image in memory to
get the boot configuration data.
Because of this "piggyback" method, there is no need to change or
update the boot loader and the kernel image itself.
To do this operation, Linux kernel provides "bootconfig" command under
tools/bootconfig, which allows admin to apply or delete the config file
to/from initrd image. You can build it by the following command::
# make -C tools/bootconfig
To add your boot config file to initrd image, run bootconfig as below
(Old data is removed automatically if exists)::
# tools/bootconfig/bootconfig -a your-config /boot/initrd.img-X.Y.Z
To remove the config from the image, you can use -d option as below::
# tools/bootconfig/bootconfig -d /boot/initrd.img-X.Y.Z
Then add "bootconfig" on the normal kernel command line to tell the
kernel to look for the bootconfig at the end of the initrd file.
Config File Limitation
======================
Currently the maximum config size is 32KB and the total key-words (not
key-value entries) must be under 1024 nodes.
Note: this is not the number of entries but nodes, an entry must consume
at least 2 nodes (a key-word and a value). So theoretically, it will be
up to 512 key-value pairs. If keys contain 3 words on average, it can
contain 256 key-value pairs. In most cases, the number of config items
will be under 100 entries and smaller than 8KB, so it would be enough.
If the node number exceeds 1024, parser returns an error even if the file
size is smaller than 32KB.
Anyway, since bootconfig command verifies it when appending a boot config
to initrd image, user can notice it before boot.
Bootconfig APIs
===============
User can query or loop on key-value pairs, also it is possible to find
a root (prefix) key node and find key-values under that node.
If you have a key string, you can query the value directly with the key
using xbc_find_value(). If you want to know what keys exist in the boot
config, you can use xbc_for_each_key_value() to iterate key-value pairs.
Note that you need to use xbc_array_for_each_value() for accessing
each array's value, e.g.::
vnode = NULL;
xbc_find_value("key.word", &vnode);
if (vnode && xbc_node_is_array(vnode))
xbc_array_for_each_value(vnode, value) {
printk("%s ", value);
}
If you want to focus on keys which have a prefix string, you can use
xbc_find_node() to find a node by the prefix string, and iterate
keys under the prefix node with xbc_node_for_each_key_value().
But the most typical usage is to get the named value under prefix
or get the named array under prefix as below::
root = xbc_find_node("key.prefix");
value = xbc_node_find_value(root, "option", &vnode);
...
xbc_node_for_each_array_value(root, "array-option", value, anode) {
...
}
This accesses a value of "key.prefix.option" and an array of
"key.prefix.array-option".
Locking is not needed, since after initialization, the config becomes
read-only. All data and keys must be copied if you need to modify it.
Functions and structures
========================
.. kernel-doc:: include/linux/bootconfig.h
.. kernel-doc:: lib/bootconfig.c

View File

@@ -61,6 +61,8 @@ v1 is available under Documentation/admin-guide/cgroup-v1/.
5-6. Device 5-6. Device
5-7. RDMA 5-7. RDMA
5-7-1. RDMA Interface Files 5-7-1. RDMA Interface Files
5-8. HugeTLB
5-8-1. HugeTLB Interface Files
5-8. Misc 5-8. Misc
5-8-1. perf_event 5-8-1. perf_event
5-N. Non-normative information 5-N. Non-normative information
@@ -2056,6 +2058,33 @@ RDMA Interface Files
mlx4_0 hca_handle=1 hca_object=20 mlx4_0 hca_handle=1 hca_object=20
ocrdma1 hca_handle=1 hca_object=23 ocrdma1 hca_handle=1 hca_object=23
HugeTLB
-------
The HugeTLB controller allows limiting the HugeTLB usage per control group and
enforces the controller limit during page fault.
HugeTLB Interface Files
~~~~~~~~~~~~~~~~~~~~~~~
hugetlb.<hugepagesize>.current
Show current usage for "hugepagesize" hugetlb. It exists for all
cgroups except root.
hugetlb.<hugepagesize>.max
Set/show the hard limit of "hugepagesize" hugetlb usage.
The default value is "max". It exists for all cgroups except root.
hugetlb.<hugepagesize>.events
A read-only flat-keyed file which exists on non-root cgroups.
max
The number of allocation failures due to the HugeTLB limit
hugetlb.<hugepagesize>.events.local
Similar to hugetlb.<hugepagesize>.events but the fields in the file
are local to the cgroup i.e. not hierarchical. The file modified event
generated on this file reflects only the local events.
Misc Misc
---- ----

View File

@@ -419,3 +419,5 @@ Version History
rebuild errors. rebuild errors.
1.15.0 Fix size extensions not being synchronized in case of new MD bitmap 1.15.0 Fix size extensions not being synchronized in case of new MD bitmap
pages allocated; also fix those not occuring after previous reductions pages allocated; also fix those not occuring after previous reductions
1.15.1 Fix argument count and arguments for rebuild/write_mostly/journal_(dev|mode)
on the status line.

View File

@@ -92,6 +92,8 @@ Currently Available
* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force * efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
the ordering) the ordering)
* Case-insensitive file name lookups * Case-insensitive file name lookups
* file-based encryption support (fscrypt)
* file-based verity support (fsverity)
[1] Filesystems with a block size of 1k may see a limit imposed by the [1] Filesystems with a block size of 1k may see a limit imposed by the
directory hash tree having a maximum depth of two. directory hash tree having a maximum depth of two.

View File

@@ -64,6 +64,7 @@ configure specific aspects of kernel behavior to your liking.
binderfs binderfs
binfmt-misc binfmt-misc
blockdev/index blockdev/index
bootconfig
braille-console braille-console
btmrvl btmrvl
cgroup-v1/index cgroup-v1/index
@@ -76,6 +77,7 @@ configure specific aspects of kernel behavior to your liking.
device-mapper/index device-mapper/index
efi-stub efi-stub
ext4 ext4
nfs/index
gpio/index gpio/index
highuid highuid
hw_random hw_random

View File

@@ -437,6 +437,12 @@
no delay (0). no delay (0).
Format: integer Format: integer
bootconfig [KNL]
Extended command line options can be added to an initrd
and this will cause the kernel to look for it.
See Documentation/admin-guide/bootconfig.rst
bert_disable [ACPI] bert_disable [ACPI]
Disable BERT OS support on buggy BIOSes. Disable BERT OS support on buggy BIOSes.
@@ -511,7 +517,7 @@
1 -- check protection requested by application. 1 -- check protection requested by application.
Default value is set via a kernel config option. Default value is set via a kernel config option.
Value can be changed at runtime via Value can be changed at runtime via
/selinux/checkreqprot. /sys/fs/selinux/checkreqprot.
cio_ignore= [S390] cio_ignore= [S390]
See Documentation/s390/common_io.rst for details. See Documentation/s390/common_io.rst for details.
@@ -834,6 +840,18 @@
dump out devices still on the deferred probe list after dump out devices still on the deferred probe list after
retrying. retrying.
dfltcc= [HW,S390]
Format: { on | off | def_only | inf_only | always }
on: s390 zlib hardware support for compression on
level 1 and decompression (default)
off: No s390 zlib hardware support
def_only: s390 zlib hardware support for deflate
only (compression on level 1)
inf_only: s390 zlib hardware support for inflate
only (decompression)
always: Same as 'on' but ignores the selected compression
level always using hardware support (used for debugging)
dhash_entries= [KNL] dhash_entries= [KNL]
Set number of hash buckets for dentry cache. Set number of hash buckets for dentry cache.
@@ -1165,10 +1183,10 @@
efi= [EFI] efi= [EFI]
Format: { "old_map", "nochunk", "noruntime", "debug", Format: { "old_map", "nochunk", "noruntime", "debug",
"nosoftreserve" } "nosoftreserve", "disable_early_pci_dma",
"no_disable_early_pci_dma" }
old_map [X86-64]: switch to the old ioremap-based EFI old_map [X86-64]: switch to the old ioremap-based EFI
runtime services mapping. 32-bit still uses this one by runtime services mapping. [Needs CONFIG_X86_UV=y]
default.
nochunk: disable reading files in "chunks" in the EFI nochunk: disable reading files in "chunks" in the EFI
boot stub, as chunking can cause problems with some boot stub, as chunking can cause problems with some
firmware implementations. firmware implementations.
@@ -1180,6 +1198,10 @@
claim. Specify efi=nosoftreserve to disable this claim. Specify efi=nosoftreserve to disable this
reservation and treat the memory by its base type reservation and treat the memory by its base type
(i.e. EFI_CONVENTIONAL_MEMORY / "System RAM"). (i.e. EFI_CONVENTIONAL_MEMORY / "System RAM").
disable_early_pci_dma: Disable the busmaster bit on all
PCI bridges while in the EFI boot stub
no_disable_early_pci_dma: Leave the busmaster bit set
on all PCI bridges while in the EFI boot stub
efi_no_storage_paranoia [EFI; X86] efi_no_storage_paranoia [EFI; X86]
Using this parameter you can use more than 50% of Using this parameter you can use more than 50% of
@@ -1245,7 +1267,8 @@
0 -- permissive (log only, no denials). 0 -- permissive (log only, no denials).
1 -- enforcing (deny and log). 1 -- enforcing (deny and log).
Default value is 0. Default value is 0.
Value can be changed at runtime via /selinux/enforce. Value can be changed at runtime via
/sys/fs/selinux/enforce.
erst_disable [ACPI] erst_disable [ACPI]
Disable Error Record Serialization Table (ERST) Disable Error Record Serialization Table (ERST)
@@ -1933,10 +1956,32 @@
<cpu number> begins at 0 and the maximum value is <cpu number> begins at 0 and the maximum value is
"number of CPUs in system - 1". "number of CPUs in system - 1".
managed_irq
Isolate from being targeted by managed interrupts
which have an interrupt mask containing isolated
CPUs. The affinity of managed interrupts is
handled by the kernel and cannot be changed via
the /proc/irq/* interfaces.
This isolation is best effort and only effective
if the automatically assigned interrupt mask of a
device queue contains isolated and housekeeping
CPUs. If housekeeping CPUs are online then such
interrupts are directed to the housekeeping CPU
so that IO submitted on the housekeeping CPU
cannot disturb the isolated CPU.
If a queue's affinity mask contains only isolated
CPUs then this parameter has no effect on the
interrupt routing decision, though interrupts are
only delivered when tasks running on those
isolated CPUs submit IO. IO submitted on
housekeeping CPUs has no influence on those
queues.
The format of <cpu-list> is described above. The format of <cpu-list> is described above.
iucv= [HW,NET] iucv= [HW,NET]
ivrs_ioapic [HW,X86_64] ivrs_ioapic [HW,X86_64]
@@ -3978,6 +4023,19 @@
test until boot completes in order to avoid test until boot completes in order to avoid
interference. interference.
rcuperf.kfree_rcu_test= [KNL]
Set to measure performance of kfree_rcu() flooding.
rcuperf.kfree_nthreads= [KNL]
The number of threads running loops of kfree_rcu().
rcuperf.kfree_alloc_num= [KNL]
Number of allocations and frees done in an iteration.
rcuperf.kfree_loops= [KNL]
Number of loops doing rcuperf.kfree_alloc_num number
of allocations and frees.
rcuperf.nreaders= [KNL] rcuperf.nreaders= [KNL]
Set number of RCU readers. The value -1 selects Set number of RCU readers. The value -1 selects
N, where N is the number of CPUs. A value N, where N is the number of CPUs. A value
@@ -4348,9 +4406,7 @@
See security/selinux/Kconfig help text. See security/selinux/Kconfig help text.
0 -- disable. 0 -- disable.
1 -- enable. 1 -- enable.
Default value is set via kernel config option. Default value is 1.
If enabled at boot time, /selinux/disable can be used
later to disable prior to initial policy load.
apparmor= [APPARMOR] Disable or enable AppArmor at boot time apparmor= [APPARMOR] Disable or enable AppArmor at boot time
Format: { "0" | "1" } Format: { "0" | "1" }

View File

@@ -1,6 +1,7 @@
===================
NFS Fault Injection
===================
Fault Injection
===============
Fault injection is a method for forcing errors that may not normally occur, or Fault injection is a method for forcing errors that may not normally occur, or
may be difficult to reproduce. Forcing these errors in a controlled environment may be difficult to reproduce. Forcing these errors in a controlled environment
can help the developer find and fix bugs before their code is shipped in a can help the developer find and fix bugs before their code is shipped in a

View File

@@ -0,0 +1,15 @@
=============
NFS
=============
.. toctree::
:maxdepth: 1
nfs-client
nfsroot
nfs-rdma
nfsd-admin-interfaces
nfs-idmapper
pnfs-block-server
pnfs-scsi-server
fault_injection

View File

@@ -1,3 +1,6 @@
==========
NFS Client
==========
The NFS client The NFS client
============== ==============
@@ -59,10 +62,11 @@ The DNS resolver
NFSv4 allows for one server to refer the NFS client to data that has been NFSv4 allows for one server to refer the NFS client to data that has been
migrated onto another server by means of the special "fs_locations" migrated onto another server by means of the special "fs_locations"
attribute. See attribute. See `RFC3530 Section 6: Filesystem Migration and Replication`_ and
http://tools.ietf.org/html/rfc3530#section-6 `Implementation Guide for Referrals in NFSv4`_.
and
http://tools.ietf.org/html/draft-ietf-nfsv4-referrals-00 .. _RFC3530 Section 6\: Filesystem Migration and Replication: http://tools.ietf.org/html/rfc3530#section-6
.. _Implementation Guide for Referrals in NFSv4: http://tools.ietf.org/html/draft-ietf-nfsv4-referrals-00
The fs_locations information can take the form of either an ip address and The fs_locations information can take the form of either an ip address and
a path, or a DNS hostname and a path. The latter requires the NFS client to a path, or a DNS hostname and a path. The latter requires the NFS client to
@@ -78,8 +82,8 @@ Assuming that the user has the 'rpc_pipefs' filesystem mounted in the usual
(2) If no valid entry exists, the helper script '/sbin/nfs_cache_getent' (2) If no valid entry exists, the helper script '/sbin/nfs_cache_getent'
(may be changed using the 'nfs.cache_getent' kernel boot parameter) (may be changed using the 'nfs.cache_getent' kernel boot parameter)
is run, with two arguments: is run, with two arguments:
- the cache name, "dns_resolve" - the cache name, "dns_resolve"
- the hostname to resolve - the hostname to resolve
(3) After looking up the corresponding ip address, the helper script (3) After looking up the corresponding ip address, the helper script
writes the result into the rpc_pipefs pseudo-file writes the result into the rpc_pipefs pseudo-file
@@ -94,43 +98,44 @@ Assuming that the user has the 'rpc_pipefs' filesystem mounted in the usual
script, and <ttl> is the 'time to live' of this cache entry (in script, and <ttl> is the 'time to live' of this cache entry (in
units of seconds). units of seconds).
Note: If <ip address> is invalid, say the string "0", then a negative .. note::
entry is created, which will cause the kernel to treat the hostname If <ip address> is invalid, say the string "0", then a negative
as having no valid DNS translation. entry is created, which will cause the kernel to treat the hostname
as having no valid DNS translation.
A basic sample /sbin/nfs_cache_getent A basic sample /sbin/nfs_cache_getent
===================================== =====================================
.. code-block:: sh
#!/bin/bash #!/bin/bash
# #
ttl=600 ttl=600
# #
cut=/usr/bin/cut cut=/usr/bin/cut
getent=/usr/bin/getent getent=/usr/bin/getent
rpc_pipefs=/var/lib/nfs/rpc_pipefs rpc_pipefs=/var/lib/nfs/rpc_pipefs
# #
die() die()
{ {
echo "Usage: $0 cache_name entry_name" echo "Usage: $0 cache_name entry_name"
exit 1 exit 1
} }
[ $# -lt 2 ] && die [ $# -lt 2 ] && die
cachename="$1" cachename="$1"
cache_path=${rpc_pipefs}/cache/${cachename}/channel cache_path=${rpc_pipefs}/cache/${cachename}/channel
case "${cachename}" in
dns_resolve)
name="$2"
result="$(${getent} hosts ${name} | ${cut} -f1 -d\ )"
[ -z "${result}" ] && result="0"
;;
*)
die
;;
esac
echo "${result} ${name} ${ttl}" >${cache_path}
case "${cachename}" in
dns_resolve)
name="$2"
result="$(${getent} hosts ${name} | ${cut} -f1 -d\ )"
[ -z "${result}" ] && result="0"
;;
*)
die
;;
esac
echo "${result} ${name} ${ttl}" >${cache_path}

View File

@@ -1,7 +1,7 @@
=============
NFS ID Mapper
=============
=========
ID Mapper
=========
Id mapper is used by NFS to translate user and group ids into names, and to Id mapper is used by NFS to translate user and group ids into names, and to
translate user and group names into ids. Part of this translation involves translate user and group names into ids. Part of this translation involves
performing an upcall to userspace to request the information. There are two performing an upcall to userspace to request the information. There are two
@@ -20,22 +20,24 @@ legacy rpc.idmap daemon for the id mapping. This result will be stored
in a custom NFS idmap cache. in a custom NFS idmap cache.
===========
Configuring Configuring
=========== ===========
The file /etc/request-key.conf will need to be modified so /sbin/request-key can The file /etc/request-key.conf will need to be modified so /sbin/request-key can
direct the upcall. The following line should be added: direct the upcall. The following line should be added:
#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ... ``#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ...``
#====== ======= =============== =============== =============================== ``#====== ======= =============== =============== ===============================``
create id_resolver * * /usr/sbin/nfs.idmap %k %d 600 ``create id_resolver * * /usr/sbin/nfs.idmap %k %d 600``
This will direct all id_resolver requests to the program /usr/sbin/nfs.idmap. This will direct all id_resolver requests to the program /usr/sbin/nfs.idmap.
The last parameter, 600, defines how many seconds into the future the key will The last parameter, 600, defines how many seconds into the future the key will
expire. This parameter is optional for /usr/sbin/nfs.idmap. When the timeout expire. This parameter is optional for /usr/sbin/nfs.idmap. When the timeout
is not specified, nfs.idmap will default to 600 seconds. is not specified, nfs.idmap will default to 600 seconds.
id mapper uses four key descriptions: id mapper uses four key descriptions::
uid: Find the UID for the given user uid: Find the UID for the given user
gid: Find the GID for the given group gid: Find the GID for the given group
user: Find the user name for the given UID user: Find the user name for the given UID
@@ -45,23 +47,24 @@ You can handle any of these individually, rather than using the generic upcall
program. If you would like to use your own program for a uid lookup then you program. If you would like to use your own program for a uid lookup then you
would edit your request-key.conf so it looks similar to this: would edit your request-key.conf so it looks similar to this:
#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ... ``#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ...``
#====== ======= =============== =============== =============================== ``#====== ======= =============== =============== ===============================``
create id_resolver uid:* * /some/other/program %k %d 600 ``create id_resolver uid:* * /some/other/program %k %d 600``
create id_resolver * * /usr/sbin/nfs.idmap %k %d 600 ``create id_resolver * * /usr/sbin/nfs.idmap %k %d 600``
Notice that the new line was added above the line for the generic program. Notice that the new line was added above the line for the generic program.
request-key will find the first matching line and corresponding program. In request-key will find the first matching line and corresponding program. In
this case, /some/other/program will handle all uid lookups and this case, /some/other/program will handle all uid lookups and
/usr/sbin/nfs.idmap will handle gid, user, and group lookups. /usr/sbin/nfs.idmap will handle gid, user, and group lookups.
See <file:Documentation/security/keys/request-key.rst> for more information See Documentation/security/keys/request-key.rst for more information
about the request-key function. about the request-key function.
=========
nfs.idmap nfs.idmap
========= =========
nfs.idmap is designed to be called by request-key, and should not be run "by nfs.idmap is designed to be called by request-key, and should not be run "by
hand". This program takes two arguments, a serialized key and a key hand". This program takes two arguments, a serialized key and a key
description. The serialized key is first converted into a key_serial_t, and description. The serialized key is first converted into a key_serial_t, and

View File

@@ -0,0 +1,292 @@
===================
Setting up NFS/RDMA
===================
:Author:
NetApp and Open Grid Computing (May 29, 2008)
.. warning::
This document is probably obsolete.
Overview
========
This document describes how to install and set up the Linux NFS/RDMA client
and server software.
The NFS/RDMA client was first included in Linux 2.6.24. The NFS/RDMA server
was first included in the following release, Linux 2.6.25.
In our testing, we have obtained excellent performance results (full 10Gbit
wire bandwidth at minimal client CPU) under many workloads. The code passes
the full Connectathon test suite and operates over both Infiniband and iWARP
RDMA adapters.
Getting Help
============
If you get stuck, you can ask questions on the
nfs-rdma-devel@lists.sourceforge.net mailing list.
Installation
============
These instructions are a step by step guide to building a machine for
use with NFS/RDMA.
- Install an RDMA device
Any device supported by the drivers in drivers/infiniband/hw is acceptable.
Testing has been performed using several Mellanox-based IB cards, the
Ammasso AMS1100 iWARP adapter, and the Chelsio cxgb3 iWARP adapter.
- Install a Linux distribution and tools
The first kernel release to contain both the NFS/RDMA client and server was
Linux 2.6.25. Therefore, a distribution compatible with this and subsequent
Linux kernel releases should be installed.
The procedures described in this document have been tested with
distributions from Red Hat's Fedora Project (http://fedora.redhat.com/).
- Install nfs-utils-1.1.2 or greater on the client
An NFS/RDMA mount point can be obtained by using the mount.nfs command in
nfs-utils-1.1.2 or greater (nfs-utils-1.1.1 was the first nfs-utils
version with support for NFS/RDMA mounts, but for various reasons we
recommend using nfs-utils-1.1.2 or greater). To see which version of
mount.nfs you are using, type:
.. code-block:: sh
$ /sbin/mount.nfs -V
If the version is less than 1.1.2 or the command does not exist,
you should install the latest version of nfs-utils.
Download the latest package from: http://www.kernel.org/pub/linux/utils/nfs
Uncompress the package and follow the installation instructions.
If you will not need the idmapper and gssd executables (you do not need
these to create an NFS/RDMA enabled mount command), the installation
process can be simplified by disabling these features when running
configure:
.. code-block:: sh
$ ./configure --disable-gss --disable-nfsv4
To build nfs-utils you will need the tcp_wrappers package installed. For
more information on this see the package's README and INSTALL files.
After building the nfs-utils package, there will be a mount.nfs binary in
the utils/mount directory. This binary can be used to initiate NFS v2, v3,
or v4 mounts. To initiate a v4 mount, the binary must be called
mount.nfs4. The standard technique is to create a symlink called
mount.nfs4 to mount.nfs.
This mount.nfs binary should be installed at /sbin/mount.nfs as follows:
.. code-block:: sh
$ sudo cp utils/mount/mount.nfs /sbin/mount.nfs
In this location, mount.nfs will be invoked automatically for NFS mounts
by the system mount command.
.. note::
mount.nfs and therefore nfs-utils-1.1.2 or greater is only needed
on the NFS client machine. You do not need this specific version of
nfs-utils on the server. Furthermore, only the mount.nfs command from
nfs-utils-1.1.2 is needed on the client.
- Install a Linux kernel with NFS/RDMA
The NFS/RDMA client and server are both included in the mainline Linux
kernel version 2.6.25 and later. This and other versions of the Linux
kernel can be found at: https://www.kernel.org/pub/linux/kernel/
Download the sources and place them in an appropriate location.
- Configure the RDMA stack
Make sure your kernel configuration has RDMA support enabled. Under
Device Drivers -> InfiniBand support, update the kernel configuration
to enable InfiniBand support [NOTE: the option name is misleading. Enabling
InfiniBand support is required for all RDMA devices (IB, iWARP, etc.)].
Enable the appropriate IB HCA support (mlx4, mthca, ehca, ipath, etc.) or
iWARP adapter support (amso, cxgb3, etc.).
If you are using InfiniBand, be sure to enable IP-over-InfiniBand support.
- Configure the NFS client and server
Your kernel configuration must also have NFS file system support and/or
NFS server support enabled. These and other NFS related configuration
options can be found under File Systems -> Network File Systems.
- Build, install, reboot
The NFS/RDMA code will be enabled automatically if NFS and RDMA
are turned on. The NFS/RDMA client and server are configured via the hidden
SUNRPC_XPRT_RDMA config option that depends on SUNRPC and INFINIBAND. The
value of SUNRPC_XPRT_RDMA will be:
#. N if either SUNRPC or INFINIBAND are N, in this case the NFS/RDMA client
and server will not be built
#. M if both SUNRPC and INFINIBAND are on (M or Y) and at least one is M,
in this case the NFS/RDMA client and server will be built as modules
#. Y if both SUNRPC and INFINIBAND are Y, in this case the NFS/RDMA client
and server will be built into the kernel
Therefore, if you have followed the steps above and turned on NFS and RDMA,
the NFS/RDMA client and server will be built.
Build a new kernel, install it, boot it.
Check RDMA and NFS Setup
========================
Before configuring the NFS/RDMA software, it is a good idea to test
your new kernel to ensure that the kernel is working correctly.
In particular, it is a good idea to verify that the RDMA stack
is functioning as expected and standard NFS over TCP/IP and/or UDP/IP
is working properly.
- Check RDMA Setup
If you built the RDMA components as modules, load them at
this time. For example, if you are using a Mellanox Tavor/Sinai/Arbel
card:
.. code-block:: sh
$ modprobe ib_mthca
$ modprobe ib_ipoib
If you are using InfiniBand, make sure there is a Subnet Manager (SM)
running on the network. If your IB switch has an embedded SM, you can
use it. Otherwise, you will need to run an SM, such as OpenSM, on one
of your end nodes.
If an SM is running on your network, you should see the following:
.. code-block:: sh
$ cat /sys/class/infiniband/driverX/ports/1/state
4: ACTIVE
where driverX is mthca0, ipath5, ehca3, etc.
To further test the InfiniBand software stack, use IPoIB (this
assumes you have two IB hosts named host1 and host2):
.. code-block:: sh
host1$ ip link set dev ib0 up
host1$ ip address add dev ib0 a.b.c.x
host2$ ip link set dev ib0 up
host2$ ip address add dev ib0 a.b.c.y
host1$ ping a.b.c.y
host2$ ping a.b.c.x
For other device types, follow the appropriate procedures.
- Check NFS Setup
For the NFS components enabled above (client and/or server),
test their functionality over standard Ethernet using TCP/IP or UDP/IP.
NFS/RDMA Setup
==============
We recommend that you use two machines, one to act as the client and
one to act as the server.
One time configuration:
-----------------------
- On the server system, configure the /etc/exports file and start the NFS/RDMA server.
Exports entries with the following formats have been tested::
/vol0 192.168.0.47(fsid=0,rw,async,insecure,no_root_squash)
/vol0 192.168.0.0/255.255.255.0(fsid=0,rw,async,insecure,no_root_squash)
The IP address(es) is(are) the client's IPoIB address for an InfiniBand
HCA or the client's iWARP address(es) for an RNIC.
.. note::
The "insecure" option must be used because the NFS/RDMA client does
not use a reserved port.
Each time a machine boots:
--------------------------
- Load and configure the RDMA drivers
For InfiniBand using a Mellanox adapter:
.. code-block:: sh
$ modprobe ib_mthca
$ modprobe ib_ipoib
$ ip li set dev ib0 up
$ ip addr add dev ib0 a.b.c.d
.. note::
Please use unique addresses for the client and server!
- Start the NFS server
If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
kernel config), load the RDMA transport module:
.. code-block:: sh
$ modprobe svcrdma
Regardless of how the server was built (module or built-in), start the
server:
.. code-block:: sh
$ /etc/init.d/nfs start
or
.. code-block:: sh
$ service nfs start
Instruct the server to listen on the RDMA transport:
.. code-block:: sh
$ echo rdma 20049 > /proc/fs/nfsd/portlist
- On the client system
If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
kernel config), load the RDMA client module:
.. code-block:: sh
$ modprobe xprtrdma.ko
Regardless of how the client was built (module or built-in), use this
command to mount the NFS/RDMA server:
.. code-block:: sh
$ mount -o rdma,port=20049 <IPoIB-server-name-or-address>:/<export> /mnt
To verify that the mount is using RDMA, run "cat /proc/mounts" and check
the "proto" field for the given mount.
Congratulations! You're using NFS/RDMA!

View File

@@ -1,5 +1,6 @@
==================================
Administrative interfaces for nfsd Administrative interfaces for nfsd
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ==================================
Note that normally these interfaces are used only by the utilities in Note that normally these interfaces are used only by the utilities in
nfs-utils. nfs-utils.
@@ -13,18 +14,16 @@ nfsd/threads.
Before doing that, NFSD can be told which sockets to listen on by Before doing that, NFSD can be told which sockets to listen on by
writing to nfsd/portlist; that write may be: writing to nfsd/portlist; that write may be:
- an ascii-encoded file descriptor, which should refer to a - an ascii-encoded file descriptor, which should refer to a
bound (and listening, for tcp) socket, or bound (and listening, for tcp) socket, or
- "transportname port", where transportname is currently either - "transportname port", where transportname is currently either
"udp", "tcp", or "rdma". "udp", "tcp", or "rdma".
If nfsd is started without doing any of these, then it will create one If nfsd is started without doing any of these, then it will create one
udp and one tcp listener at port 2049 (see nfsd_init_socks). udp and one tcp listener at port 2049 (see nfsd_init_socks).
On startup, nfsd and lockd grace periods start. On startup, nfsd and lockd grace periods start. nfsd is shut down by a write of
0 to nfsd/threads. All locks and state are thrown away at that point.
nfsd is shut down by a write of 0 to nfsd/threads. All locks and state
are thrown away at that point.
Between startup and shutdown, the number of threads may be adjusted up Between startup and shutdown, the number of threads may be adjusted up
or down by additional writes to nfsd/threads or by writes to or down by additional writes to nfsd/threads or by writes to
@@ -34,7 +33,7 @@ For more detail about files under nfsd/ and what they control, see
fs/nfsd/nfsctl.c; most of them have detailed comments. fs/nfsd/nfsctl.c; most of them have detailed comments.
Implementation notes Implementation notes
^^^^^^^^^^^^^^^^^^^^ ====================
Note that the rpc server requires the caller to serialize addition and Note that the rpc server requires the caller to serialize addition and
removal of listening sockets, and startup and shutdown of the server. removal of listening sockets, and startup and shutdown of the server.

View File

@@ -1,27 +1,34 @@
===============================================
Mounting the root filesystem via NFS (nfsroot) Mounting the root filesystem via NFS (nfsroot)
=============================================== ===============================================
Written 1996 by Gero Kuhlmann <gero@gkminix.han.de> :Authors:
Updated 1997 by Martin Mares <mj@atrey.karlin.mff.cuni.cz> Written 1996 by Gero Kuhlmann <gero@gkminix.han.de>
Updated 2006 by Nico Schottelius <nico-kernel-nfsroot@schottelius.org>
Updated 2006 by Horms <horms@verge.net.au> Updated 1997 by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
Updated 2018 by Chris Novakovic <chris@chrisn.me.uk>
Updated 2006 by Nico Schottelius <nico-kernel-nfsroot@schottelius.org>
Updated 2006 by Horms <horms@verge.net.au>
Updated 2018 by Chris Novakovic <chris@chrisn.me.uk>
In order to use a diskless system, such as an X-terminal or printer server In order to use a diskless system, such as an X-terminal or printer server for
for example, it is necessary for the root filesystem to be present on a example, it is necessary for the root filesystem to be present on a non-disk
non-disk device. This may be an initramfs (see Documentation/filesystems/ device. This may be an initramfs (see
ramfs-rootfs-initramfs.txt), a ramdisk (see Documentation/admin-guide/initrd.rst) or a Documentation/filesystems/ramfs-rootfs-initramfs.txt), a ramdisk (see
filesystem mounted via NFS. The following text describes how to use NFS Documentation/admin-guide/initrd.rst) or a filesystem mounted via NFS. The
for the root filesystem. For the rest of this text 'client' means the following text describes how to use NFS for the root filesystem. For the rest
diskless system, and 'server' means the NFS server. of this text 'client' means the diskless system, and 'server' means the NFS
server.
1.) Enabling nfsroot capabilities Enabling nfsroot capabilities
----------------------------- =============================
In order to use nfsroot, NFS client support needs to be selected as In order to use nfsroot, NFS client support needs to be selected as
built-in during configuration. Once this has been selected, the nfsroot built-in during configuration. Once this has been selected, the nfsroot
@@ -34,8 +41,8 @@ DHCP, BOOTP and RARP is safe.
2.) Kernel command line Kernel command line
------------------- ===================
When the kernel has been loaded by a boot loader (see below) it needs to be When the kernel has been loaded by a boot loader (see below) it needs to be
told what root fs device to use. And in the case of nfsroot, where to find told what root fs device to use. And in the case of nfsroot, where to find
@@ -44,19 +51,17 @@ This can be established using the following kernel command line parameters:
root=/dev/nfs root=/dev/nfs
This is necessary to enable the pseudo-NFS-device. Note that it's not a This is necessary to enable the pseudo-NFS-device. Note that it's not a
real device but just a synonym to tell the kernel to use NFS instead of real device but just a synonym to tell the kernel to use NFS instead of
a real device. a real device.
nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>] nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
If the `nfsroot' parameter is NOT given on the command line, If the `nfsroot' parameter is NOT given on the command line,
the default "/tftpboot/%s" will be used. the default ``"/tftpboot/%s"`` will be used.
<server-ip> Specifies the IP address of the NFS server. <server-ip> Specifies the IP address of the NFS server.
The default address is determined by the `ip' parameter The default address is determined by the ip parameter
(see below). This parameter allows the use of different (see below). This parameter allows the use of different
servers for IP autoconfiguration and NFS. servers for IP autoconfiguration and NFS.
@@ -66,7 +71,8 @@ nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
IP address. IP address.
<nfs-options> Standard NFS options. All options are separated by commas. <nfs-options> Standard NFS options. All options are separated by commas.
The following defaults are used: The following defaults are used::
port = as given by server portmap daemon port = as given by server portmap daemon
rsize = 4096 rsize = 4096
wsize = 4096 wsize = 4096
@@ -79,13 +85,11 @@ nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
flags = hard, nointr, noposix, cto, ac flags = hard, nointr, noposix, cto, ac
ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>: ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:<dns0-ip>:<dns1-ip>:<ntp0-ip>
<dns0-ip>:<dns1-ip>:<ntp0-ip>
This parameter tells the kernel how to configure IP addresses of devices This parameter tells the kernel how to configure IP addresses of devices
and also how to set up the IP routing table. It was originally called and also how to set up the IP routing table. It was originally called
`nfsaddrs', but now the boot-time IP configuration works independently of nfsaddrs, but now the boot-time IP configuration works independently of
NFS, so it was renamed to `ip' and the old name remained as an alias for NFS, so it was renamed to ip and the old name remained as an alias for
compatibility reasons. compatibility reasons.
If this parameter is missing from the kernel command line, all fields are If this parameter is missing from the kernel command line, all fields are
@@ -93,17 +97,17 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
this means that the kernel tries to configure everything using this means that the kernel tries to configure everything using
autoconfiguration. autoconfiguration.
The <autoconf> parameter can appear alone as the value to the `ip' The <autoconf> parameter can appear alone as the value to the ip
parameter (without all the ':' characters before). If the value is parameter (without all the ':' characters before). If the value is
"ip=off" or "ip=none", no autoconfiguration will take place, otherwise "ip=off" or "ip=none", no autoconfiguration will take place, otherwise
autoconfiguration will take place. The most common way to use this autoconfiguration will take place. The most common way to use this
is "ip=dhcp". is "ip=dhcp".
<client-ip> IP address of the client. <client-ip> IP address of the client.
Default: Determined using autoconfiguration. Default: Determined using autoconfiguration.
<server-ip> IP address of the NFS server. If RARP is used to determine <server-ip> IP address of the NFS server.
If RARP is used to determine
the client address and this parameter is NOT empty only the client address and this parameter is NOT empty only
replies from the specified server are accepted. replies from the specified server are accepted.
@@ -115,19 +119,19 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
(see below). (see below).
Default: Determined using autoconfiguration. Default: Determined using autoconfiguration.
The address of the autoconfiguration server is used. The address of the autoconfiguration server is used.
<gw-ip> IP address of a gateway if the server is on a different subnet. <gw-ip> IP address of a gateway if the server is on a different subnet.
Default: Determined using autoconfiguration. Default: Determined using autoconfiguration.
<netmask> Netmask for local network interface. If unspecified <netmask> Netmask for local network interface.
the netmask is derived from the client IP address assuming If unspecified the netmask is derived from the client IP address
classful addressing. assuming classful addressing.
Default: Determined using autoconfiguration. Default: Determined using autoconfiguration.
<hostname> Name of the client. If a '.' character is present, anything <hostname> Name of the client.
If a '.' character is present, anything
before the first '.' is used as the client's hostname, and anything before the first '.' is used as the client's hostname, and anything
after it is used as its NIS domain name. May be supplied by after it is used as its NIS domain name. May be supplied by
autoconfiguration, but its absence will not trigger autoconfiguration. autoconfiguration, but its absence will not trigger autoconfiguration.
@@ -138,21 +142,21 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
Default: Client IP address is used in ASCII notation. Default: Client IP address is used in ASCII notation.
<device> Name of network device to use. <device> Name of network device to use.
Default: If the host only has one device, it is used. Default: If the host only has one device, it is used.
Otherwise the device is determined using Otherwise the device is determined using
autoconfiguration. This is done by sending autoconfiguration. This is done by sending
autoconfiguration requests out of all devices, autoconfiguration requests out of all devices,
and using the device that received the first reply. and using the device that received the first reply.
<autoconf> Method to use for autoconfiguration. In the case of options <autoconf> Method to use for autoconfiguration.
which specify multiple autoconfiguration protocols, In the case of options
which specify multiple autoconfiguration protocols,
requests are sent using all protocols, and the first one requests are sent using all protocols, and the first one
to reply is used. to reply is used.
Only autoconfiguration protocols that have been compiled Only autoconfiguration protocols that have been compiled
into the kernel will be used, regardless of the value of into the kernel will be used, regardless of the value of
this option. this option::
off or none: don't use autoconfiguration off or none: don't use autoconfiguration
(do static IP assignment instead) (do static IP assignment instead)
@@ -221,7 +225,6 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
nfsrootdebug nfsrootdebug
This parameter enables debugging messages to appear in the kernel This parameter enables debugging messages to appear in the kernel
log at boot time so that administrators can verify that the correct log at boot time so that administrators can verify that the correct
NFS mount options, server address, and root path are passed to the NFS mount options, server address, and root path are passed to the
@@ -229,36 +232,32 @@ nfsrootdebug
rdinit=<executable file> rdinit=<executable file>
To specify which file contains the program that starts system To specify which file contains the program that starts system
initialization, administrators can use this command line parameter. initialization, administrators can use this command line parameter.
The default value of this parameter is "/init". If the specified The default value of this parameter is "/init". If the specified
file exists and the kernel can execute it, root filesystem related file exists and the kernel can execute it, root filesystem related
kernel command line parameters, including `nfsroot=', are ignored. kernel command line parameters, including 'nfsroot=', are ignored.
A description of the process of mounting the root file system can be A description of the process of mounting the root file system can be
found in: found in Documentation/driver-api/early-userspace/early_userspace_support.rst
Documentation/driver-api/early-userspace/early_userspace_support.rst
Boot Loader
===========
3.) Boot Loader
----------
To get the kernel into memory different approaches can be used. To get the kernel into memory different approaches can be used.
They depend on various facilities being available: They depend on various facilities being available:
3.1) Booting from a floppy using syslinux - Booting from a floppy using syslinux
When building kernels, an easy way to create a boot floppy that uses When building kernels, an easy way to create a boot floppy that uses
syslinux is to use the zdisk or bzdisk make targets which use zimage syslinux is to use the zdisk or bzdisk make targets which use zimage
and bzimage images respectively. Both targets accept the and bzimage images respectively. Both targets accept the
FDARGS parameter which can be used to set the kernel command line. FDARGS parameter which can be used to set the kernel command line.
e.g. e.g::
make bzdisk FDARGS="root=/dev/nfs" make bzdisk FDARGS="root=/dev/nfs"
Note that the user running this command will need to have Note that the user running this command will need to have
@@ -267,32 +266,36 @@ They depend on various facilities being available:
For more information on syslinux, including how to create bootdisks For more information on syslinux, including how to create bootdisks
for prebuilt kernels, see http://syslinux.zytor.com/ for prebuilt kernels, see http://syslinux.zytor.com/
N.B: Previously it was possible to write a kernel directly to .. note::
a floppy using dd, configure the boot device using rdev, and Previously it was possible to write a kernel directly to
boot using the resulting floppy. Linux no longer supports this a floppy using dd, configure the boot device using rdev, and
method of booting. boot using the resulting floppy. Linux no longer supports this
method of booting.
3.2) Booting from a cdrom using isolinux - Booting from a cdrom using isolinux
When building kernels, an easy way to create a bootable cdrom that When building kernels, an easy way to create a bootable cdrom that
uses isolinux is to use the isoimage target which uses a bzimage uses isolinux is to use the isoimage target which uses a bzimage
image. Like zdisk and bzdisk, this target accepts the FDARGS image. Like zdisk and bzdisk, this target accepts the FDARGS
parameter which can be used to set the kernel command line. parameter which can be used to set the kernel command line.
e.g. e.g::
make isoimage FDARGS="root=/dev/nfs" make isoimage FDARGS="root=/dev/nfs"
The resulting iso image will be arch/<ARCH>/boot/image.iso The resulting iso image will be arch/<ARCH>/boot/image.iso
This can be written to a cdrom using a variety of tools including This can be written to a cdrom using a variety of tools including
cdrecord. cdrecord.
e.g. e.g::
cdrecord dev=ATAPI:1,0,0 arch/x86/boot/image.iso cdrecord dev=ATAPI:1,0,0 arch/x86/boot/image.iso
For more information on isolinux, including how to create bootdisks For more information on isolinux, including how to create bootdisks
for prebuilt kernels, see http://syslinux.zytor.com/ for prebuilt kernels, see http://syslinux.zytor.com/
3.2) Using LILO - Using LILO
When using LILO all the necessary command line parameters may be When using LILO all the necessary command line parameters may be
specified using the 'append=' directive in the LILO configuration specified using the 'append=' directive in the LILO configuration
file. file.
@@ -300,15 +303,19 @@ They depend on various facilities being available:
However, to use the 'root=' directive you also need to create However, to use the 'root=' directive you also need to create
a dummy root device, which may be removed after LILO is run. a dummy root device, which may be removed after LILO is run.
mknod /dev/boot255 c 0 255 e.g::
mknod /dev/boot255 c 0 255
For information on configuring LILO, please refer to its documentation. For information on configuring LILO, please refer to its documentation.
3.3) Using GRUB - Using GRUB
When using GRUB, kernel parameters are simply appended after the kernel When using GRUB, kernel parameters are simply appended after the kernel
specification: kernel <kernel> <parameters> specification: kernel <kernel> <parameters>
3.4) Using loadlin - Using loadlin
loadlin may be used to boot Linux from a DOS command prompt without loadlin may be used to boot Linux from a DOS command prompt without
requiring a local hard disk to mount as root. This has not been requiring a local hard disk to mount as root. This has not been
thoroughly tested by the authors of this document, but in general thoroughly tested by the authors of this document, but in general
@@ -317,7 +324,8 @@ They depend on various facilities being available:
Please refer to the loadlin documentation for further information. Please refer to the loadlin documentation for further information.
3.5) Using a boot ROM - Using a boot ROM
This is probably the most elegant way of booting a diskless client. This is probably the most elegant way of booting a diskless client.
With a boot ROM the kernel is loaded using the TFTP protocol. The With a boot ROM the kernel is loaded using the TFTP protocol. The
authors of this document are not aware of any commercial boot authors of this document are not aware of any commercial boot
@@ -326,7 +334,8 @@ They depend on various facilities being available:
etherboot, both of which are available on sunsite.unc.edu, and both etherboot, both of which are available on sunsite.unc.edu, and both
of which contain everything you need to boot a diskless Linux client. of which contain everything you need to boot a diskless Linux client.
3.6) Using pxelinux - Using pxelinux
Pxelinux may be used to boot linux using the PXE boot loader Pxelinux may be used to boot linux using the PXE boot loader
which is present on many modern network cards. which is present on many modern network cards.
@@ -342,8 +351,8 @@ They depend on various facilities being available:
4.) Credits Credits
------- =======
The nfsroot code in the kernel and the RARP support have been written The nfsroot code in the kernel and the RARP support have been written
by Gero Kuhlmann <gero@gkminix.han.de>. by Gero Kuhlmann <gero@gkminix.han.de>.

View File

@@ -1,4 +1,6 @@
===================================
pNFS block layout server user guide pNFS block layout server user guide
===================================
The Linux NFS server now supports the pNFS block layout extension. In this The Linux NFS server now supports the pNFS block layout extension. In this
case the NFS server acts as Metadata Server (MDS) for pNFS, which in addition case the NFS server acts as Metadata Server (MDS) for pNFS, which in addition
@@ -22,16 +24,19 @@ If the nfsd server needs to fence a non-responding client it calls
/sbin/nfsd-recall-failed with the first argument set to the IP address of /sbin/nfsd-recall-failed with the first argument set to the IP address of
the client, and the second argument set to the device node without the /dev the client, and the second argument set to the device node without the /dev
prefix for the file system to be fenced. Below is an example file that shows prefix for the file system to be fenced. Below is an example file that shows
how to translate the device into a serial number from SCSI EVPD 0x80: how to translate the device into a serial number from SCSI EVPD 0x80::
cat > /sbin/nfsd-recall-failed << EOF cat > /sbin/nfsd-recall-failed << EOF
#!/bin/sh
CLIENT="$1" .. code-block:: sh
DEV="/dev/$2"
EVPD=`sg_inq --page=0x80 ${DEV} | \
grep "Unit serial number:" | \
awk -F ': ' '{print $2}'`
echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log #!/bin/sh
EOF
CLIENT="$1"
DEV="/dev/$2"
EVPD=`sg_inq --page=0x80 ${DEV} | \
grep "Unit serial number:" | \
awk -F ': ' '{print $2}'`
echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log
EOF

View File

@@ -1,4 +1,5 @@
==================================
pNFS SCSI layout server user guide pNFS SCSI layout server user guide
================================== ==================================

View File

@@ -506,6 +506,9 @@ object corresponding to it, as follows:
``disable`` ``disable``
Whether or not this idle state is disabled. Whether or not this idle state is disabled.
``default_status``
The default status of this state, "enabled" or "disabled".
``latency`` ``latency``
Exit latency of the idle state in microseconds. Exit latency of the idle state in microseconds.
@@ -629,16 +632,16 @@ class priority list and destroyed. If that happens, the priority list mechanism
will be used, again, to determine the new effective value for the whole list will be used, again, to determine the new effective value for the whole list
and that value will become the new real constraint. and that value will become the new real constraint.
In turn, for each CPU there is only one resume latency PM QoS request In turn, for each CPU there is one resume latency PM QoS request associated with
associated with the :file:`power/pm_qos_resume_latency_us` file under the :file:`power/pm_qos_resume_latency_us` file under
:file:`/sys/devices/system/cpu/cpu<N>/` in ``sysfs`` and writing to it causes :file:`/sys/devices/system/cpu/cpu<N>/` in ``sysfs`` and writing to it causes
this single PM QoS request to be updated regardless of which user space this single PM QoS request to be updated regardless of which user space
process does that. In other words, this PM QoS request is shared by the entire process does that. In other words, this PM QoS request is shared by the entire
user space, so access to the file associated with it needs to be arbitrated user space, so access to the file associated with it needs to be arbitrated
to avoid confusion. [Arguably, the only legitimate use of this mechanism in to avoid confusion. [Arguably, the only legitimate use of this mechanism in
practice is to pin a process to the CPU in question and let it use the practice is to pin a process to the CPU in question and let it use the
``sysfs`` interface to control the resume latency constraint for it.] It ``sysfs`` interface to control the resume latency constraint for it.] It is
still only is a request, however. It is a member of a priority list used to still only a request, however. It is an entry in a priority list used to
determine the effective value to be set as the resume latency constraint for the determine the effective value to be set as the resume latency constraint for the
CPU in question every time the list of requests is updated this way or another CPU in question every time the list of requests is updated this way or another
(there may be other requests coming from kernel code in that list). (there may be other requests coming from kernel code in that list).

View File

@@ -0,0 +1,268 @@
.. SPDX-License-Identifier: GPL-2.0
.. include:: <isonum.txt>
==============================================
``intel_idle`` CPU Idle Time Management Driver
==============================================
:Copyright: |copy| 2020 Intel Corporation
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
General Information
===================
``intel_idle`` is a part of the
:doc:`CPU idle time management subsystem <cpuidle>` in the Linux kernel
(``CPUIdle``). It is the default CPU idle time management driver for the
Nehalem and later generations of Intel processors, but the level of support for
a particular processor model in it depends on whether or not it recognizes that
processor model and may also depend on information coming from the platform
firmware. [To understand ``intel_idle`` it is necessary to know how ``CPUIdle``
works in general, so this is the time to get familiar with :doc:`cpuidle` if you
have not done that yet.]
``intel_idle`` uses the ``MWAIT`` instruction to inform the processor that the
logical CPU executing it is idle and so it may be possible to put some of the
processor's functional blocks into low-power states. That instruction takes two
arguments (passed in the ``EAX`` and ``ECX`` registers of the target CPU), the
first of which, referred to as a *hint*, can be used by the processor to
determine what can be done (for details refer to Intel Software Developer's
Manual [1]_). Accordingly, ``intel_idle`` refuses to work with processors in
which the support for the ``MWAIT`` instruction has been disabled (for example,
via the platform firmware configuration menu) or which do not support that
instruction at all.
``intel_idle`` is not modular, so it cannot be unloaded, which means that the
only way to pass early-configuration-time parameters to it is via the kernel
command line.
.. _intel-idle-enumeration-of-states:
Enumeration of Idle States
==========================
Each ``MWAIT`` hint value is interpreted by the processor as a license to
reconfigure itself in a certain way in order to save energy. The processor
configurations (with reduced power draw) resulting from that are referred to
as C-states (in the ACPI terminology) or idle states. The list of meaningful
``MWAIT`` hint values and idle states (i.e. low-power configurations of the
processor) corresponding to them depends on the processor model and it may also
depend on the configuration of the platform.
In order to create a list of available idle states required by the ``CPUIdle``
subsystem (see :ref:`idle-states-representation` in :doc:`cpuidle`),
``intel_idle`` can use two sources of information: static tables of idle states
for different processor models included in the driver itself and the ACPI tables
of the system. The former are always used if the processor model at hand is
recognized by ``intel_idle`` and the latter are used if that is required for
the given processor model (which is the case for all server processor models
recognized by ``intel_idle``) or if the processor model is not recognized.
[There is a module parameter that can be used to make the driver use the ACPI
tables with any processor model recognized by it; see
`below <intel-idle-parameters_>`_.]
If the ACPI tables are going to be used for building the list of available idle
states, ``intel_idle`` first looks for a ``_CST`` object under one of the ACPI
objects corresponding to the CPUs in the system (refer to the ACPI specification
[2]_ for the description of ``_CST`` and its output package). Because the
``CPUIdle`` subsystem expects that the list of idle states supplied by the
driver will be suitable for all of the CPUs handled by it and ``intel_idle`` is
registered as the ``CPUIdle`` driver for all of the CPUs in the system, the
driver looks for the first ``_CST`` object returning at least one valid idle
state description and such that all of the idle states included in its return
package are of the FFH (Functional Fixed Hardware) type, which means that the
``MWAIT`` instruction is expected to be used to tell the processor that it can
enter one of them. The return package of that ``_CST`` is then assumed to be
applicable to all of the other CPUs in the system and the idle state
descriptions extracted from it are stored in a preliminary list of idle states
coming from the ACPI tables. [This step is skipped if ``intel_idle`` is
configured to ignore the ACPI tables; see `below <intel-idle-parameters_>`_.]
Next, the first (index 0) entry in the list of available idle states is
initialized to represent a "polling idle state" (a pseudo-idle state in which
the target CPU continuously fetches and executes instructions), and the
subsequent (real) idle state entries are populated as follows.
If the processor model at hand is recognized by ``intel_idle``, there is a
(static) table of idle state descriptions for it in the driver. In that case,
the "internal" table is the primary source of information on idle states and the
information from it is copied to the final list of available idle states. If
using the ACPI tables for the enumeration of idle states is not required
(depending on the processor model), all of the listed idle states are enabled by
default (so all of them will be taken into consideration by ``CPUIdle``
governors during CPU idle state selection). Otherwise, some of the listed idle
states may not be enabled by default if there are no matching entries in the
preliminary list of idle states coming from the ACPI tables. In that case user
space still can enable them later (on a per-CPU basis) with the help of
the ``disable`` idle state attribute in ``sysfs`` (see
:ref:`idle-states-representation` in :doc:`cpuidle`). This basically means that
the idle states "known" to the driver may not be enabled by default if they have
not been exposed by the platform firmware (through the ACPI tables).
If the given processor model is not recognized by ``intel_idle``, but it
supports ``MWAIT``, the preliminary list of idle states coming from the ACPI
tables is used for building the final list that will be supplied to the
``CPUIdle`` core during driver registration. For each idle state in that list,
the description, ``MWAIT`` hint and exit latency are copied to the corresponding
entry in the final list of idle states. The name of the idle state represented
by it (to be returned by the ``name`` idle state attribute in ``sysfs``) is
"CX_ACPI", where X is the index of that idle state in the final list (note that
the minimum value of X is 1, because 0 is reserved for the "polling" state), and
its target residency is based on the exit latency value. Specifically, for
C1-type idle states the exit latency value is also used as the target residency
(for compatibility with the majority of the "internal" tables of idle states for
various processor models recognized by ``intel_idle``) and for the other idle
state types (C2 and C3) the target residency value is 3 times the exit latency
(again, that is because it reflects the target residency to exit latency ratio
in the majority of cases for the processor models recognized by ``intel_idle``).
All of the idle states in the final list are enabled by default in this case.
.. _intel-idle-initialization:
Initialization
==============
The initialization of ``intel_idle`` starts with checking if the kernel command
line options forbid the use of the ``MWAIT`` instruction. If that is the case,
an error code is returned right away.
The next step is to check whether or not the processor model is known to the
driver, which determines the idle states enumeration method (see
`above <intel-idle-enumeration-of-states_>`_), and whether or not the processor
supports ``MWAIT`` (the initialization fails if that is not the case). Then,
the ``MWAIT`` support in the processor is enumerated through ``CPUID`` and the
driver initialization fails if the level of support is not as expected (for
example, if the total number of ``MWAIT`` substates returned is 0).
Next, if the driver is not configured to ignore the ACPI tables (see
`below <intel-idle-parameters_>`_), the idle states information provided by the
platform firmware is extracted from them.
Then, ``CPUIdle`` device objects are allocated for all CPUs and the list of
available idle states is created as explained
`above <intel-idle-enumeration-of-states_>`_.
Finally, ``intel_idle`` is registered with the help of cpuidle_register_driver()
as the ``CPUIdle`` driver for all CPUs in the system and a CPU online callback
for configuring individual CPUs is registered via cpuhp_setup_state(), which
(among other things) causes the callback routine to be invoked for all of the
CPUs present in the system at that time (each CPU executes its own instance of
the callback routine). That routine registers a ``CPUIdle`` device for the CPU
running it (which enables the ``CPUIdle`` subsystem to operate that CPU) and
optionally performs some CPU-specific initialization actions that may be
required for the given processor model.
.. _intel-idle-parameters:
Kernel Command Line Options and Module Parameters
=================================================
The *x86* architecture support code recognizes three kernel command line
options related to CPU idle time management: ``idle=poll``, ``idle=halt``,
and ``idle=nomwait``. If any of them is present in the kernel command line, the
``MWAIT`` instruction is not allowed to be used, so the initialization of
``intel_idle`` will fail.
Apart from that there are four module parameters recognized by ``intel_idle``
itself that can be set via the kernel command line (they cannot be updated via
sysfs, so that is the only way to change their values).
The ``max_cstate`` parameter value is the maximum idle state index in the list
of idle states supplied to the ``CPUIdle`` core during the registration of the
driver. It is also the maximum number of regular (non-polling) idle states that
can be used by ``intel_idle``, so the enumeration of idle states is terminated
after finding that number of usable idle states (the other idle states that
potentially might have been used if ``max_cstate`` had been greater are not
taken into consideration at all). Setting ``max_cstate`` can prevent
``intel_idle`` from exposing idle states that are regarded as "too deep" for
some reason to the ``CPUIdle`` core, but it does so by making them effectively
invisible until the system is shut down and started again which may not always
be desirable. In practice, it is only really necessary to do that if the idle
states in question cannot be enabled during system startup, because in the
working state of the system the CPU power management quality of service (PM
QoS) feature can be used to prevent ``CPUIdle`` from touching those idle states
even if they have been enumerated (see :ref:`cpu-pm-qos` in :doc:`cpuidle`).
Setting ``max_cstate`` to 0 causes the ``intel_idle`` initialization to fail.
The ``no_acpi`` and ``use_acpi`` module parameters (recognized by ``intel_idle``
if the kernel has been configured with ACPI support) can be set to make the
driver ignore the system's ACPI tables entirely or use them for all of the
recognized processor models, respectively (they both are unset by default and
``use_acpi`` has no effect if ``no_acpi`` is set).
The value of the ``states_off`` module parameter (0 by default) represents a
list of idle states to be disabled by default in the form of a bitmask.
Namely, the positions of the bits that are set in the ``states_off`` value are
the indices of idle states to be disabled by default (as reflected by the names
of the corresponding idle state directories in ``sysfs``, :file:`state0`,
:file:`state1` ... :file:`state<i>` ..., where ``<i>`` is the index of the given
idle state; see :ref:`idle-states-representation` in :doc:`cpuidle`).
For example, if ``states_off`` is equal to 3, the driver will disable idle
states 0 and 1 by default, and if it is equal to 8, idle state 3 will be
disabled by default and so on (bit positions beyond the maximum idle state index
are ignored).
The idle states disabled this way can be enabled (on a per-CPU basis) from user
space via ``sysfs``.
.. _intel-idle-core-and-package-idle-states:
Core and Package Levels of Idle States
======================================
Typically, in a processor supporting the ``MWAIT`` instruction there are (at
least) two levels of idle states (or C-states). One level, referred to as
"core C-states", covers individual cores in the processor, whereas the other
level, referred to as "package C-states", covers the entire processor package
and it may also involve other components of the system (GPUs, memory
controllers, I/O hubs etc.).
Some of the ``MWAIT`` hint values allow the processor to use core C-states only
(most importantly, that is the case for the ``MWAIT`` hint value corresponding
to the ``C1`` idle state), but the majority of them give it a license to put
the target core (i.e. the core containing the logical CPU executing ``MWAIT``
with the given hint value) into a specific core C-state and then (if possible)
to enter a specific package C-state at the deeper level. For example, the
``MWAIT`` hint value representing the ``C3`` idle state allows the processor to
put the target core into the low-power state referred to as "core ``C3``" (or
``CC3``), which happens if all of the logical CPUs (SMT siblings) in that core
have executed ``MWAIT`` with the ``C3`` hint value (or with a hint value
representing a deeper idle state), and in addition to that (in the majority of
cases) it gives the processor a license to put the entire package (possibly
including some non-CPU components such as a GPU or a memory controller) into the
low-power state referred to as "package ``C3``" (or ``PC3``), which happens if
all of the cores have gone into the ``CC3`` state and (possibly) some additional
conditions are satisfied (for instance, if the GPU is covered by ``PC3``, it may
be required to be in a certain GPU-specific low-power state for ``PC3`` to be
reachable).
As a rule, there is no simple way to make the processor use core C-states only
if the conditions for entering the corresponding package C-states are met, so
the logical CPU executing ``MWAIT`` with a hint value that is not core-level
only (like for ``C1``) must always assume that this may cause the processor to
enter a package C-state. [That is why the exit latency and target residency
values corresponding to the majority of ``MWAIT`` hint values in the "internal"
tables of idle states in ``intel_idle`` reflect the properties of package
C-states.] If using package C-states is not desirable at all, either
:ref:`PM QoS <cpu-pm-qos>` or the ``max_cstate`` module parameter of
``intel_idle`` described `above <intel-idle-parameters_>`_ must be used to
restrict the range of permissible idle states to the ones with core-level only
``MWAIT`` hint values (like ``C1``).
References
==========
.. [1] *Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 2B*,
https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-2b-manual.html
.. [2] *Advanced Configuration and Power Interface (ACPI) Specification*,
https://uefi.org/specifications

View File

@@ -153,8 +153,11 @@ for the given CPU architecture includes the low-level code for system resume.
Basic ``sysfs`` Interfaces for System Suspend and Hibernation Basic ``sysfs`` Interfaces for System Suspend and Hibernation
============================================================= =============================================================
The following files located in the :file:`/sys/power/` directory can be used by The power management subsystem provides userspace with a unified ``sysfs``
user space for sleep states control. interface for system sleep regardless of the underlying system architecture or
platform. That interface is located in the :file:`/sys/power/` directory
(assuming that ``sysfs`` is mounted at :file:`/sys`) and it consists of the
following attributes (files):
``state`` ``state``
This file contains a list of strings representing sleep states supported This file contains a list of strings representing sleep states supported
@@ -162,9 +165,9 @@ user space for sleep states control.
to start a transition of the system into the sleep state represented by to start a transition of the system into the sleep state represented by
that string. that string.
In particular, the strings "disk", "freeze" and "standby" represent the In particular, the "disk", "freeze" and "standby" strings represent the
:ref:`hibernation <hibernation>`, :ref:`suspend-to-idle <s2idle>` and :ref:`hibernation <hibernation>`, :ref:`suspend-to-idle <s2idle>` and
:ref:`standby <standby>` sleep states, respectively. The string "mem" :ref:`standby <standby>` sleep states, respectively. The "mem" string
is interpreted in accordance with the contents of the ``mem_sleep`` file is interpreted in accordance with the contents of the ``mem_sleep`` file
described below. described below.
@@ -177,7 +180,7 @@ user space for sleep states control.
associated with the "mem" string in the ``state`` file described above. associated with the "mem" string in the ``state`` file described above.
The strings that may be present in this file are "s2idle", "shallow" The strings that may be present in this file are "s2idle", "shallow"
and "deep". The string "s2idle" always represents :ref:`suspend-to-idle and "deep". The "s2idle" string always represents :ref:`suspend-to-idle
<s2idle>` and, by convention, "shallow" and "deep" represent <s2idle>` and, by convention, "shallow" and "deep" represent
:ref:`standby <standby>` and :ref:`suspend-to-RAM <s2ram>`, :ref:`standby <standby>` and :ref:`suspend-to-RAM <s2ram>`,
respectively. respectively.
@@ -185,15 +188,17 @@ user space for sleep states control.
Writing one of the listed strings into this file causes the system Writing one of the listed strings into this file causes the system
suspend variant represented by it to be associated with the "mem" string suspend variant represented by it to be associated with the "mem" string
in the ``state`` file. The string representing the suspend variant in the ``state`` file. The string representing the suspend variant
currently associated with the "mem" string in the ``state`` file currently associated with the "mem" string in the ``state`` file is
is listed in square brackets. shown in square brackets.
If the kernel does not support system suspend, this file is not present. If the kernel does not support system suspend, this file is not present.
``disk`` ``disk``
This file contains a list of strings representing different operations This file controls the operating mode of hibernation (Suspend-to-Disk).
that can be carried out after the hibernation image has been saved. The Specifically, it tells the kernel what to do after creating a
possible options are as follows: hibernation image.
Reading from it returns a list of supported options encoded as:
``platform`` ``platform``
Put the system into a special low-power state (e.g. ACPI S4) to Put the system into a special low-power state (e.g. ACPI S4) to
@@ -201,6 +206,11 @@ user space for sleep states control.
platform firmware to take a simplified initialization path after platform firmware to take a simplified initialization path after
wakeup. wakeup.
It is only available if the platform provides a special
mechanism to put the system to sleep after creating a
hibernation image (platforms with ACPI do that as a rule, for
example).
``shutdown`` ``shutdown``
Power off the system. Power off the system.
@@ -214,22 +224,53 @@ user space for sleep states control.
the hibernation image and continue. Otherwise, use the image the hibernation image and continue. Otherwise, use the image
to restore the previous state of the system. to restore the previous state of the system.
It is available if system suspend is supported.
``test_resume`` ``test_resume``
Diagnostic operation. Load the image as though the system had Diagnostic operation. Load the image as though the system had
just woken up from hibernation and the currently running kernel just woken up from hibernation and the currently running kernel
instance was a restore kernel and follow up with full system instance was a restore kernel and follow up with full system
resume. resume.
Writing one of the listed strings into this file causes the option Writing one of the strings listed above into this file causes the option
represented by it to be selected. represented by it to be selected.
The currently selected option is shown in square brackets which means The currently selected option is shown in square brackets, which means
that the operation represented by it will be carried out after creating that the operation represented by it will be carried out after creating
and saving the image next time hibernation is triggered by writing and saving the image when hibernation is triggered by writing ``disk``
``disk`` to :file:`/sys/power/state`. to :file:`/sys/power/state`.
If the kernel does not support hibernation, this file is not present. If the kernel does not support hibernation, this file is not present.
``image_size``
This file controls the size of hibernation images.
A string representing a non-negative integer can be written to it and will
be used as a best-effort upper limit of the image size, in bytes. The
hibernation core will do its best to ensure that the image size will not
exceed that number, but if that turns out to be impossible to achieve, a
hibernation image will still be created and its size will be as small as
possible. In particular, writing '0' to this file causes the size of
hibernation images to be minimum.
Reading from it returns the current image size limit, which is set to
around 2/5 of the available RAM size by default.
``pm_trace``
This file controls the "PM trace" mechanism saving the last suspend
or resume event point in the RTC memory across reboots. It helps to
debug hard lockups or reboots due to device driver failures that occur
during system suspend or resume (which is more common) more effectively.
If it contains "1", the fingerprint of each suspend/resume event point
in turn will be stored in the RTC memory (overwriting the actual RTC
information), so it will survive a system crash if one occurs right
after storing it and it can be used later to identify the driver that
caused the crash to happen.
It contains "0" by default, which may be changed to "1" by writing a
string representing a nonzero integer into it.
According to the above, there are two ways to make the system go into the According to the above, there are two ways to make the system go into the
:ref:`suspend-to-idle <s2idle>` state. The first one is to write "freeze" :ref:`suspend-to-idle <s2idle>` state. The first one is to write "freeze"
directly to :file:`/sys/power/state`. The second one is to write "s2idle" to directly to :file:`/sys/power/state`. The second one is to write "s2idle" to
@@ -244,6 +285,7 @@ system go into the :ref:`suspend-to-RAM <s2ram>` state (write "deep" into
The default suspend variant (ie. the one to be used without writing anything The default suspend variant (ie. the one to be used without writing anything
into :file:`/sys/power/mem_sleep`) is either "deep" (on the majority of systems into :file:`/sys/power/mem_sleep`) is either "deep" (on the majority of systems
supporting :ref:`suspend-to-RAM <s2ram>`) or "s2idle", but it can be overridden supporting :ref:`suspend-to-RAM <s2ram>`) or "s2idle", but it can be overridden
by the value of the "mem_sleep_default" parameter in the kernel command line. by the value of the ``mem_sleep_default`` parameter in the kernel command line.
On some ACPI-based systems, depending on the information in the ACPI tables, the On some systems with ACPI, depending on the information in the ACPI tables, the
default may be "s2idle" even if :ref:`suspend-to-RAM <s2ram>` is supported. default may be "s2idle" even if :ref:`suspend-to-RAM <s2ram>` is supported in
principle.

View File

@@ -8,6 +8,7 @@ Working-State Power Management
:maxdepth: 2 :maxdepth: 2
cpuidle cpuidle
intel_idle
cpufreq cpufreq
intel_pstate intel_pstate
intel_epb intel_epb

View File

@@ -1,6 +1,28 @@
============= .. SPDX-License-Identifier: GPL-2.0
Thunderbolt
============= ======================
USB4 and Thunderbolt
======================
USB4 is the public specification based on Thunderbolt 3 protocol with
some differences at the register level among other things. Connection
manager is an entity running on the host router (host controller)
responsible for enumerating routers and establishing tunnels. A
connection manager can be implemented either in firmware or software.
Typically PCs come with a firmware connection manager for Thunderbolt 3
and early USB4 capable systems. Apple systems on the other hand use
a software connection manager and the later USB4 compliant devices follow
suit.
The Linux Thunderbolt driver supports both and can detect at runtime which
connection manager implementation is to be used. To be on the safe side the
software connection manager in Linux also advertises security level
``user`` which means PCIe tunneling is disabled by default. The
documentation below applies to both implementations with the exception that
the software connection manager only supports ``user`` security level and
is expected to be accompanied with an IOMMU based DMA protection.
Security levels and how to use them
-----------------------------------
The interface presented here is not meant for end users. Instead there The interface presented here is not meant for end users. Instead there
should be a userspace tool that handles all the low-level details, keeps should be a userspace tool that handles all the low-level details, keeps
a database of the authorized devices and prompts users for new connections. a database of the authorized devices and prompts users for new connections.
@@ -18,8 +40,6 @@ This will authorize all devices automatically when they appear. However,
keep in mind that this bypasses the security levels and makes the system keep in mind that this bypasses the security levels and makes the system
vulnerable to DMA attacks. vulnerable to DMA attacks.
Security levels and how to use them
-----------------------------------
Starting with Intel Falcon Ridge Thunderbolt controller there are 4 Starting with Intel Falcon Ridge Thunderbolt controller there are 4
security levels available. Intel Titan Ridge added one more security level security levels available. Intel Titan Ridge added one more security level
(usbonly). The reason for these is the fact that the connected devices can (usbonly). The reason for these is the fact that the connected devices can

View File

@@ -92,6 +92,12 @@ the Microchip website: http://www.microchip.com.
http://ww1.microchip.com/downloads/en/DeviceDoc/DS60001517A.pdf http://ww1.microchip.com/downloads/en/DeviceDoc/DS60001517A.pdf
- sam9x60
* Datasheet
http://ww1.microchip.com/downloads/en/DeviceDoc/SAM9X60-Data-Sheet-DS60001579A.pdf
* ARM Cortex-A5 based SoCs * ARM Cortex-A5 based SoCs
- sama5d3 family - sama5d3 family

View File

@@ -129,7 +129,7 @@ this logic.
As a single binary will need to support both 48-bit and 52-bit VA As a single binary will need to support both 48-bit and 52-bit VA
spaces, the VMEMMAP must be sized large enough for 52-bit VAs and spaces, the VMEMMAP must be sized large enough for 52-bit VAs and
also must be sized large enought to accommodate a fixed PAGE_OFFSET. also must be sized large enough to accommodate a fixed PAGE_OFFSET.
Most code in the kernel should not need to consider the VA_BITS, for Most code in the kernel should not need to consider the VA_BITS, for
code that does need to know the VA size the variables are code that does need to know the VA size the variables are

View File

@@ -44,8 +44,15 @@ The AArch64 Tagged Address ABI has two stages of relaxation depending
how the user addresses are used by the kernel: how the user addresses are used by the kernel:
1. User addresses not accessed by the kernel but used for address space 1. User addresses not accessed by the kernel but used for address space
management (e.g. ``mmap()``, ``mprotect()``, ``madvise()``). The use management (e.g. ``mprotect()``, ``madvise()``). The use of valid
of valid tagged pointers in this context is always allowed. tagged pointers in this context is allowed with the exception of
``brk()``, ``mmap()`` and the ``new_address`` argument to
``mremap()`` as these have the potential to alias with existing
user addresses.
NOTE: This behaviour changed in v5.6 and so some earlier kernels may
incorrectly accept valid tagged pointers for the ``brk()``,
``mmap()`` and ``mremap()`` system calls.
2. User addresses accessed by the kernel (e.g. ``write()``). This ABI 2. User addresses accessed by the kernel (e.g. ``write()``). This ABI
relaxation is disabled by default and the application thread needs to relaxation is disabled by default and the application thread needs to

View File

@@ -73,10 +73,11 @@ The new macros are prefixed with the ``SYM_`` prefix and can be divided into
three main groups: three main groups:
1. ``SYM_FUNC_*`` -- to annotate C-like functions. This means functions with 1. ``SYM_FUNC_*`` -- to annotate C-like functions. This means functions with
standard C calling conventions, i.e. the stack contains a return address at standard C calling conventions. For example, on x86, this means that the
the predefined place and a return from the function can happen in a stack contains a return address at the predefined place and a return from
standard way. When frame pointers are enabled, save/restore of frame the function can happen in a standard way. When frame pointers are enabled,
pointer shall happen at the start/end of a function, respectively, too. save/restore of frame pointer shall happen at the start/end of a function,
respectively, too.
Checking tools like ``objtool`` should ensure such marked functions conform Checking tools like ``objtool`` should ensure such marked functions conform
to these rules. The tools can also easily annotate these functions with to these rules. The tools can also easily annotate these functions with

View File

@@ -47,7 +47,7 @@ Having a real iterator, and making biovecs immutable, has a number of
advantages: advantages:
* Before, iterating over bios was very awkward when you weren't processing * Before, iterating over bios was very awkward when you weren't processing
exactly one bvec at a time - for example, bio_copy_data() in fs/bio.c, exactly one bvec at a time - for example, bio_copy_data() in block/bio.c,
which copies the contents of one bio into another. Because the biovecs which copies the contents of one bio into another. Because the biovecs
wouldn't necessarily be the same size, the old code was tricky convoluted - wouldn't necessarily be the same size, the old code was tricky convoluted -
it had to walk two different bios at the same time, keeping both bi_idx and it had to walk two different bios at the same time, keeping both bi_idx and

View File

@@ -31,6 +31,7 @@ Core utilities
generic-radix-tree generic-radix-tree
memory-allocation memory-allocation
mm-api mm-api
pin_user_pages
gfp_mask-from-fs-io gfp_mask-from-fs-io
timekeeping timekeeping
boot-time-mm boot-time-mm
@@ -39,6 +40,8 @@ Core utilities
../RCU/index ../RCU/index
gcc-plugins gcc-plugins
symbol-namespaces symbol-namespaces
padata
ioctl
Interfaces for kernel debugging Interfaces for kernel debugging

View File

@@ -0,0 +1,253 @@
======================
ioctl based interfaces
======================
ioctl() is the most common way for applications to interface
with device drivers. It is flexible and easily extended by adding new
commands and can be passed through character devices, block devices as
well as sockets and other special file descriptors.
However, it is also very easy to get ioctl command definitions wrong,
and hard to fix them later without breaking existing applications,
so this documentation tries to help developers get it right.
Command number definitions
==========================
The command number, or request number, is the second argument passed to
the ioctl system call. While this can be any 32-bit number that uniquely
identifies an action for a particular driver, there are a number of
conventions around defining them.
``include/uapi/asm-generic/ioctl.h`` provides four macros for defining
ioctl commands that follow modern conventions: ``_IO``, ``_IOR``,
``_IOW``, and ``_IOWR``. These should be used for all new commands,
with the correct parameters:
_IO/_IOR/_IOW/_IOWR
The macro name specifies how the argument will be used.  It may be a
pointer to data to be passed into the kernel (_IOW), out of the kernel
(_IOR), or both (_IOWR).  _IO can indicate either commands with no
argument or those passing an integer value instead of a pointer.
It is recommended to only use _IO for commands without arguments,
and use pointers for passing data.
type
An 8-bit number, often a character literal, specific to a subsystem
or driver, and listed in :doc:`../userspace-api/ioctl/ioctl-number`
nr
An 8-bit number identifying the specific command, unique for a given
value of 'type'
data_type
The name of the data type pointed to by the argument, the command number
encodes the ``sizeof(data_type)`` value in a 13-bit or 14-bit integer,
leading to a limit of 8191 bytes for the maximum size of the argument.
Note: do not pass sizeof(data_type) type into _IOR/_IOW/_IOWR, as that
will lead to encoding sizeof(sizeof(data_type)), i.e. sizeof(size_t).
_IO does not have a data_type parameter.
Interface versions
==================
Some subsystems use version numbers in data structures to overload
commands with different interpretations of the argument.
This is generally a bad idea, since changes to existing commands tend
to break existing applications.
A better approach is to add a new ioctl command with a new number. The
old command still needs to be implemented in the kernel for compatibility,
but this can be a wrapper around the new implementation.
Return code
===========
ioctl commands can return negative error codes as documented in errno(3);
these get turned into errno values in user space. On success, the return
code should be zero. It is also possible but not recommended to return
a positive 'long' value.
When the ioctl callback is called with an unknown command number, the
handler returns either -ENOTTY or -ENOIOCTLCMD, which also results in
-ENOTTY being returned from the system call. Some subsystems return
-ENOSYS or -EINVAL here for historic reasons, but this is wrong.
Prior to Linux 5.5, compat_ioctl handlers were required to return
-ENOIOCTLCMD in order to use the fallback conversion into native
commands. As all subsystems are now responsible for handling compat
mode themselves, this is no longer needed, but it may be important to
consider when backporting bug fixes to older kernels.
Timestamps
==========
Traditionally, timestamps and timeout values are passed as ``struct
timespec`` or ``struct timeval``, but these are problematic because of
incompatible definitions of these structures in user space after the
move to 64-bit time_t.
The ``struct __kernel_timespec`` type can be used instead to be embedded
in other data structures when separate second/nanosecond values are
desired, or passed to user space directly. This is still not ideal though,
as the structure matches neither the kernel's timespec64 nor the user
space timespec exactly. The get_timespec64() and put_timespec64() helper
functions can be used to ensure that the layout remains compatible with
user space and the padding is treated correctly.
As it is cheap to convert seconds to nanoseconds, but the opposite
requires an expensive 64-bit division, a simple __u64 nanosecond value
can be simpler and more efficient.
Timeout values and timestamps should ideally use CLOCK_MONOTONIC time,
as returned by ktime_get_ns() or ktime_get_ts64(). Unlike
CLOCK_REALTIME, this makes the timestamps immune from jumping backwards
or forwards due to leap second adjustments and clock_settime() calls.
ktime_get_real_ns() can be used for CLOCK_REALTIME timestamps that
need to be persistent across a reboot or between multiple machines.
32-bit compat mode
==================
In order to support 32-bit user space running on a 64-bit machine, each
subsystem or driver that implements an ioctl callback handler must also
implement the corresponding compat_ioctl handler.
As long as all the rules for data structures are followed, this is as
easy as setting the .compat_ioctl pointer to a helper function such as
compat_ptr_ioctl() or blkdev_compat_ptr_ioctl().
compat_ptr()
------------
On the s390 architecture, 31-bit user space has ambiguous representations
for data pointers, with the upper bit being ignored. When running such
a process in compat mode, the compat_ptr() helper must be used to
clear the upper bit of a compat_uptr_t and turn it into a valid 64-bit
pointer. On other architectures, this macro only performs a cast to a
``void __user *`` pointer.
In a compat_ioctl() callback, the last argument is an unsigned long,
which can be interpreted as either a pointer or a scalar depending on
the command. If it is a scalar, then compat_ptr() must not be used, to
ensure that the 64-bit kernel behaves the same way as a 32-bit kernel
for arguments with the upper bit set.
The compat_ptr_ioctl() helper can be used in place of a custom
compat_ioctl file operation for drivers that only take arguments that
are pointers to compatible data structures.
Structure layout
----------------
Compatible data structures have the same layout on all architectures,
avoiding all problematic members:
* ``long`` and ``unsigned long`` are the size of a register, so
they can be either 32-bit or 64-bit wide and cannot be used in portable
data structures. Fixed-length replacements are ``__s32``, ``__u32``,
``__s64`` and ``__u64``.
* Pointers have the same problem, in addition to requiring the
use of compat_ptr(). The best workaround is to use ``__u64``
in place of pointers, which requires a cast to ``uintptr_t`` in user
space, and the use of u64_to_user_ptr() in the kernel to convert
it back into a user pointer.
* On the x86-32 (i386) architecture, the alignment of 64-bit variables
is only 32-bit, but they are naturally aligned on most other
architectures including x86-64. This means a structure like::
struct foo {
__u32 a;
__u64 b;
__u32 c;
};
has four bytes of padding between a and b on x86-64, plus another four
bytes of padding at the end, but no padding on i386, and it needs a
compat_ioctl conversion handler to translate between the two formats.
To avoid this problem, all structures should have their members
naturally aligned, or explicit reserved fields added in place of the
implicit padding. The ``pahole`` tool can be used for checking the
alignment.
* On ARM OABI user space, structures are padded to multiples of 32-bit,
making some structs incompatible with modern EABI kernels if they
do not end on a 32-bit boundary.
* On the m68k architecture, struct members are not guaranteed to have an
alignment greater than 16-bit, which is a problem when relying on
implicit padding.
* Bitfields and enums generally work as one would expect them to,
but some properties of them are implementation-defined, so it is better
to avoid them completely in ioctl interfaces.
* ``char`` members can be either signed or unsigned, depending on
the architecture, so the __u8 and __s8 types should be used for 8-bit
integer values, though char arrays are clearer for fixed-length strings.
Information leaks
=================
Uninitialized data must not be copied back to user space, as this can
cause an information leak, which can be used to defeat kernel address
space layout randomization (KASLR), helping in an attack.
For this reason (and for compat support) it is best to avoid any
implicit padding in data structures.  Where there is implicit padding
in an existing structure, kernel drivers must be careful to fully
initialize an instance of the structure before copying it to user
space.  This is usually done by calling memset() before assigning to
individual members.
Subsystem abstractions
======================
While some device drivers implement their own ioctl function, most
subsystems implement the same command for multiple drivers. Ideally the
subsystem has an .ioctl() handler that copies the arguments from and
to user space, passing them into subsystem specific callback functions
through normal kernel pointers.
This helps in various ways:
* Applications written for one driver are more likely to work for
another one in the same subsystem if there are no subtle differences
in the user space ABI.
* The complexity of user space access and data structure layout is done
in one place, reducing the potential for implementation bugs.
* It is more likely to be reviewed by experienced developers
that can spot problems in the interface when the ioctl is shared
between multiple drivers than when it is only used in a single driver.
Alternatives to ioctl
=====================
There are many cases in which ioctl is not the best solution for a
problem. Alternatives include:
* System calls are a better choice for a system-wide feature that
is not tied to a physical device or constrained by the file system
permissions of a character device node
* netlink is the preferred way of configuring any network related
objects through sockets.
* debugfs is used for ad-hoc interfaces for debugging functionality
that does not need to be exposed as a stable interface to applications.
* sysfs is a good way to expose the state of an in-kernel object
that is not tied to a file descriptor.
* configfs can be used for more complex configuration than sysfs
* A custom file system can provide extra flexibility with a simple
user interface but adds a lot of complexity to the implementation.

View File

@@ -0,0 +1,169 @@
.. SPDX-License-Identifier: GPL-2.0
=======================================
The padata parallel execution mechanism
=======================================
:Date: December 2019
Padata is a mechanism by which the kernel can farm jobs out to be done in
parallel on multiple CPUs while retaining their ordering. It was developed for
use with the IPsec code, which needs to be able to perform encryption and
decryption on large numbers of packets without reordering those packets. The
crypto developers made a point of writing padata in a sufficiently general
fashion that it could be put to other uses as well.
Usage
=====
Initializing
------------
The first step in using padata is to set up a padata_instance structure for
overall control of how jobs are to be run::
#include <linux/padata.h>
struct padata_instance *padata_alloc_possible(const char *name);
'name' simply identifies the instance.
There are functions for enabling and disabling the instance::
int padata_start(struct padata_instance *pinst);
void padata_stop(struct padata_instance *pinst);
These functions set or clear the "PADATA_INIT" flag; if that flag is
not set, other functions will refuse to work. padata_start() returns zero on
success (flag set) or -EINVAL if the padata cpumask contains no active CPU
(flag not set). padata_stop() clears the flag and blocks until the padata
instance is unused.
Finally, complete padata initialization by allocating a padata_shell::
struct padata_shell *padata_alloc_shell(struct padata_instance *pinst);
A padata_shell is used to submit a job to padata and allows a series of such
jobs to be serialized independently. A padata_instance may have one or more
padata_shells associated with it, each allowing a separate series of jobs.
Modifying cpumasks
------------------
The CPUs used to run jobs can be changed in two ways, programmatically with
padata_set_cpumask() or via sysfs. The former is defined::
int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
cpumask_var_t cpumask);
Here cpumask_type is one of PADATA_CPU_PARALLEL or PADATA_CPU_SERIAL, where a
parallel cpumask describes which processors will be used to execute jobs
submitted to this instance in parallel and a serial cpumask defines which
processors are allowed to be used as the serialization callback processor.
cpumask specifies the new cpumask to use.
There may be sysfs files for an instance's cpumasks. For example, pcrypt's
live in /sys/kernel/pcrypt/<instance-name>. Within an instance's directory
there are two files, parallel_cpumask and serial_cpumask, and either cpumask
may be changed by echoing a bitmask into the file, for example::
echo f > /sys/kernel/pcrypt/pencrypt/parallel_cpumask
Reading one of these files shows the user-supplied cpumask, which may be
different from the 'usable' cpumask.
Padata maintains two pairs of cpumasks internally, the user-supplied cpumasks
and the 'usable' cpumasks. (Each pair consists of a parallel and a serial
cpumask.) The user-supplied cpumasks default to all possible CPUs on instance
allocation and may be changed as above. The usable cpumasks are always a
subset of the user-supplied cpumasks and contain only the online CPUs in the
user-supplied masks; these are the cpumasks padata actually uses. So it is
legal to supply a cpumask to padata that contains offline CPUs. Once an
offline CPU in the user-supplied cpumask comes online, padata is going to use
it.
Changing the CPU masks is an expensive operation, so it should not be done with
great frequency.
Running A Job
-------------
Actually submitting work to the padata instance requires the creation of a
padata_priv structure, which represents one job::
struct padata_priv {
/* Other stuff here... */
void (*parallel)(struct padata_priv *padata);
void (*serial)(struct padata_priv *padata);
};
This structure will almost certainly be embedded within some larger
structure specific to the work to be done. Most of its fields are private to
padata, but the structure should be zeroed at initialisation time, and the
parallel() and serial() functions should be provided. Those functions will
be called in the process of getting the work done as we will see
momentarily.
The submission of the job is done with::
int padata_do_parallel(struct padata_shell *ps,
struct padata_priv *padata, int *cb_cpu);
The ps and padata structures must be set up as described above; cb_cpu
points to the preferred CPU to be used for the final callback when the job is
done; it must be in the current instance's CPU mask (if not, the cb_cpu pointer
is updated to point to the CPU actually chosen). The return value from
padata_do_parallel() is zero on success, indicating that the job is in
progress. -EBUSY means that somebody, somewhere else is messing with the
instance's CPU mask, while -EINVAL is a complaint about cb_cpu not being in the
serial cpumask, no online CPUs in the parallel or serial cpumasks, or a stopped
instance.
Each job submitted to padata_do_parallel() will, in turn, be passed to
exactly one call to the above-mentioned parallel() function, on one CPU, so
true parallelism is achieved by submitting multiple jobs. parallel() runs with
software interrupts disabled and thus cannot sleep. The parallel()
function gets the padata_priv structure pointer as its lone parameter;
information about the actual work to be done is probably obtained by using
container_of() to find the enclosing structure.
Note that parallel() has no return value; the padata subsystem assumes that
parallel() will take responsibility for the job from this point. The job
need not be completed during this call, but, if parallel() leaves work
outstanding, it should be prepared to be called again with a new job before
the previous one completes.
Serializing Jobs
----------------
When a job does complete, parallel() (or whatever function actually finishes
the work) should inform padata of the fact with a call to::
void padata_do_serial(struct padata_priv *padata);
At some point in the future, padata_do_serial() will trigger a call to the
serial() function in the padata_priv structure. That call will happen on
the CPU requested in the initial call to padata_do_parallel(); it, too, is
run with local software interrupts disabled.
Note that this call may be deferred for a while since the padata code takes
pains to ensure that jobs are completed in the order in which they were
submitted.
Destroying
----------
Cleaning up a padata instance predictably involves calling the three free
functions that correspond to the allocation in reverse::
void padata_free_shell(struct padata_shell *ps);
void padata_stop(struct padata_instance *pinst);
void padata_free(struct padata_instance *pinst);
It is the user's responsibility to ensure all outstanding jobs are complete
before any of the above are called.
Interface
=========
.. kernel-doc:: include/linux/padata.h
.. kernel-doc:: kernel/padata.c

View File

@@ -0,0 +1,232 @@
.. SPDX-License-Identifier: GPL-2.0
====================================================
pin_user_pages() and related calls
====================================================
.. contents:: :local:
Overview
========
This document describes the following functions::
pin_user_pages()
pin_user_pages_fast()
pin_user_pages_remote()
Basic description of FOLL_PIN
=============================
FOLL_PIN and FOLL_LONGTERM are flags that can be passed to the get_user_pages*()
("gup") family of functions. FOLL_PIN has significant interactions and
interdependencies with FOLL_LONGTERM, so both are covered here.
FOLL_PIN is internal to gup, meaning that it should not appear at the gup call
sites. This allows the associated wrapper functions (pin_user_pages*() and
others) to set the correct combination of these flags, and to check for problems
as well.
FOLL_LONGTERM, on the other hand, *is* allowed to be set at the gup call sites.
This is in order to avoid creating a large number of wrapper functions to cover
all combinations of get*(), pin*(), FOLL_LONGTERM, and more. Also, the
pin_user_pages*() APIs are clearly distinct from the get_user_pages*() APIs, so
that's a natural dividing line, and a good point to make separate wrapper calls.
In other words, use pin_user_pages*() for DMA-pinned pages, and
get_user_pages*() for other cases. There are four cases described later on in
this document, to further clarify that concept.
FOLL_PIN and FOLL_GET are mutually exclusive for a given gup call. However,
multiple threads and call sites are free to pin the same struct pages, via both
FOLL_PIN and FOLL_GET. It's just the call site that needs to choose one or the
other, not the struct page(s).
The FOLL_PIN implementation is nearly the same as FOLL_GET, except that FOLL_PIN
uses a different reference counting technique.
FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying that is,
FOLL_LONGTERM is a specific, more restrictive case of FOLL_PIN.
Which flags are set by each wrapper
===================================
For these pin_user_pages*() functions, FOLL_PIN is OR'd in with whatever gup
flags the caller provides. The caller is required to pass in a non-null struct
pages* array, and the function then pins pages by incrementing each by a special
value. For now, that value is +1, just like get_user_pages*().::
Function
--------
pin_user_pages FOLL_PIN is always set internally by this function.
pin_user_pages_fast FOLL_PIN is always set internally by this function.
pin_user_pages_remote FOLL_PIN is always set internally by this function.
For these get_user_pages*() functions, FOLL_GET might not even be specified.
Behavior is a little more complex than above. If FOLL_GET was *not* specified,
but the caller passed in a non-null struct pages* array, then the function
sets FOLL_GET for you, and proceeds to pin pages by incrementing the refcount
of each page by +1.::
Function
--------
get_user_pages FOLL_GET is sometimes set internally by this function.
get_user_pages_fast FOLL_GET is sometimes set internally by this function.
get_user_pages_remote FOLL_GET is sometimes set internally by this function.
Tracking dma-pinned pages
=========================
Some of the key design constraints, and solutions, for tracking dma-pinned
pages:
* An actual reference count, per struct page, is required. This is because
multiple processes may pin and unpin a page.
* False positives (reporting that a page is dma-pinned, when in fact it is not)
are acceptable, but false negatives are not.
* struct page may not be increased in size for this, and all fields are already
used.
* Given the above, we can overload the page->_refcount field by using, sort of,
the upper bits in that field for a dma-pinned count. "Sort of", means that,
rather than dividing page->_refcount into bit fields, we simply add a medium-
large value (GUP_PIN_COUNTING_BIAS, initially chosen to be 1024: 10 bits) to
page->_refcount. This provides fuzzy behavior: if a page has get_page() called
on it 1024 times, then it will appear to have a single dma-pinned count.
And again, that's acceptable.
This also leads to limitations: there are only 31-10==21 bits available for a
counter that increments 10 bits at a time.
TODO: for 1GB and larger huge pages, this is cutting it close. That's because
when pin_user_pages() follows such pages, it increments the head page by "1"
(where "1" used to mean "+1" for get_user_pages(), but now means "+1024" for
pin_user_pages()) for each tail page. So if you have a 1GB huge page:
* There are 256K (18 bits) worth of 4 KB tail pages.
* There are 21 bits available to count up via GUP_PIN_COUNTING_BIAS (that is,
10 bits at a time)
* There are 21 - 18 == 3 bits available to count. Except that there aren't,
because you need to allow for a few normal get_page() calls on the head page,
as well. Fortunately, the approach of using addition, rather than "hard"
bitfields, within page->_refcount, allows for sharing these bits gracefully.
But we're still looking at about 8 references.
This, however, is a missing feature more than anything else, because it's easily
solved by addressing an obvious inefficiency in the original get_user_pages()
approach of retrieving pages: stop treating all the pages as if they were
PAGE_SIZE. Retrieve huge pages as huge pages. The callers need to be aware of
this, so some work is required. Once that's in place, this limitation mostly
disappears from view, because there will be ample refcounting range available.
* Callers must specifically request "dma-pinned tracking of pages". In other
words, just calling get_user_pages() will not suffice; a new set of functions,
pin_user_pages() and related, must be used.
FOLL_PIN, FOLL_GET, FOLL_LONGTERM: when to use which flags
==========================================================
Thanks to Jan Kara, Vlastimil Babka and several other -mm people, for describing
these categories:
CASE 1: Direct IO (DIO)
-----------------------
There are GUP references to pages that are serving
as DIO buffers. These buffers are needed for a relatively short time (so they
are not "long term"). No special synchronization with page_mkclean() or
munmap() is provided. Therefore, flags to set at the call site are: ::
FOLL_PIN
...but rather than setting FOLL_PIN directly, call sites should use one of
the pin_user_pages*() routines that set FOLL_PIN.
CASE 2: RDMA
------------
There are GUP references to pages that are serving as DMA
buffers. These buffers are needed for a long time ("long term"). No special
synchronization with page_mkclean() or munmap() is provided. Therefore, flags
to set at the call site are: ::
FOLL_PIN | FOLL_LONGTERM
NOTE: Some pages, such as DAX pages, cannot be pinned with longterm pins. That's
because DAX pages do not have a separate page cache, and so "pinning" implies
locking down file system blocks, which is not (yet) supported in that way.
CASE 3: Hardware with page faulting support
-------------------------------------------
Here, a well-written driver doesn't normally need to pin pages at all. However,
if the driver does choose to do so, it can register MMU notifiers for the range,
and will be called back upon invalidation. Either way (avoiding page pinning, or
using MMU notifiers to unpin upon request), there is proper synchronization with
both filesystem and mm (page_mkclean(), munmap(), etc).
Therefore, neither flag needs to be set.
In this case, ideally, neither get_user_pages() nor pin_user_pages() should be
called. Instead, the software should be written so that it does not pin pages.
This allows mm and filesystems to operate more efficiently and reliably.
CASE 4: Pinning for struct page manipulation only
-------------------------------------------------
Here, normal GUP calls are sufficient, so neither flag needs to be set.
page_dma_pinned(): the whole point of pinning
=============================================
The whole point of marking pages as "DMA-pinned" or "gup-pinned" is to be able
to query, "is this page DMA-pinned?" That allows code such as page_mkclean()
(and file system writeback code in general) to make informed decisions about
what to do when a page cannot be unmapped due to such pins.
What to do in those cases is the subject of a years-long series of discussions
and debates (see the References at the end of this document). It's a TODO item
here: fill in the details once that's worked out. Meanwhile, it's safe to say
that having this available: ::
static inline bool page_dma_pinned(struct page *page)
...is a prerequisite to solving the long-running gup+DMA problem.
Another way of thinking about FOLL_GET, FOLL_PIN, and FOLL_LONGTERM
===================================================================
Another way of thinking about these flags is as a progression of restrictions:
FOLL_GET is for struct page manipulation, without affecting the data that the
struct page refers to. FOLL_PIN is a *replacement* for FOLL_GET, and is for
short term pins on pages whose data *will* get accessed. As such, FOLL_PIN is
a "more severe" form of pinning. And finally, FOLL_LONGTERM is an even more
restrictive case that has FOLL_PIN as a prerequisite: this is for pages that
will be pinned longterm, and whose data will be accessed.
Unit testing
============
This file::
tools/testing/selftests/vm/gup_benchmark.c
has the following new calls to exercise the new pin*() wrapper functions:
* PIN_FAST_BENCHMARK (./gup_benchmark -a)
* PIN_BENCHMARK (./gup_benchmark -b)
You can monitor how many total dma-pinned pages have been acquired and released
since the system was booted, via two new /proc/vmstat entries: ::
/proc/vmstat/nr_foll_pin_requested
/proc/vmstat/nr_foll_pin_returned
Those are both going to show zero, unless CONFIG_DEBUG_VM is set. This is
because there is a noticeable performance drop in unpin_user_page(), when they
are activated.
References
==========
* `Some slow progress on get_user_pages() (Apr 2, 2019) <https://lwn.net/Articles/784574/>`_
* `DMA and get_user_pages() (LPC: Dec 12, 2018) <https://lwn.net/Articles/774411/>`_
* `The trouble with get_user_pages() (Apr 30, 2018) <https://lwn.net/Articles/753027/>`_
John Hubbard, October, 2019

View File

@@ -31,33 +31,23 @@ The counterparts to those functions are listed below.
:: ::
int crypto_unregister_alg(struct crypto_alg *alg); void crypto_unregister_alg(struct crypto_alg *alg);
int crypto_unregister_algs(struct crypto_alg *algs, int count); void crypto_unregister_algs(struct crypto_alg *algs, int count);
Notice that both registration and unregistration functions do return a The registration functions return 0 on success, or a negative errno
value, so make sure to handle errors. A return code of zero implies value on failure. crypto_register_algs() succeeds only if it
success. Any return code < 0 implies an error. successfully registered all the given algorithms; if it fails partway
through, then any changes are rolled back.
The bulk registration/unregistration functions register/unregister each The unregistration functions always succeed, so they don't have a
transformation in the given array of length count. They handle errors as return value. Don't try to unregister algorithms that aren't
follows: currently registered.
- crypto_register_algs() succeeds if and only if it successfully
registers all the given transformations. If an error occurs partway
through, then it rolls back successful registrations before returning
the error code. Note that if a driver needs to handle registration
errors for individual transformations, then it will need to use the
non-bulk function crypto_register_alg() instead.
- crypto_unregister_algs() tries to unregister all the given
transformations, continuing on error. It logs errors and always
returns zero.
Single-Block Symmetric Ciphers [CIPHER] Single-Block Symmetric Ciphers [CIPHER]
--------------------------------------- ---------------------------------------
Example of transformations: aes, arc4, ... Example of transformations: aes, serpent, ...
This section describes the simplest of all transformation This section describes the simplest of all transformation
implementations, that being the CIPHER type used for symmetric ciphers. implementations, that being the CIPHER type used for symmetric ciphers.
@@ -108,7 +98,7 @@ is also valid:
Multi-Block Ciphers Multi-Block Ciphers
------------------- -------------------
Example of transformations: cbc(aes), ecb(arc4), ... Example of transformations: cbc(aes), chacha20, ...
This section describes the multi-block cipher transformation This section describes the multi-block cipher transformation
implementations. The multi-block ciphers are used for transformations implementations. The multi-block ciphers are used for transformations
@@ -169,10 +159,10 @@ are as follows:
:: ::
int crypto_unregister_ahash(struct ahash_alg *alg); void crypto_unregister_ahash(struct ahash_alg *alg);
int crypto_unregister_shash(struct shash_alg *alg); void crypto_unregister_shash(struct shash_alg *alg);
int crypto_unregister_shashes(struct shash_alg *algs, int count); void crypto_unregister_shashes(struct shash_alg *algs, int count);
Cipher Definition With struct shash_alg and ahash_alg Cipher Definition With struct shash_alg and ahash_alg

View File

@@ -21,8 +21,8 @@ global variables yet.
Tag-based KASAN is only supported in Clang and requires version 7.0.0 or later. Tag-based KASAN is only supported in Clang and requires version 7.0.0 or later.
Currently generic KASAN is supported for the x86_64, arm64, xtensa and s390 Currently generic KASAN is supported for the x86_64, arm64, xtensa, s390 and
architectures, and tag-based KASAN is supported only for arm64. riscv architectures, and tag-based KASAN is supported only for arm64.
Usage Usage
----- -----

View File

@@ -29,7 +29,8 @@ Yes, well, mostly.
For the most part, the KUnit core framework (what you use to write the tests) For the most part, the KUnit core framework (what you use to write the tests)
can compile to any architecture; it compiles like just another part of the can compile to any architecture; it compiles like just another part of the
kernel and runs when the kernel boots. However, there is some infrastructure, kernel and runs when the kernel boots, or when built as a module, when the
module is loaded. However, there is some infrastructure,
like the KUnit Wrapper (``tools/testing/kunit/kunit.py``) that does not support like the KUnit Wrapper (``tools/testing/kunit/kunit.py``) that does not support
other architectures. other architectures.

View File

@@ -49,6 +49,9 @@ to a standalone program that can be run like any other program directly inside
of a host operating system; to be clear, it does not require any virtualization of a host operating system; to be clear, it does not require any virtualization
support; it is just a regular program. support; it is just a regular program.
Alternatively, kunit and kunit tests can be built as modules and tests will
run when the test module is loaded.
KUnit is fast. Excluding build time, from invocation to completion KUnit can run KUnit is fast. Excluding build time, from invocation to completion KUnit can run
several dozen tests in only 10 to 20 seconds; this might not sound like a big several dozen tests in only 10 to 20 seconds; this might not sound like a big
deal to some people, but having such fast and easy to run tests fundamentally deal to some people, but having such fast and easy to run tests fundamentally

View File

@@ -539,6 +539,23 @@ Interspersed in the kernel logs you might see the following:
Congratulations, you just ran a KUnit test on the x86 architecture! Congratulations, you just ran a KUnit test on the x86 architecture!
In a similar manner, kunit and kunit tests can also be built as modules,
so if you wanted to run tests in this way you might add the following config
options to your ``.config``:
.. code-block:: none
CONFIG_KUNIT=m
CONFIG_KUNIT_EXAMPLE_TEST=m
Once the kernel is built and installed, a simple
.. code-block:: bash
modprobe example-test
...will run the tests.
Writing new tests for other architectures Writing new tests for other architectures
----------------------------------------- -----------------------------------------

View File

@@ -59,6 +59,7 @@ properties:
- friendlyarm,nanopi-k2 - friendlyarm,nanopi-k2
- hardkernel,odroid-c2 - hardkernel,odroid-c2
- nexbox,a95x - nexbox,a95x
- videostrong,kii-pro
- wetek,hub - wetek,hub
- wetek,play2 - wetek,play2
- const: amlogic,meson-gxbb - const: amlogic,meson-gxbb
@@ -104,6 +105,7 @@ properties:
- enum: - enum:
- amlogic,p230 - amlogic,p230
- amlogic,p231 - amlogic,p231
- libretech,aml-s905d-pc
- phicomm,n1 - phicomm,n1
- const: amlogic,s905d - const: amlogic,s905d
- const: amlogic,meson-gxl - const: amlogic,meson-gxl
@@ -115,6 +117,7 @@ properties:
- amlogic,q201 - amlogic,q201
- khadas,vim2 - khadas,vim2
- kingnovel,r-box-pro - kingnovel,r-box-pro
- libretech,aml-s912-pc
- nexbox,a1 - nexbox,a1
- tronsmart,vega-s96 - tronsmart,vega-s96
- const: amlogic,s912 - const: amlogic,s912

View File

@@ -121,7 +121,7 @@ Required properties (in root node):
Required nodes: Required nodes:
- soc: some node of the RealView platforms must be the SoC - soc: some node of the RealView platforms must be the SoC
node that contain the SoC-specific devices, withe the compatible node that contain the SoC-specific devices, with the compatible
string set to one of these tuples: string set to one of these tuples:
"arm,realview-eb-soc", "simple-bus" "arm,realview-eb-soc", "simple-bus"
"arm,realview-pb1176-soc", "simple-bus" "arm,realview-pb1176-soc", "simple-bus"

View File

@@ -35,6 +35,16 @@ properties:
- atmel,at91sam9x60 - atmel,at91sam9x60
- const: atmel,at91sam9 - const: atmel,at91sam9
- items:
- enum:
- overkiz,kizboxmini-base # Overkiz kizbox Mini Base Board
- overkiz,kizboxmini-mb # Overkiz kizbox Mini Mother Board
- overkiz,kizboxmini-rd # Overkiz kizbox Mini RailDIN
- overkiz,smartkiz # Overkiz SmartKiz Board
- const: atmel,at91sam9g25
- const: atmel,at91sam9x5
- const: atmel,at91sam9
- items: - items:
- enum: - enum:
- atmel,at91sam9g15 - atmel,at91sam9g15
@@ -52,11 +62,32 @@ properties:
- const: atmel,sama5d2 - const: atmel,sama5d2
- const: atmel,sama5 - const: atmel,sama5
- description: Microchip SAMA5D27 WLSOM1
items:
- const: microchip,sama5d27-wlsom1
- const: atmel,sama5d27
- const: atmel,sama5d2
- const: atmel,sama5
- description: Microchip SAMA5D27 WLSOM1 Evaluation Kit
items:
- const: microchip,sama5d27-wlsom1-ek
- const: microchip,sama5d27-wlsom1
- const: atmel,sama5d27
- const: atmel,sama5d2
- const: atmel,sama5
- items: - items:
- const: atmel,sama5d27 - const: atmel,sama5d27
- const: atmel,sama5d2 - const: atmel,sama5d2
- const: atmel,sama5 - const: atmel,sama5
- description: SAM9X60-EK board
items:
- const: microchip,sam9x60ek
- const: microchip,sam9x60
- const: atmel,at91sam9
- description: Nattis v2 board with Natte v2 power board - description: Nattis v2 board with Natte v2 power board
items: items:
- const: axentia,nattis-2 - const: axentia,nattis-2

View File

@@ -10,6 +10,12 @@ PIT Timer required properties:
- interrupts: Should contain interrupt for the PIT which is the IRQ line - interrupts: Should contain interrupt for the PIT which is the IRQ line
shared across all System Controller members. shared across all System Controller members.
PIT64B Timer required properties:
- compatible: Should be "microchip,sam9x60-pit64b"
- reg: Should contain registers location and length
- interrupts: Should contain interrupt for PIT64B timer
- clocks: Should contain the available clock sources for PIT64B timer.
System Timer (ST) required properties: System Timer (ST) required properties:
- compatible: Should be "atmel,at91rm9200-st", "syscon", "simple-mfd" - compatible: Should be "atmel,at91rm9200-st", "syscon", "simple-mfd"
- reg: Should contain registers location and length - reg: Should contain registers location and length
@@ -39,6 +45,7 @@ RAMC SDRAM/DDR Controller required properties:
"atmel,at91sam9260-sdramc", "atmel,at91sam9260-sdramc",
"atmel,at91sam9g45-ddramc", "atmel,at91sam9g45-ddramc",
"atmel,sama5d3-ddramc", "atmel,sama5d3-ddramc",
"microchip,sam9x60-ddramc"
- reg: Should contain registers location and length - reg: Should contain registers location and length
Examples: Examples:

View File

@@ -242,6 +242,21 @@ properties:
where voltage is in V, frequency is in MHz. where voltage is in V, frequency is in MHz.
power-domains:
$ref: '/schemas/types.yaml#/definitions/phandle-array'
description:
List of phandles and PM domain specifiers, as defined by bindings of the
PM domain provider (see also ../power_domain.txt).
power-domain-names:
$ref: '/schemas/types.yaml#/definitions/string-array'
description:
A list of power domain name strings sorted in the same order as the
power-domains property.
For PSCI based platforms, the name corresponding to the index of the PSCI
PM domain provider, must be "psci".
qcom,saw: qcom,saw:
$ref: '/schemas/types.yaml#/definitions/phandle' $ref: '/schemas/types.yaml#/definitions/phandle'
description: | description: |

View File

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0 # SPDX-License-Identifier: GPL-2.0
%YAML 1.2 %YAML 1.2
--- ---
$id: http://devicetree.org/schemas/bindings/arm/fsl.yaml# $id: http://devicetree.org/schemas/arm/fsl.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Freescale i.MX Platforms Device Tree Bindings title: Freescale i.MX Platforms Device Tree Bindings
@@ -128,6 +128,27 @@ properties:
- variscite,dt6customboard - variscite,dt6customboard
- const: fsl,imx6q - const: fsl,imx6q
- description: i.MX6Q Gateworks Ventana Boards
items:
- enum:
- gw,imx6q-gw51xx
- gw,imx6q-gw52xx
- gw,imx6q-gw53xx
- gw,imx6q-gw5400-a
- gw,imx6q-gw54xx
- gw,imx6q-gw551x
- gw,imx6q-gw552x
- gw,imx6q-gw553x
- gw,imx6q-gw560x
- gw,imx6q-gw5903
- gw,imx6q-gw5904
- gw,imx6q-gw5907
- gw,imx6q-gw5910
- gw,imx6q-gw5912
- gw,imx6q-gw5913
- const: gw,ventana
- const: fsl,imx6q
- description: i.MX6QP based Boards - description: i.MX6QP based Boards
items: items:
- enum: - enum:
@@ -154,10 +175,31 @@ properties:
- ysoft,imx6dl-yapp4-ursa # i.MX6 Solo Y Soft IOTA Ursa board - ysoft,imx6dl-yapp4-ursa # i.MX6 Solo Y Soft IOTA Ursa board
- const: fsl,imx6dl - const: fsl,imx6dl
- description: i.MX6DL Gateworks Ventana Boards
items:
- enum:
- gw,imx6dl-gw51xx
- gw,imx6dl-gw52xx
- gw,imx6dl-gw53xx
- gw,imx6dl-gw54xx
- gw,imx6dl-gw551x
- gw,imx6dl-gw552x
- gw,imx6dl-gw553x
- gw,imx6dl-gw560x
- gw,imx6dl-gw5903
- gw,imx6dl-gw5904
- gw,imx6dl-gw5907
- gw,imx6dl-gw5910
- gw,imx6dl-gw5912
- gw,imx6dl-gw5913
- const: gw,ventana
- const: fsl,imx6dl
- description: i.MX6SL based Boards - description: i.MX6SL based Boards
items: items:
- enum: - enum:
- fsl,imx6sl-evk # i.MX6 SoloLite EVK Board - fsl,imx6sl-evk # i.MX6 SoloLite EVK Board
- kobo,tolino-shine3
- const: fsl,imx6sl - const: fsl,imx6sl
- description: i.MX6SLL based Boards - description: i.MX6SLL based Boards
@@ -172,6 +214,7 @@ properties:
- enum: - enum:
- fsl,imx6sx-sabreauto # i.MX6 SoloX Sabre Auto Board - fsl,imx6sx-sabreauto # i.MX6 SoloX Sabre Auto Board
- fsl,imx6sx-sdb # i.MX6 SoloX SDB Board - fsl,imx6sx-sdb # i.MX6 SoloX SDB Board
- fsl,imx6sx-sdb-reva # i.MX6 SoloX SDB Rev-A Board
- const: fsl,imx6sx - const: fsl,imx6sx
- description: i.MX6UL based Boards - description: i.MX6UL based Boards
@@ -239,6 +282,7 @@ properties:
items: items:
- enum: - enum:
- fsl,imx7d-sdb # i.MX7 SabreSD Board - fsl,imx7d-sdb # i.MX7 SabreSD Board
- fsl,imx7d-sdb-reva # i.MX7 SabreSD Rev-A Board
- novtech,imx7d-meerkat96 # i.MX7 Meerkat96 Board - novtech,imx7d-meerkat96 # i.MX7 Meerkat96 Board
- toradex,colibri-imx7d # Colibri iMX7 Dual Module - toradex,colibri-imx7d # Colibri iMX7 Dual Module
- toradex,colibri-imx7d-emmc # Colibri iMX7 Dual 1GB (eMMC) Module - toradex,colibri-imx7d-emmc # Colibri iMX7 Dual 1GB (eMMC) Module
@@ -263,6 +307,7 @@ properties:
- description: i.MX7ULP based Boards - description: i.MX7ULP based Boards
items: items:
- enum: - enum:
- ea,imx7ulp-com # i.MX7ULP Embedded Artists COM Board
- fsl,imx7ulp-evk # i.MX7ULP Evaluation Kit - fsl,imx7ulp-evk # i.MX7ULP Evaluation Kit
- const: fsl,imx7ulp - const: fsl,imx7ulp
@@ -283,7 +328,9 @@ properties:
items: items:
- enum: - enum:
- boundary,imx8mq-nitrogen8m # i.MX8MQ NITROGEN Board - boundary,imx8mq-nitrogen8m # i.MX8MQ NITROGEN Board
- einfochips,imx8mq-thor96 # i.MX8MQ Thor96 Board
- fsl,imx8mq-evk # i.MX8MQ EVK Board - fsl,imx8mq-evk # i.MX8MQ EVK Board
- google,imx8mq-phanbell # Google Coral Edge TPU
- purism,librem5-devkit # Purism Librem5 devkit - purism,librem5-devkit # Purism Librem5 devkit
- solidrun,hummingboard-pulse # SolidRun Hummingboard Pulse - solidrun,hummingboard-pulse # SolidRun Hummingboard Pulse
- technexion,pico-pi-imx8m # TechNexion PICO-PI-8M evk - technexion,pico-pi-imx8m # TechNexion PICO-PI-8M evk
@@ -385,6 +432,13 @@ properties:
- fsl,ls2088a-rdb - fsl,ls2088a-rdb
- const: fsl,ls2088a - const: fsl,ls2088a
- description: LX2160A based Boards
items:
- enum:
- fsl,lx2160a-qds
- fsl,lx2160a-rdb
- const: fsl,lx2160a
- description: S32V234 based Boards - description: S32V234 based Boards
items: items:
- enum: - enum:

View File

@@ -1,706 +0,0 @@
==========================================
ARM idle states binding description
==========================================
==========================================
1 - Introduction
==========================================
ARM systems contain HW capable of managing power consumption dynamically,
where cores can be put in different low-power states (ranging from simple
wfi to power gating) according to OS PM policies. The CPU states representing
the range of dynamic idle states that a processor can enter at run-time, can be
specified through device tree bindings representing the parameters required
to enter/exit specific idle states on a given processor.
According to the Server Base System Architecture document (SBSA, [3]), the
power states an ARM CPU can be put into are identified by the following list:
- Running
- Idle_standby
- Idle_retention
- Sleep
- Off
The power states described in the SBSA document define the basic CPU states on
top of which ARM platforms implement power management schemes that allow an OS
PM implementation to put the processor in different idle states (which include
states listed above; "off" state is not an idle state since it does not have
wake-up capabilities, hence it is not considered in this document).
Idle state parameters (e.g. entry latency) are platform specific and need to be
characterized with bindings that provide the required information to OS PM
code so that it can build the required tables and use them at runtime.
The device tree binding definition for ARM idle states is the subject of this
document.
===========================================
2 - idle-states definitions
===========================================
Idle states are characterized for a specific system through a set of
timing and energy related properties, that underlie the HW behaviour
triggered upon idle states entry and exit.
The following diagram depicts the CPU execution phases and related timing
properties required to enter and exit an idle state:
..__[EXEC]__|__[PREP]__|__[ENTRY]__|__[IDLE]__|__[EXIT]__|__[EXEC]__..
| | | | |
|<------ entry ------->|
| latency |
|<- exit ->|
| latency |
|<-------- min-residency -------->|
|<------- wakeup-latency ------->|
Diagram 1: CPU idle state execution phases
EXEC: Normal CPU execution.
PREP: Preparation phase before committing the hardware to idle mode
like cache flushing. This is abortable on pending wake-up
event conditions. The abort latency is assumed to be negligible
(i.e. less than the ENTRY + EXIT duration). If aborted, CPU
goes back to EXEC. This phase is optional. If not abortable,
this should be included in the ENTRY phase instead.
ENTRY: The hardware is committed to idle mode. This period must run
to completion up to IDLE before anything else can happen.
IDLE: This is the actual energy-saving idle period. This may last
between 0 and infinite time, until a wake-up event occurs.
EXIT: Period during which the CPU is brought back to operational
mode (EXEC).
entry-latency: Worst case latency required to enter the idle state. The
exit-latency may be guaranteed only after entry-latency has passed.
min-residency: Minimum period, including preparation and entry, for a given
idle state to be worthwhile energywise.
wakeup-latency: Maximum delay between the signaling of a wake-up event and the
CPU being able to execute normal code again. If not specified, this is assumed
to be entry-latency + exit-latency.
These timing parameters can be used by an OS in different circumstances.
An idle CPU requires the expected min-residency time to select the most
appropriate idle state based on the expected expiry time of the next IRQ
(i.e. wake-up) that causes the CPU to return to the EXEC phase.
An operating system scheduler may need to compute the shortest wake-up delay
for CPUs in the system by detecting how long it will take to get a CPU out
of an idle state, e.g.:
wakeup-delay = exit-latency + max(entry-latency - (now - entry-timestamp), 0)
In other words, the scheduler can make its scheduling decision by selecting
(e.g. waking-up) the CPU with the shortest wake-up delay.
The wake-up delay must take into account the entry latency if that period
has not expired. The abortable nature of the PREP period can be ignored
if it cannot be relied upon (e.g. the PREP deadline may occur much sooner than
the worst case since it depends on the CPU operating conditions, i.e. caches
state).
An OS has to reliably probe the wakeup-latency since some devices can enforce
latency constraint guarantees to work properly, so the OS has to detect the
worst case wake-up latency it can incur if a CPU is allowed to enter an
idle state, and possibly to prevent that to guarantee reliable device
functioning.
The min-residency time parameter deserves further explanation since it is
expressed in time units but must factor in energy consumption coefficients.
The energy consumption of a cpu when it enters a power state can be roughly
characterised by the following graph:
|
|
|
e |
n | /---
e | /------
r | /------
g | /-----
y | /------
| ----
| /|
| / |
| / |
| / |
| / |
| / |
|/ |
-----|-------+----------------------------------
0| 1 time(ms)
Graph 1: Energy vs time example
The graph is split in two parts delimited by time 1ms on the X-axis.
The graph curve with X-axis values = { x | 0 < x < 1ms } has a steep slope
and denotes the energy costs incurred while entering and leaving the idle
state.
The graph curve in the area delimited by X-axis values = {x | x > 1ms } has
shallower slope and essentially represents the energy consumption of the idle
state.
min-residency is defined for a given idle state as the minimum expected
residency time for a state (inclusive of preparation and entry) after
which choosing that state becomes the most energy efficient option. A good
way to visualise this is by taking the same graph above and comparing some
states' energy consumption plots.
For sake of simplicity, let's consider a system with two idle states IDLE1,
and IDLE2:
|
|
|
| /-- IDLE1
e | /---
n | /----
e | /---
r | /-----/--------- IDLE2
g | /-------/---------
y | ------------ /---|
| / /---- |
| / /--- |
| / /---- |
| / /--- |
| --- |
| / |
| / |
|/ | time
---/----------------------------+------------------------
|IDLE1-energy < IDLE2-energy | IDLE2-energy < IDLE1-energy
|
IDLE2-min-residency
Graph 2: idle states min-residency example
In graph 2 above, that takes into account idle states entry/exit energy
costs, it is clear that if the idle state residency time (i.e. time till next
wake-up IRQ) is less than IDLE2-min-residency, IDLE1 is the better idle state
choice energywise.
This is mainly down to the fact that IDLE1 entry/exit energy costs are lower
than IDLE2.
However, the lower power consumption (i.e. shallower energy curve slope) of
idle state IDLE2 implies that after a suitable time, IDLE2 becomes more energy
efficient.
The time at which IDLE2 becomes more energy efficient than IDLE1 (and other
shallower states in a system with multiple idle states) is defined as
IDLE2-min-residency and corresponds to the time when energy consumption of
IDLE1 and IDLE2 states breaks even.
The definitions provided in this section underpin the idle states
properties specification that is the subject of the following sections.
===========================================
3 - idle-states node
===========================================
ARM processor idle states are defined within the idle-states node, which is
a direct child of the cpus node [1] and provides a container where the
processor idle states, defined as device tree nodes, are listed.
- idle-states node
Usage: Optional - On ARM systems, it is a container of processor idle
states nodes. If the system does not provide CPU
power management capabilities, or the processor just
supports idle_standby, an idle-states node is not
required.
Description: idle-states node is a container node, where its
subnodes describe the CPU idle states.
Node name must be "idle-states".
The idle-states node's parent node must be the cpus node.
The idle-states node's child nodes can be:
- one or more state nodes
Any other configuration is considered invalid.
An idle-states node defines the following properties:
- entry-method
Value type: <stringlist>
Usage and definition depend on ARM architecture version.
# On ARM v8 64-bit this property is required and must
be:
- "psci"
# On ARM 32-bit systems this property is optional
This assumes that the "enable-method" property is set to "psci" in the cpu
node[6] that is responsible for setting up CPU idle management in the OS
implementation.
The nodes describing the idle states (state) can only be defined
within the idle-states node, any other configuration is considered invalid
and therefore must be ignored.
===========================================
4 - state node
===========================================
A state node represents an idle state description and must be defined as
follows:
- state node
Description: must be child of the idle-states node
The state node name shall follow standard device tree naming
rules ([5], 2.2.1 "Node names"), in particular state nodes which
are siblings within a single common parent must be given a unique name.
The idle state entered by executing the wfi instruction (idle_standby
SBSA,[3][4]) is considered standard on all ARM platforms and therefore
must not be listed.
With the definitions provided above, the following list represents
the valid properties for a state node:
- compatible
Usage: Required
Value type: <stringlist>
Definition: Must be "arm,idle-state".
- local-timer-stop
Usage: See definition
Value type: <none>
Definition: if present the CPU local timer control logic is
lost on state entry, otherwise it is retained.
- entry-latency-us
Usage: Required
Value type: <prop-encoded-array>
Definition: u32 value representing worst case latency in
microseconds required to enter the idle state.
- exit-latency-us
Usage: Required
Value type: <prop-encoded-array>
Definition: u32 value representing worst case latency
in microseconds required to exit the idle state.
The exit-latency-us duration may be guaranteed
only after entry-latency-us has passed.
- min-residency-us
Usage: Required
Value type: <prop-encoded-array>
Definition: u32 value representing minimum residency duration
in microseconds, inclusive of preparation and
entry, for this idle state to be considered
worthwhile energy wise (refer to section 2 of
this document for a complete description).
- wakeup-latency-us:
Usage: Optional
Value type: <prop-encoded-array>
Definition: u32 value representing maximum delay between the
signaling of a wake-up event and the CPU being
able to execute normal code again. If omitted,
this is assumed to be equal to:
entry-latency-us + exit-latency-us
It is important to supply this value on systems
where the duration of PREP phase (see diagram 1,
section 2) is non-negligible.
In such systems entry-latency-us + exit-latency-us
will exceed wakeup-latency-us by this duration.
- status:
Usage: Optional
Value type: <string>
Definition: A standard device tree property [5] that indicates
the operational status of an idle-state.
If present, it shall be:
"okay": to indicate that the idle state is
operational.
"disabled": to indicate that the idle state has
been disabled in firmware so it is not
operational.
If the property is not present the idle-state must
be considered operational.
- idle-state-name:
Usage: Optional
Value type: <string>
Definition: A string used as a descriptive name for the idle
state.
In addition to the properties listed above, a state node may require
additional properties specific to the entry-method defined in the
idle-states node. Please refer to the entry-method bindings
documentation for properties definitions.
===========================================
4 - Examples
===========================================
Example 1 (ARM 64-bit, 16-cpu system, PSCI enable-method):
cpus {
#size-cells = <0>;
#address-cells = <2>;
CPU0: cpu@0 {
device_type = "cpu";
compatible = "arm,cortex-a57";
reg = <0x0 0x0>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
};
CPU1: cpu@1 {
device_type = "cpu";
compatible = "arm,cortex-a57";
reg = <0x0 0x1>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
};
CPU2: cpu@100 {
device_type = "cpu";
compatible = "arm,cortex-a57";
reg = <0x0 0x100>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
};
CPU3: cpu@101 {
device_type = "cpu";
compatible = "arm,cortex-a57";
reg = <0x0 0x101>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
};
CPU4: cpu@10000 {
device_type = "cpu";
compatible = "arm,cortex-a57";
reg = <0x0 0x10000>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
};
CPU5: cpu@10001 {
device_type = "cpu";
compatible = "arm,cortex-a57";
reg = <0x0 0x10001>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
};
CPU6: cpu@10100 {
device_type = "cpu";
compatible = "arm,cortex-a57";
reg = <0x0 0x10100>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
};
CPU7: cpu@10101 {
device_type = "cpu";
compatible = "arm,cortex-a57";
reg = <0x0 0x10101>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
};
CPU8: cpu@100000000 {
device_type = "cpu";
compatible = "arm,cortex-a53";
reg = <0x1 0x0>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
};
CPU9: cpu@100000001 {
device_type = "cpu";
compatible = "arm,cortex-a53";
reg = <0x1 0x1>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
};
CPU10: cpu@100000100 {
device_type = "cpu";
compatible = "arm,cortex-a53";
reg = <0x1 0x100>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
};
CPU11: cpu@100000101 {
device_type = "cpu";
compatible = "arm,cortex-a53";
reg = <0x1 0x101>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
};
CPU12: cpu@100010000 {
device_type = "cpu";
compatible = "arm,cortex-a53";
reg = <0x1 0x10000>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
};
CPU13: cpu@100010001 {
device_type = "cpu";
compatible = "arm,cortex-a53";
reg = <0x1 0x10001>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
};
CPU14: cpu@100010100 {
device_type = "cpu";
compatible = "arm,cortex-a53";
reg = <0x1 0x10100>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
};
CPU15: cpu@100010101 {
device_type = "cpu";
compatible = "arm,cortex-a53";
reg = <0x1 0x10101>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
};
idle-states {
entry-method = "psci";
CPU_RETENTION_0_0: cpu-retention-0-0 {
compatible = "arm,idle-state";
arm,psci-suspend-param = <0x0010000>;
entry-latency-us = <20>;
exit-latency-us = <40>;
min-residency-us = <80>;
};
CLUSTER_RETENTION_0: cluster-retention-0 {
compatible = "arm,idle-state";
local-timer-stop;
arm,psci-suspend-param = <0x1010000>;
entry-latency-us = <50>;
exit-latency-us = <100>;
min-residency-us = <250>;
wakeup-latency-us = <130>;
};
CPU_SLEEP_0_0: cpu-sleep-0-0 {
compatible = "arm,idle-state";
local-timer-stop;
arm,psci-suspend-param = <0x0010000>;
entry-latency-us = <250>;
exit-latency-us = <500>;
min-residency-us = <950>;
};
CLUSTER_SLEEP_0: cluster-sleep-0 {
compatible = "arm,idle-state";
local-timer-stop;
arm,psci-suspend-param = <0x1010000>;
entry-latency-us = <600>;
exit-latency-us = <1100>;
min-residency-us = <2700>;
wakeup-latency-us = <1500>;
};
CPU_RETENTION_1_0: cpu-retention-1-0 {
compatible = "arm,idle-state";
arm,psci-suspend-param = <0x0010000>;
entry-latency-us = <20>;
exit-latency-us = <40>;
min-residency-us = <90>;
};
CLUSTER_RETENTION_1: cluster-retention-1 {
compatible = "arm,idle-state";
local-timer-stop;
arm,psci-suspend-param = <0x1010000>;
entry-latency-us = <50>;
exit-latency-us = <100>;
min-residency-us = <270>;
wakeup-latency-us = <100>;
};
CPU_SLEEP_1_0: cpu-sleep-1-0 {
compatible = "arm,idle-state";
local-timer-stop;
arm,psci-suspend-param = <0x0010000>;
entry-latency-us = <70>;
exit-latency-us = <100>;
min-residency-us = <300>;
wakeup-latency-us = <150>;
};
CLUSTER_SLEEP_1: cluster-sleep-1 {
compatible = "arm,idle-state";
local-timer-stop;
arm,psci-suspend-param = <0x1010000>;
entry-latency-us = <500>;
exit-latency-us = <1200>;
min-residency-us = <3500>;
wakeup-latency-us = <1300>;
};
};
};
Example 2 (ARM 32-bit, 8-cpu system, two clusters):
cpus {
#size-cells = <0>;
#address-cells = <1>;
CPU0: cpu@0 {
device_type = "cpu";
compatible = "arm,cortex-a15";
reg = <0x0>;
cpu-idle-states = <&CPU_SLEEP_0_0 &CLUSTER_SLEEP_0>;
};
CPU1: cpu@1 {
device_type = "cpu";
compatible = "arm,cortex-a15";
reg = <0x1>;
cpu-idle-states = <&CPU_SLEEP_0_0 &CLUSTER_SLEEP_0>;
};
CPU2: cpu@2 {
device_type = "cpu";
compatible = "arm,cortex-a15";
reg = <0x2>;
cpu-idle-states = <&CPU_SLEEP_0_0 &CLUSTER_SLEEP_0>;
};
CPU3: cpu@3 {
device_type = "cpu";
compatible = "arm,cortex-a15";
reg = <0x3>;
cpu-idle-states = <&CPU_SLEEP_0_0 &CLUSTER_SLEEP_0>;
};
CPU4: cpu@100 {
device_type = "cpu";
compatible = "arm,cortex-a7";
reg = <0x100>;
cpu-idle-states = <&CPU_SLEEP_1_0 &CLUSTER_SLEEP_1>;
};
CPU5: cpu@101 {
device_type = "cpu";
compatible = "arm,cortex-a7";
reg = <0x101>;
cpu-idle-states = <&CPU_SLEEP_1_0 &CLUSTER_SLEEP_1>;
};
CPU6: cpu@102 {
device_type = "cpu";
compatible = "arm,cortex-a7";
reg = <0x102>;
cpu-idle-states = <&CPU_SLEEP_1_0 &CLUSTER_SLEEP_1>;
};
CPU7: cpu@103 {
device_type = "cpu";
compatible = "arm,cortex-a7";
reg = <0x103>;
cpu-idle-states = <&CPU_SLEEP_1_0 &CLUSTER_SLEEP_1>;
};
idle-states {
CPU_SLEEP_0_0: cpu-sleep-0-0 {
compatible = "arm,idle-state";
local-timer-stop;
entry-latency-us = <200>;
exit-latency-us = <100>;
min-residency-us = <400>;
wakeup-latency-us = <250>;
};
CLUSTER_SLEEP_0: cluster-sleep-0 {
compatible = "arm,idle-state";
local-timer-stop;
entry-latency-us = <500>;
exit-latency-us = <1500>;
min-residency-us = <2500>;
wakeup-latency-us = <1700>;
};
CPU_SLEEP_1_0: cpu-sleep-1-0 {
compatible = "arm,idle-state";
local-timer-stop;
entry-latency-us = <300>;
exit-latency-us = <500>;
min-residency-us = <900>;
wakeup-latency-us = <600>;
};
CLUSTER_SLEEP_1: cluster-sleep-1 {
compatible = "arm,idle-state";
local-timer-stop;
entry-latency-us = <800>;
exit-latency-us = <2000>;
min-residency-us = <6500>;
wakeup-latency-us = <2300>;
};
};
};
===========================================
5 - References
===========================================
[1] ARM Linux Kernel documentation - CPUs bindings
Documentation/devicetree/bindings/arm/cpus.yaml
[2] ARM Linux Kernel documentation - PSCI bindings
Documentation/devicetree/bindings/arm/psci.yaml
[3] ARM Server Base System Architecture (SBSA)
http://infocenter.arm.com/help/index.jsp
[4] ARM Architecture Reference Manuals
http://infocenter.arm.com/help/index.jsp
[5] Devicetree Specification
https://www.devicetree.org/specifications/
[6] ARM Linux Kernel documentation - Booting AArch64 Linux
Documentation/arm64/booting.rst

View File

@@ -0,0 +1,661 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/arm/idle-states.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: ARM idle states binding description
maintainers:
- Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
description: |+
==========================================
1 - Introduction
==========================================
ARM systems contain HW capable of managing power consumption dynamically,
where cores can be put in different low-power states (ranging from simple wfi
to power gating) according to OS PM policies. The CPU states representing the
range of dynamic idle states that a processor can enter at run-time, can be
specified through device tree bindings representing the parameters required to
enter/exit specific idle states on a given processor.
According to the Server Base System Architecture document (SBSA, [3]), the
power states an ARM CPU can be put into are identified by the following list:
- Running
- Idle_standby
- Idle_retention
- Sleep
- Off
The power states described in the SBSA document define the basic CPU states on
top of which ARM platforms implement power management schemes that allow an OS
PM implementation to put the processor in different idle states (which include
states listed above; "off" state is not an idle state since it does not have
wake-up capabilities, hence it is not considered in this document).
Idle state parameters (e.g. entry latency) are platform specific and need to
be characterized with bindings that provide the required information to OS PM
code so that it can build the required tables and use them at runtime.
The device tree binding definition for ARM idle states is the subject of this
document.
===========================================
2 - idle-states definitions
===========================================
Idle states are characterized for a specific system through a set of
timing and energy related properties, that underlie the HW behaviour
triggered upon idle states entry and exit.
The following diagram depicts the CPU execution phases and related timing
properties required to enter and exit an idle state:
..__[EXEC]__|__[PREP]__|__[ENTRY]__|__[IDLE]__|__[EXIT]__|__[EXEC]__..
| | | | |
|<------ entry ------->|
| latency |
|<- exit ->|
| latency |
|<-------- min-residency -------->|
|<------- wakeup-latency ------->|
Diagram 1: CPU idle state execution phases
EXEC: Normal CPU execution.
PREP: Preparation phase before committing the hardware to idle mode
like cache flushing. This is abortable on pending wake-up
event conditions. The abort latency is assumed to be negligible
(i.e. less than the ENTRY + EXIT duration). If aborted, CPU
goes back to EXEC. This phase is optional. If not abortable,
this should be included in the ENTRY phase instead.
ENTRY: The hardware is committed to idle mode. This period must run
to completion up to IDLE before anything else can happen.
IDLE: This is the actual energy-saving idle period. This may last
between 0 and infinite time, until a wake-up event occurs.
EXIT: Period during which the CPU is brought back to operational
mode (EXEC).
entry-latency: Worst case latency required to enter the idle state. The
exit-latency may be guaranteed only after entry-latency has passed.
min-residency: Minimum period, including preparation and entry, for a given
idle state to be worthwhile energywise.
wakeup-latency: Maximum delay between the signaling of a wake-up event and the
CPU being able to execute normal code again. If not specified, this is assumed
to be entry-latency + exit-latency.
These timing parameters can be used by an OS in different circumstances.
An idle CPU requires the expected min-residency time to select the most
appropriate idle state based on the expected expiry time of the next IRQ
(i.e. wake-up) that causes the CPU to return to the EXEC phase.
An operating system scheduler may need to compute the shortest wake-up delay
for CPUs in the system by detecting how long it will take to get a CPU out
of an idle state, e.g.:
wakeup-delay = exit-latency + max(entry-latency - (now - entry-timestamp), 0)
In other words, the scheduler can make its scheduling decision by selecting
(e.g. waking-up) the CPU with the shortest wake-up delay.
The wake-up delay must take into account the entry latency if that period
has not expired. The abortable nature of the PREP period can be ignored
if it cannot be relied upon (e.g. the PREP deadline may occur much sooner than
the worst case since it depends on the CPU operating conditions, i.e. caches
state).
An OS has to reliably probe the wakeup-latency since some devices can enforce
latency constraint guarantees to work properly, so the OS has to detect the
worst case wake-up latency it can incur if a CPU is allowed to enter an
idle state, and possibly prevent the CPU from entering that state in order to
guarantee reliable device functioning.
The min-residency time parameter deserves further explanation since it is
expressed in time units but must factor in energy consumption coefficients.
The energy consumption of a cpu when it enters a power state can be roughly
characterised by the following graph:
|
|
|
e |
n | /---
e | /------
r | /------
g | /-----
y | /------
| ----
| /|
| / |
| / |
| / |
| / |
| / |
|/ |
-----|-------+----------------------------------
0| 1 time(ms)
Graph 1: Energy vs time example
The graph is split in two parts delimited by time 1ms on the X-axis.
The graph curve with X-axis values = { x | 0 < x < 1ms } has a steep slope
and denotes the energy costs incurred while entering and leaving the idle
state.
The graph curve in the area delimited by X-axis values = {x | x > 1ms } has a
shallower slope and essentially represents the energy consumption of the idle
state.
min-residency is defined for a given idle state as the minimum expected
residency time for a state (inclusive of preparation and entry) after
which choosing that state becomes the most energy efficient option. A good
way to visualise this is by taking the same graph above and comparing some
states' energy consumption plots.
For sake of simplicity, let's consider a system with two idle states IDLE1,
and IDLE2:
|
|
|
| /-- IDLE1
e | /---
n | /----
e | /---
r | /-----/--------- IDLE2
g | /-------/---------
y | ------------ /---|
| / /---- |
| / /--- |
| / /---- |
| / /--- |
| --- |
| / |
| / |
|/ | time
---/----------------------------+------------------------
|IDLE1-energy < IDLE2-energy | IDLE2-energy < IDLE1-energy
|
IDLE2-min-residency
Graph 2: idle states min-residency example
In graph 2 above, that takes into account idle states entry/exit energy
costs, it is clear that if the idle state residency time (i.e. time till next
wake-up IRQ) is less than IDLE2-min-residency, IDLE1 is the better idle state
choice energywise.
This is mainly down to the fact that IDLE1 entry/exit energy costs are lower
than IDLE2.
However, the lower power consumption (i.e. shallower energy curve slope) of
idle state IDLE2 implies that after a suitable time, IDLE2 becomes more energy
efficient.
The time at which IDLE2 becomes more energy efficient than IDLE1 (and other
shallower states in a system with multiple idle states) is defined as
IDLE2-min-residency and corresponds to the time when energy consumption of
IDLE1 and IDLE2 states breaks even.
The definitions provided in this section underpin the idle states
properties specification that is the subject of the following sections.
===========================================
3 - idle-states node
===========================================
ARM processor idle states are defined within the idle-states node, which is
a direct child of the cpus node [1] and provides a container where the
processor idle states, defined as device tree nodes, are listed.
On ARM systems, it is a container of processor idle states nodes. If the
system does not provide CPU power management capabilities, or the processor
just supports idle_standby, an idle-states node is not required.
===========================================
4 - References
===========================================
[1] ARM Linux Kernel documentation - CPUs bindings
Documentation/devicetree/bindings/arm/cpus.yaml
[2] ARM Linux Kernel documentation - PSCI bindings
Documentation/devicetree/bindings/arm/psci.yaml
[3] ARM Server Base System Architecture (SBSA)
http://infocenter.arm.com/help/index.jsp
[4] ARM Architecture Reference Manuals
http://infocenter.arm.com/help/index.jsp
[6] ARM Linux Kernel documentation - Booting AArch64 Linux
Documentation/arm64/booting.rst
properties:
$nodename:
const: idle-states
entry-method:
description: |
Usage and definition depend on ARM architecture version.
On ARM v8 64-bit this property is required.
On ARM 32-bit systems this property is optional
This assumes that the "enable-method" property is set to "psci" in the cpu
node[6] that is responsible for setting up CPU idle management in the OS
implementation.
const: psci
patternProperties:
"^(cpu|cluster)-":
type: object
description: |
Each state node represents an idle state description and must be defined
as follows.
The idle state entered by executing the wfi instruction (idle_standby
SBSA,[3][4]) is considered standard on all ARM platforms and therefore
must not be listed.
In addition to the properties listed below, a state node may require
additional properties specific to the entry-method defined in the
idle-states node. Please refer to the entry-method bindings
documentation for properties definitions.
properties:
compatible:
const: arm,idle-state
local-timer-stop:
description:
If present the CPU local timer control logic is
lost on state entry, otherwise it is retained.
type: boolean
entry-latency-us:
description:
Worst case latency in microseconds required to enter the idle state.
exit-latency-us:
description:
Worst case latency in microseconds required to exit the idle state.
The exit-latency-us duration may be guaranteed only after
entry-latency-us has passed.
min-residency-us:
description:
Minimum residency duration in microseconds, inclusive of preparation
and entry, for this idle state to be considered worthwhile energy wise
(refer to section 2 of this document for a complete description).
wakeup-latency-us:
description: |
Maximum delay between the signaling of a wake-up event and the CPU
being able to execute normal code again. If omitted, this is assumed
to be equal to:
entry-latency-us + exit-latency-us
It is important to supply this value on systems where the duration of
PREP phase (see diagram 1, section 2) is non-negligible. In such
systems entry-latency-us + exit-latency-us will exceed
wakeup-latency-us by this duration.
idle-state-name:
$ref: /schemas/types.yaml#/definitions/string
description:
A string used as a descriptive name for the idle state.
required:
- compatible
- entry-latency-us
- exit-latency-us
- min-residency-us
additionalProperties: false
examples:
- |
cpus {
#size-cells = <0>;
#address-cells = <2>;
cpu@0 {
device_type = "cpu";
compatible = "arm,cortex-a57";
reg = <0x0 0x0>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
};
cpu@1 {
device_type = "cpu";
compatible = "arm,cortex-a57";
reg = <0x0 0x1>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
};
cpu@100 {
device_type = "cpu";
compatible = "arm,cortex-a57";
reg = <0x0 0x100>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
};
cpu@101 {
device_type = "cpu";
compatible = "arm,cortex-a57";
reg = <0x0 0x101>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
};
cpu@10000 {
device_type = "cpu";
compatible = "arm,cortex-a57";
reg = <0x0 0x10000>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
};
cpu@10001 {
device_type = "cpu";
compatible = "arm,cortex-a57";
reg = <0x0 0x10001>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
};
cpu@10100 {
device_type = "cpu";
compatible = "arm,cortex-a57";
reg = <0x0 0x10100>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
};
cpu@10101 {
device_type = "cpu";
compatible = "arm,cortex-a57";
reg = <0x0 0x10101>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
};
cpu@100000000 {
device_type = "cpu";
compatible = "arm,cortex-a53";
reg = <0x1 0x0>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
};
cpu@100000001 {
device_type = "cpu";
compatible = "arm,cortex-a53";
reg = <0x1 0x1>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
};
cpu@100000100 {
device_type = "cpu";
compatible = "arm,cortex-a53";
reg = <0x1 0x100>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
};
cpu@100000101 {
device_type = "cpu";
compatible = "arm,cortex-a53";
reg = <0x1 0x101>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
};
cpu@100010000 {
device_type = "cpu";
compatible = "arm,cortex-a53";
reg = <0x1 0x10000>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
};
cpu@100010001 {
device_type = "cpu";
compatible = "arm,cortex-a53";
reg = <0x1 0x10001>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
};
cpu@100010100 {
device_type = "cpu";
compatible = "arm,cortex-a53";
reg = <0x1 0x10100>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
};
cpu@100010101 {
device_type = "cpu";
compatible = "arm,cortex-a53";
reg = <0x1 0x10101>;
enable-method = "psci";
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
};
idle-states {
entry-method = "psci";
CPU_RETENTION_0_0: cpu-retention-0-0 {
compatible = "arm,idle-state";
arm,psci-suspend-param = <0x0010000>;
entry-latency-us = <20>;
exit-latency-us = <40>;
min-residency-us = <80>;
};
CLUSTER_RETENTION_0: cluster-retention-0 {
compatible = "arm,idle-state";
local-timer-stop;
arm,psci-suspend-param = <0x1010000>;
entry-latency-us = <50>;
exit-latency-us = <100>;
min-residency-us = <250>;
wakeup-latency-us = <130>;
};
CPU_SLEEP_0_0: cpu-sleep-0-0 {
compatible = "arm,idle-state";
local-timer-stop;
arm,psci-suspend-param = <0x0010000>;
entry-latency-us = <250>;
exit-latency-us = <500>;
min-residency-us = <950>;
};
CLUSTER_SLEEP_0: cluster-sleep-0 {
compatible = "arm,idle-state";
local-timer-stop;
arm,psci-suspend-param = <0x1010000>;
entry-latency-us = <600>;
exit-latency-us = <1100>;
min-residency-us = <2700>;
wakeup-latency-us = <1500>;
};
CPU_RETENTION_1_0: cpu-retention-1-0 {
compatible = "arm,idle-state";
arm,psci-suspend-param = <0x0010000>;
entry-latency-us = <20>;
exit-latency-us = <40>;
min-residency-us = <90>;
};
CLUSTER_RETENTION_1: cluster-retention-1 {
compatible = "arm,idle-state";
local-timer-stop;
arm,psci-suspend-param = <0x1010000>;
entry-latency-us = <50>;
exit-latency-us = <100>;
min-residency-us = <270>;
wakeup-latency-us = <100>;
};
CPU_SLEEP_1_0: cpu-sleep-1-0 {
compatible = "arm,idle-state";
local-timer-stop;
arm,psci-suspend-param = <0x0010000>;
entry-latency-us = <70>;
exit-latency-us = <100>;
min-residency-us = <300>;
wakeup-latency-us = <150>;
};
CLUSTER_SLEEP_1: cluster-sleep-1 {
compatible = "arm,idle-state";
local-timer-stop;
arm,psci-suspend-param = <0x1010000>;
entry-latency-us = <500>;
exit-latency-us = <1200>;
min-residency-us = <3500>;
wakeup-latency-us = <1300>;
};
};
};
- |
// Example 2 (ARM 32-bit, 8-cpu system, two clusters):
cpus {
#size-cells = <0>;
#address-cells = <1>;
cpu@0 {
device_type = "cpu";
compatible = "arm,cortex-a15";
reg = <0x0>;
cpu-idle-states = <&cpu_sleep_0_0 &cluster_sleep_0>;
};
cpu@1 {
device_type = "cpu";
compatible = "arm,cortex-a15";
reg = <0x1>;
cpu-idle-states = <&cpu_sleep_0_0 &cluster_sleep_0>;
};
cpu@2 {
device_type = "cpu";
compatible = "arm,cortex-a15";
reg = <0x2>;
cpu-idle-states = <&cpu_sleep_0_0 &cluster_sleep_0>;
};
cpu@3 {
device_type = "cpu";
compatible = "arm,cortex-a15";
reg = <0x3>;
cpu-idle-states = <&cpu_sleep_0_0 &cluster_sleep_0>;
};
cpu@100 {
device_type = "cpu";
compatible = "arm,cortex-a7";
reg = <0x100>;
cpu-idle-states = <&cpu_sleep_1_0 &cluster_sleep_1>;
};
cpu@101 {
device_type = "cpu";
compatible = "arm,cortex-a7";
reg = <0x101>;
cpu-idle-states = <&cpu_sleep_1_0 &cluster_sleep_1>;
};
cpu@102 {
device_type = "cpu";
compatible = "arm,cortex-a7";
reg = <0x102>;
cpu-idle-states = <&cpu_sleep_1_0 &cluster_sleep_1>;
};
cpu@103 {
device_type = "cpu";
compatible = "arm,cortex-a7";
reg = <0x103>;
cpu-idle-states = <&cpu_sleep_1_0 &cluster_sleep_1>;
};
idle-states {
cpu_sleep_0_0: cpu-sleep-0-0 {
compatible = "arm,idle-state";
local-timer-stop;
entry-latency-us = <200>;
exit-latency-us = <100>;
min-residency-us = <400>;
wakeup-latency-us = <250>;
};
cluster_sleep_0: cluster-sleep-0 {
compatible = "arm,idle-state";
local-timer-stop;
entry-latency-us = <500>;
exit-latency-us = <1500>;
min-residency-us = <2500>;
wakeup-latency-us = <1700>;
};
cpu_sleep_1_0: cpu-sleep-1-0 {
compatible = "arm,idle-state";
local-timer-stop;
entry-latency-us = <300>;
exit-latency-us = <500>;
min-residency-us = <900>;
wakeup-latency-us = <600>;
};
cluster_sleep_1: cluster-sleep-1 {
compatible = "arm,idle-state";
local-timer-stop;
entry-latency-us = <800>;
exit-latency-us = <2000>;
min-residency-us = <6500>;
wakeup-latency-us = <2300>;
};
};
};
...

View File

@@ -47,7 +47,7 @@ examples:
- | - |
#include <dt-bindings/interrupt-controller/arm-gic.h> #include <dt-bindings/interrupt-controller/arm-gic.h>
cache-controller@1100000 { system-cache-controller@1100000 {
compatible = "qcom,sdm845-llcc"; compatible = "qcom,sdm845-llcc";
reg = <0x1100000 0x200000>, <0x1300000 0x50000> ; reg = <0x1100000 0x200000>, <0x1300000 0x50000> ;
reg-names = "llcc_base", "llcc_broadcast_base"; reg-names = "llcc_base", "llcc_broadcast_base";

View File

@@ -102,6 +102,34 @@ properties:
[1] Kernel documentation - ARM idle states bindings [1] Kernel documentation - ARM idle states bindings
Documentation/devicetree/bindings/arm/idle-states.txt Documentation/devicetree/bindings/arm/idle-states.txt
"#power-domain-cells":
description:
The number of cells in a PM domain specifier as per binding in [3].
Must be 0 as to represent a single PM domain.
ARM systems can have multiple cores, sometimes in a hierarchical
arrangement. This often, but not always, maps directly to the processor
power topology of the system. Individual nodes in a topology have their
own specific power states and can be better represented hierarchically.
For these cases, the definitions of the idle states for the CPUs and the
CPU topology, must conform to the binding in [3]. The idle states
themselves must conform to the binding in [4] and must specify the
arm,psci-suspend-param property.
It should also be noted that, in PSCI firmware v1.0 the OS-Initiated
(OSI) CPU suspend mode is introduced. Using a hierarchical representation
helps to implement support for OSI mode and OS implementations may choose
to mandate it.
[3] Documentation/devicetree/bindings/power/power_domain.txt
[4] Documentation/devicetree/bindings/power/domain-idle-state.txt
power-domains:
$ref: '/schemas/types.yaml#/definitions/phandle-array'
description:
List of phandles and PM domain specifiers, as defined by bindings of the
PM domain provider.
required: required:
- compatible - compatible
@@ -160,4 +188,80 @@ examples:
cpu_on = <0x95c10002>; cpu_on = <0x95c10002>;
cpu_off = <0x95c10001>; cpu_off = <0x95c10001>;
}; };
- |+
// Case 4: CPUs and CPU idle states described using the hierarchical model.
cpus {
#size-cells = <0>;
#address-cells = <1>;
CPU0: cpu@0 {
device_type = "cpu";
compatible = "arm,cortex-a53", "arm,armv8";
reg = <0x0>;
enable-method = "psci";
power-domains = <&CPU_PD0>;
power-domain-names = "psci";
};
CPU1: cpu@1 {
device_type = "cpu";
compatible = "arm,cortex-a57", "arm,armv8";
reg = <0x100>;
enable-method = "psci";
power-domains = <&CPU_PD1>;
power-domain-names = "psci";
};
idle-states {
CPU_PWRDN: cpu-power-down {
compatible = "arm,idle-state";
arm,psci-suspend-param = <0x0000001>;
entry-latency-us = <10>;
exit-latency-us = <10>;
min-residency-us = <100>;
};
CLUSTER_RET: cluster-retention {
compatible = "domain-idle-state";
arm,psci-suspend-param = <0x1000011>;
entry-latency-us = <500>;
exit-latency-us = <500>;
min-residency-us = <2000>;
};
CLUSTER_PWRDN: cluster-power-down {
compatible = "domain-idle-state";
arm,psci-suspend-param = <0x1000031>;
entry-latency-us = <2000>;
exit-latency-us = <2000>;
min-residency-us = <6000>;
};
};
};
psci {
compatible = "arm,psci-1.0";
method = "smc";
CPU_PD0: cpu-pd0 {
#power-domain-cells = <0>;
domain-idle-states = <&CPU_PWRDN>;
power-domains = <&CLUSTER_PD>;
};
CPU_PD1: cpu-pd1 {
#power-domain-cells = <0>;
domain-idle-states = <&CPU_PWRDN>;
power-domains = <&CLUSTER_PD>;
};
CLUSTER_PD: cluster-pd {
#power-domain-cells = <0>;
domain-idle-states = <&CLUSTER_RET>, <&CLUSTER_PWRDN>;
};
};
... ...

View File

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0 # SPDX-License-Identifier: GPL-2.0
%YAML 1.2 %YAML 1.2
--- ---
$id: http://devicetree.org/schemas/bindings/arm/qcom.yaml# $id: http://devicetree.org/schemas/arm/qcom.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml#
title: QCOM device tree bindings title: QCOM device tree bindings
@@ -24,28 +24,30 @@ description: |
The 'SoC' element must be one of the following strings: The 'SoC' element must be one of the following strings:
apq8016 apq8016
apq8074 apq8074
apq8084 apq8084
apq8096 apq8096
msm8916 ipq8074
msm8974 mdm9615
msm8992 msm8916
msm8994 msm8974
msm8996 msm8992
mdm9615 msm8994
ipq8074 msm8996
sdm845 sc7180
sdm845
The 'board' element must be one of the following strings: The 'board' element must be one of the following strings:
cdp cdp
liquid dragonboard
dragonboard hk01
mtp idp
sbc liquid
hk01 mtp
qrd qrd
sbc
The 'soc_version' and 'board_version' elements take the form of v<Major>.<Minor> The 'soc_version' and 'board_version' elements take the form of v<Major>.<Minor>
where the minor number may be omitted when it's zero, i.e. v1.0 is the same where the minor number may be omitted when it's zero, i.e. v1.0 is the same
@@ -144,4 +146,8 @@ properties:
- qcom,ipq8074-hk01 - qcom,ipq8074-hk01
- const: qcom,ipq8074 - const: qcom,ipq8074
- items:
- enum:
- qcom,sc7180-idp
- const: qcom,sc7180
... ...

View File

@@ -409,6 +409,9 @@ properties:
- description: Pine64 RockPro64 - description: Pine64 RockPro64
items: items:
- enum:
- pine64,rockpro64-v2.1
- pine64,rockpro64-v2.0
- const: pine64,rockpro64 - const: pine64,rockpro64
- const: rockchip,rk3399 - const: rockchip,rk3399
@@ -422,6 +425,12 @@ properties:
- const: radxa,rockpi4 - const: radxa,rockpi4
- const: rockchip,rk3399 - const: rockchip,rk3399
- description: Radxa ROCK Pi N10
items:
- const: radxa,rockpi-n10
- const: vamrs,rk3399pro-vmarc-som
- const: rockchip,rk3399pro
- description: Radxa Rock2 Square - description: Radxa Rock2 Square
items: items:
- const: radxa,rock2-square - const: radxa,rock2-square

View File

@@ -2,7 +2,7 @@
# Copyright 2019 Unisoc Inc. # Copyright 2019 Unisoc Inc.
%YAML 1.2 %YAML 1.2
--- ---
$id: http://devicetree.org/schemas/arm/sprd.yaml# $id: http://devicetree.org/schemas/arm/sprd/sprd.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Unisoc platforms device tree bindings title: Unisoc platforms device tree bindings

View File

@@ -1,37 +0,0 @@
ML-AHB interconnect bindings
These bindings describe the STM32 SoCs ML-AHB interconnect bus which connects
a Cortex-M subsystem with dedicated memories.
The MCU SRAM and RETRAM memory parts can be accessed through different addresses
(see "RAM aliases" in [1]) using different buses (see [2]) : balancing the
Cortex-M firmware accesses among those ports allows to tune the system
performance.
[1]: https://www.st.com/resource/en/reference_manual/dm00327659.pdf
[2]: https://wiki.st.com/stm32mpu/wiki/STM32MP15_RAM_mapping
Required properties:
- compatible: should be "simple-bus"
- dma-ranges: describes memory addresses translation between the local CPU and
the remote Cortex-M processor. Each memory region, is declared with
3 parameters:
- param 1: device base address (Cortex-M processor address)
- param 2: physical base address (local CPU address)
- param 3: size of the memory region.
The Cortex-M remote processor accessed via the mlahb interconnect is described
by a child node.
Example:
mlahb {
compatible = "simple-bus";
#address-cells = <1>;
#size-cells = <1>;
dma-ranges = <0x00000000 0x38000000 0x10000>,
<0x10000000 0x10000000 0x60000>,
<0x30000000 0x30000000 0x60000>;
m4_rproc: m4@10000000 {
...
};
};

View File

@@ -0,0 +1,70 @@
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
%YAML 1.2
---
$id: "http://devicetree.org/schemas/arm/stm32/st,mlahb.yaml#"
$schema: "http://devicetree.org/meta-schemas/core.yaml#"
title: STMicroelectronics STM32 ML-AHB interconnect bindings
maintainers:
- Fabien Dessenne <fabien.dessenne@st.com>
- Arnaud Pouliquen <arnaud.pouliquen@st.com>
description: |
These bindings describe the STM32 SoCs ML-AHB interconnect bus which connects
a Cortex-M subsystem with dedicated memories. The MCU SRAM and RETRAM memory
parts can be accessed through different addresses (see "RAM aliases" in [1])
using different buses (see [2]): balancing the Cortex-M firmware accesses
among those ports allows to tune the system performance.
[1]: https://www.st.com/resource/en/reference_manual/dm00327659.pdf
[2]: https://wiki.st.com/stm32mpu/wiki/STM32MP15_RAM_mapping
allOf:
- $ref: /schemas/simple-bus.yaml#
properties:
compatible:
contains:
enum:
- st,mlahb
dma-ranges:
description: |
Describe memory addresses translation between the local CPU and the
remote Cortex-M processor. Each memory region is declared with
3 parameters:
- param 1: device base address (Cortex-M processor address)
- param 2: physical base address (local CPU address)
- param 3: size of the memory region.
maxItems: 3
'#address-cells':
const: 1
'#size-cells':
const: 1
required:
- compatible
- '#address-cells'
- '#size-cells'
- dma-ranges
examples:
- |
mlahb: ahb {
compatible = "st,mlahb", "simple-bus";
#address-cells = <1>;
#size-cells = <1>;
reg = <0x10000000 0x40000>;
ranges;
dma-ranges = <0x00000000 0x38000000 0x10000>,
<0x10000000 0x10000000 0x60000>,
<0x30000000 0x30000000 0x60000>;
m4_rproc: m4@10000000 {
reg = <0x10000000 0x40000>;
};
};
...

View File

@@ -0,0 +1,41 @@
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
%YAML 1.2
---
$id: "http://devicetree.org/schemas/arm/stm32/st,stm32-syscon.yaml#"
$schema: "http://devicetree.org/meta-schemas/core.yaml#"
title: STMicroelectronics STM32 Platforms System Controller bindings
maintainers:
- Alexandre Torgue <alexandre.torgue@st.com>
- Christophe Roullier <christophe.roullier@st.com>
properties:
compatible:
oneOf:
- items:
- enum:
- st,stm32mp157-syscfg
- const: syscon
reg:
maxItems: 1
clocks:
maxItems: 1
required:
- compatible
- reg
- clocks
examples:
- |
#include <dt-bindings/clock/stm32mp1-clks.h>
syscfg: syscon@50020000 {
compatible = "st,stm32mp157-syscfg", "syscon";
reg = <0x50020000 0x400>;
clocks = <&rcc SYSCFG>;
};
...

View File

@@ -1,16 +0,0 @@
STMicroelectronics STM32 Platforms System Controller
Properties:
- compatible : should contain two values. First value must be :
- " st,stm32mp157-syscfg " - for stm32mp157 based SoCs,
second value must be always "syscon".
- reg : offset and length of the register set.
- clocks: phandle to the syscfg clock
Example:
syscfg: syscon@50020000 {
compatible = "st,stm32mp157-syscfg", "syscon";
reg = <0x50020000 0x400>;
clocks = <&rcc SYSCFG>;
};

View File

@@ -342,6 +342,16 @@ properties:
- const: libretech,all-h3-cc-h5 - const: libretech,all-h3-cc-h5
- const: allwinner,sun50i-h5 - const: allwinner,sun50i-h5
- description: Libre Computer Board ALL-H3-IT H5
items:
- const: libretech,all-h3-it-h5
- const: allwinner,sun50i-h5
- description: Libre Computer Board ALL-H5-CC H5
items:
- const: libretech,all-h5-cc-h5
- const: allwinner,sun50i-h5
- description: Lichee Pi One - description: Lichee Pi One
items: items:
- const: licheepi,licheepi-one - const: licheepi,licheepi-one
@@ -470,6 +480,12 @@ properties:
- const: emlid,neutis-n5 - const: emlid,neutis-n5
- const: allwinner,sun50i-h5 - const: allwinner,sun50i-h5
- description: Emlid Neutis N5H3 Developer Board
items:
- const: emlid,neutis-n5h3-devboard
- const: emlid,neutis-n5h3
- const: allwinner,sun8i-h3
- description: NextThing Co. CHIP - description: NextThing Co. CHIP
items: items:
- const: nextthing,chip - const: nextthing,chip
@@ -599,11 +615,16 @@ properties:
- const: pine64,pine64-plus - const: pine64,pine64-plus
- const: allwinner,sun50i-a64 - const: allwinner,sun50i-a64
- description: Pine64 PineH64 - description: Pine64 PineH64 model A
items: items:
- const: pine64,pine-h64 - const: pine64,pine-h64
- const: allwinner,sun50i-h6 - const: allwinner,sun50i-h6
- description: Pine64 PineH64 model B
items:
- const: pine64,pine-h64-model-b
- const: allwinner,sun50i-h6
- description: Pine64 LTS - description: Pine64 LTS
items: items:
- const: pine64,pine64-lts - const: pine64,pine64-lts

View File

@@ -0,0 +1,65 @@
# SPDX-License-Identifier: GPL-2.0
%YAML 1.2
---
$id: http://devicetree.org/schemas/arm/sunxi/allwinner,sun4i-a10-mbus.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Allwinner Memory Bus (MBUS) controller
maintainers:
- Chen-Yu Tsai <wens@csie.org>
- Maxime Ripard <mripard@kernel.org>
description: |
The MBUS controller drives the MBUS that other devices in the SoC
will use to perform DMA. It also has a register interface that
allows to monitor and control the bandwidth and priorities for
masters on that bus.
Each device having to perform their DMA through the MBUS must have
the interconnects and interconnect-names properties set to the MBUS
controller and with "dma-mem" as the interconnect name.
properties:
"#interconnect-cells":
const: 1
description:
The content of the cell is the MBUS ID.
compatible:
enum:
- allwinner,sun5i-a13-mbus
- allwinner,sun8i-h3-mbus
reg:
maxItems: 1
clocks:
maxItems: 1
dma-ranges:
description:
See section 2.3.9 of the DeviceTree Specification.
required:
- "#interconnect-cells"
- compatible
- reg
- clocks
- dma-ranges
additionalProperties: false
examples:
- |
#include <dt-bindings/clock/sun5i-ccu.h>
mbus: dram-controller@1c01000 {
compatible = "allwinner,sun5i-a13-mbus";
reg = <0x01c01000 0x1000>;
clocks = <&ccu CLK_MBUS>;
dma-ranges = <0x00000000 0x40000000 0x20000000>;
#interconnect-cells = <1>;
};
...

View File

@@ -1,37 +0,0 @@
Allwinner Memory Bus (MBUS) controller
The MBUS controller drives the MBUS that other devices in the SoC will
use to perform DMA. It also has a register interface that allows to
monitor and control the bandwidth and priorities for masters on that
bus.
Required properties:
- compatible: Must be one of:
- allwinner,sun5i-a13-mbus
- allwinner,sun8i-h3-mbus
- reg: Offset and length of the register set for the controller
- clocks: phandle to the clock driving the controller
- dma-ranges: See section 2.3.9 of the DeviceTree Specification
- #interconnect-cells: Must be one, with the argument being the MBUS
port ID
Each device having to perform their DMA through the MBUS must have the
interconnects and interconnect-names properties set to the MBUS
controller and with "dma-mem" as the interconnect name.
Example:
mbus: dram-controller@1c01000 {
compatible = "allwinner,sun5i-a13-mbus";
reg = <0x01c01000 0x1000>;
clocks = <&ccu CLK_MBUS>;
dma-ranges = <0x00000000 0x40000000 0x20000000>;
#interconnect-cells = <1>;
};
fe0: display-frontend@1e00000 {
compatible = "allwinner,sun5i-a13-display-frontend";
...
interconnects = <&mbus 19>;
interconnect-names = "dma-mem";
};

View File

@@ -0,0 +1,36 @@
# SPDX-License-Identifier: GPL-2.0-only
%YAML 1.2
---
$id: http://devicetree.org/schemas/arm/ux500.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Ux500 platforms device tree bindings
maintainers:
- Linus Walleij <linus.walleij@linaro.org>
properties:
$nodename:
const: '/'
compatible:
oneOf:
- description: ST-Ericsson HREF (pre-v60)
items:
- const: st-ericsson,mop500
- const: st-ericsson,u8500
- description: ST-Ericsson HREF (v60+)
items:
- const: st-ericsson,hrefv60+
- const: st-ericsson,u8500
- description: Calao Systems Snowball
items:
- const: calaosystems,snowball-a9500
- const: st-ericsson,u9500
- description: Samsung Galaxy S III mini (GT-I8190)
items:
- const: samsung,golden
- const: st-ericsson,u8500

View File

@@ -9,8 +9,6 @@ PHYs.
Required properties: Required properties:
- compatible : compatible string, one of: - compatible : compatible string, one of:
- "allwinner,sun4i-a10-ahci"
- "allwinner,sun8i-r40-ahci"
- "brcm,iproc-ahci" - "brcm,iproc-ahci"
- "hisilicon,hisi-ahci" - "hisilicon,hisi-ahci"
- "cavium,octeon-7130-ahci" - "cavium,octeon-7130-ahci"
@@ -45,8 +43,6 @@ Required properties when using sub-nodes:
- #address-cells : number of cells to encode an address - #address-cells : number of cells to encode an address
- #size-cells : number of cells representing the size of an address - #size-cells : number of cells representing the size of an address
For allwinner,sun8i-r40-ahci, the reset property must be present.
Sub-nodes required properties: Sub-nodes required properties:
- reg : the port number - reg : the port number
And at least one of the following properties: And at least one of the following properties:
@@ -60,14 +56,6 @@ Examples:
interrupts = <115>; interrupts = <115>;
}; };
ahci: sata@1c18000 {
compatible = "allwinner,sun4i-a10-ahci";
reg = <0x01c18000 0x1000>;
interrupts = <56>;
clocks = <&pll6 0>, <&ahb_gates 25>;
target-supply = <&reg_ahci_5v>;
};
With sub-nodes: With sub-nodes:
sata@f7e90000 { sata@f7e90000 {
compatible = "marvell,berlin2q-achi", "generic-ahci"; compatible = "marvell,berlin2q-achi", "generic-ahci";

View File

@@ -0,0 +1,47 @@
# SPDX-License-Identifier: GPL-2.0
%YAML 1.2
---
$id: http://devicetree.org/schemas/ata/allwinner,sun4i-a10-ahci.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Allwinner A10 AHCI SATA Controller bindings
maintainers:
- Chen-Yu Tsai <wens@csie.org>
- Maxime Ripard <mripard@kernel.org>
properties:
compatible:
const: allwinner,sun4i-a10-ahci
reg:
maxItems: 1
clocks:
items:
- description: AHCI Bus Clock
- description: AHCI Module Clock
interrupts:
maxItems: 1
target-supply:
description: Regulator for SATA target power
required:
- compatible
- reg
- clocks
- interrupts
additionalProperties: false
examples:
- |
ahci: sata@1c18000 {
compatible = "allwinner,sun4i-a10-ahci";
reg = <0x01c18000 0x1000>;
interrupts = <56>;
clocks = <&pll6 0>, <&ahb_gates 25>;
target-supply = <&reg_ahci_5v>;
};

View File

@@ -0,0 +1,67 @@
# SPDX-License-Identifier: GPL-2.0
%YAML 1.2
---
$id: http://devicetree.org/schemas/ata/allwinner,sun8i-r40-ahci.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Allwinner R40 AHCI SATA Controller bindings
maintainers:
- Chen-Yu Tsai <wens@csie.org>
- Maxime Ripard <mripard@kernel.org>
properties:
compatible:
const: allwinner,sun8i-r40-ahci
reg:
maxItems: 1
clocks:
items:
- description: AHCI Bus Clock
- description: AHCI Module Clock
interrupts:
maxItems: 1
resets:
maxItems: 1
reset-names:
const: ahci
ahci-supply:
description: Regulator for the AHCI controller
phy-supply:
description: Regulator for the SATA PHY power
required:
- compatible
- reg
- clocks
- interrupts
- resets
- reset-names
additionalProperties: false
examples:
- |
#include <dt-bindings/interrupt-controller/arm-gic.h>
#include <dt-bindings/clock/sun8i-r40-ccu.h>
#include <dt-bindings/reset/sun8i-r40-ccu.h>
ahci: sata@1c18000 {
compatible = "allwinner,sun8i-r40-ahci";
reg = <0x01c18000 0x1000>;
interrupts = <GIC_SPI 56 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&ccu CLK_BUS_SATA>, <&ccu CLK_SATA>;
resets = <&ccu RST_BUS_SATA>;
reset-names = "ahci";
ahci-supply = <&reg_dldo4>;
phy-supply = <&reg_eldo3>;
};
...

View File

@@ -5,6 +5,7 @@ Each SATA controller should have its own node.
Required properties: Required properties:
- compatible : should be one or more of - compatible : should be one or more of
"brcm,bcm7216-ahci"
"brcm,bcm7425-ahci" "brcm,bcm7425-ahci"
"brcm,bcm7445-ahci" "brcm,bcm7445-ahci"
"brcm,bcm-nsp-ahci" "brcm,bcm-nsp-ahci"
@@ -14,6 +15,12 @@ Required properties:
- reg-names : "ahci" and "top-ctrl" - reg-names : "ahci" and "top-ctrl"
- interrupts : interrupt mapping for SATA IRQ - interrupts : interrupt mapping for SATA IRQ
Optional properties:
- reset: for "brcm,bcm7216-ahci" must be a valid reset phandle
pointing to the RESCAL reset controller provider node.
- reset-names: for "brcm,bcm7216-ahci", must be "rescal".
Also see ahci-platform.txt. Also see ahci-platform.txt.
Example: Example:

View File

@@ -1,38 +0,0 @@
* Faraday Technology FTIDE010 PATA controller
This controller is the first Faraday IDE interface block, used in the
StorLink SL2312 and SL3516, later known as the Cortina Systems Gemini
platform. The controller can do PIO modes 0 through 4, Multi-word DMA
(MWDM)modes 0 through 2 and Ultra DMA modes 0 through 6.
On the Gemini platform, this PATA block is accompanied by a PATA to
SATA bridge in order to support SATA. This is why a phandle to that
controller is compulsory on that platform.
The timing properties are unique per-SoC, not per-board.
Required properties:
- compatible: should be one of
"cortina,gemini-pata", "faraday,ftide010"
"faraday,ftide010"
- interrupts: interrupt for the block
- reg: registers and size for the block
Optional properties:
- clocks: a SoC clock running the peripheral.
- clock-names: should be set to "PCLK" for the peripheral clock.
Required properties for "cortina,gemini-pata" compatible:
- sata: a phandle to the Gemini PATA to SATA bridge, see
cortina,gemini-sata-bridge.txt for details.
Example:
ata@63000000 {
compatible = "cortina,gemini-pata", "faraday,ftide010";
reg = <0x63000000 0x100>;
interrupts = <4 IRQ_TYPE_EDGE_RISING>;
clocks = <&gcc GEMINI_CLK_GATE_IDE>;
clock-names = "PCLK";
sata = <&sata>;
};

View File

@@ -0,0 +1,89 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/ata/faraday,ftide010.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Faraday Technology FTIDE010 PATA controller
maintainers:
- Linus Walleij <linus.walleij@linaro.org>
description: |
This controller is the first Faraday IDE interface block, used in the
StorLink SL3512 and SL3516, later known as the Cortina Systems Gemini
platform. The controller can do PIO modes 0 through 4, Multi-word DMA
(MWDM) modes 0 through 2 and Ultra DMA modes 0 through 6.
On the Gemini platform, this PATA block is accompanied by a PATA to
SATA bridge in order to support SATA. This is why a phandle to that
controller is compulsory on that platform.
The timing properties are unique per-SoC, not per-board.
properties:
compatible:
oneOf:
- const: faraday,ftide010
- items:
- const: cortina,gemini-pata
- const: faraday,ftide010
reg:
maxItems: 1
interrupts:
maxItems: 1
clocks:
minItems: 1
clock-names:
const: PCLK
sata:
description:
phandle to the Gemini PATA to SATA bridge, if available
$ref: /schemas/types.yaml#/definitions/phandle
required:
- compatible
- reg
- interrupts
allOf:
- $ref: pata-common.yaml#
- if:
properties:
compatible:
contains:
const: cortina,gemini-pata
then:
required:
- sata
examples:
- |
#include <dt-bindings/interrupt-controller/irq.h>
#include <dt-bindings/clock/cortina,gemini-clock.h>
ide@63000000 {
compatible = "cortina,gemini-pata", "faraday,ftide010";
reg = <0x63000000 0x100>;
interrupts = <4 IRQ_TYPE_EDGE_RISING>;
clocks = <&gcc GEMINI_CLK_GATE_IDE>;
clock-names = "PCLK";
sata = <&sata>;
#address-cells = <1>;
#size-cells = <0>;
ide-port@0 {
reg = <0>;
};
ide-port@1 {
reg = <1>;
};
};
...

View File

@@ -0,0 +1,50 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/ata/pata-common.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Common Properties for Parallel AT attachment (PATA) controllers
maintainers:
- Linus Walleij <linus.walleij@linaro.org>
description: |
This document defines device tree properties common to most Parallel
ATA (PATA, also known as IDE) AT attachment storage devices.
It doesn't constitute a device tree binding specification by itself but is
meant to be referenced by device tree bindings.
The PATA (IDE) controller-specific device tree bindings are responsible for
defining whether each property is required or optional.
properties:
$nodename:
pattern: "^ide(@.*)?$"
description:
Specifies the host controller node. PATA host controller nodes are named
"ide".
"#address-cells":
const: 1
"#size-cells":
const: 0
patternProperties:
"^ide-port@[0-1]$":
description: |
DT nodes for ports connected on the PATA host. The master drive will have
ID number 0 and the slave drive will have ID number 1. The PATA port
nodes will be named "ide-port".
type: object
properties:
reg:
minimum: 0
maximum: 1
description:
The ID number of the drive port, 0 for the master port and 1 for the
slave port.
...

View File

@@ -0,0 +1,50 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/ata/sata-common.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Common Properties for Serial AT attachment (SATA) controllers
maintainers:
- Linus Walleij <linus.walleij@linaro.org>
description: |
This document defines device tree properties common to most Serial
AT attachment (SATA) storage devices. It doesn't constitute a device tree
binding specification by itself but is meant to be referenced by device
tree bindings.
The SATA controller-specific device tree bindings are responsible for
defining whether each property is required or optional.
properties:
$nodename:
pattern: "^sata(@.*)?$"
description:
Specifies the host controller node. SATA host controller nodes are named
"sata"
"#address-cells":
const: 1
"#size-cells":
const: 0
patternProperties:
"^sata-port@[0-9a-e]$":
description: |
DT nodes for ports connected on the SATA host. The SATA port
nodes will be named "sata-port".
type: object
properties:
reg:
minimum: 0
maximum: 14
description:
The ID number of the drive port. SATA can potentially use a port
multiplier making it possible to connect up to 15 disks to a single
SATA port.
...

View File

@@ -0,0 +1,108 @@
# SPDX-License-Identifier: GPL-2.0
%YAML 1.2
---
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-ahb-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Allwinner A10 AHB Clock Device Tree Bindings
maintainers:
- Chen-Yu Tsai <wens@csie.org>
- Maxime Ripard <mripard@kernel.org>
deprecated: true
properties:
"#clock-cells":
const: 0
compatible:
enum:
- allwinner,sun4i-a10-ahb-clk
- allwinner,sun6i-a31-ahb1-clk
- allwinner,sun8i-h3-ahb2-clk
reg:
maxItems: 1
clocks:
minItems: 1
maxItems: 4
description: >
The parent order must match the hardware programming order.
clock-output-names:
maxItems: 1
required:
- "#clock-cells"
- compatible
- reg
- clocks
- clock-output-names
additionalProperties: false
allOf:
- if:
properties:
compatible:
contains:
const: allwinner,sun4i-a10-ahb-clk
then:
properties:
clocks:
maxItems: 1
- if:
properties:
compatible:
contains:
const: allwinner,sun6i-a31-ahb1-clk
then:
properties:
clocks:
maxItems: 4
- if:
properties:
compatible:
contains:
const: allwinner,sun8i-h3-ahb2-clk
then:
properties:
clocks:
maxItems: 2
examples:
- |
ahb@1c20054 {
#clock-cells = <0>;
compatible = "allwinner,sun4i-a10-ahb-clk";
reg = <0x01c20054 0x4>;
clocks = <&axi>;
clock-output-names = "ahb";
};
- |
ahb1@1c20054 {
#clock-cells = <0>;
compatible = "allwinner,sun6i-a31-ahb1-clk";
reg = <0x01c20054 0x4>;
clocks = <&osc32k>, <&osc24M>, <&axi>, <&pll6 0>;
clock-output-names = "ahb1";
};
- |
ahb2_clk@1c2005c {
#clock-cells = <0>;
compatible = "allwinner,sun8i-h3-ahb2-clk";
reg = <0x01c2005c 0x4>;
clocks = <&ahb1>, <&pll6d2>;
clock-output-names = "ahb2";
};
...

View File

@@ -0,0 +1,50 @@
# SPDX-License-Identifier: GPL-2.0
%YAML 1.2
---
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-apb0-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Allwinner A10 APB0 Bus Clock Device Tree Bindings
maintainers:
- Chen-Yu Tsai <wens@csie.org>
- Maxime Ripard <mripard@kernel.org>
deprecated: true
properties:
"#clock-cells":
const: 0
compatible:
const: allwinner,sun4i-a10-apb0-clk
reg:
maxItems: 1
clocks:
maxItems: 1
clock-output-names:
maxItems: 1
required:
- "#clock-cells"
- compatible
- reg
- clocks
- clock-output-names
additionalProperties: false
examples:
- |
apb0@1c20054 {
#clock-cells = <0>;
compatible = "allwinner,sun4i-a10-apb0-clk";
reg = <0x01c20054 0x4>;
clocks = <&ahb>;
clock-output-names = "apb0";
};
...

View File

@@ -0,0 +1,52 @@
# SPDX-License-Identifier: GPL-2.0
%YAML 1.2
---
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-apb1-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Allwinner A10 APB1 Bus Clock Device Tree Bindings
maintainers:
- Chen-Yu Tsai <wens@csie.org>
- Maxime Ripard <mripard@kernel.org>
deprecated: true
properties:
"#clock-cells":
const: 0
compatible:
const: allwinner,sun4i-a10-apb1-clk
reg:
maxItems: 1
clocks:
maxItems: 3
description: >
The parent order must match the hardware programming order.
clock-output-names:
maxItems: 1
required:
- "#clock-cells"
- compatible
- reg
- clocks
- clock-output-names
additionalProperties: false
examples:
- |
clk@1c20058 {
#clock-cells = <0>;
compatible = "allwinner,sun4i-a10-apb1-clk";
reg = <0x01c20058 0x4>;
clocks = <&osc24M>, <&pll6 1>, <&osc32k>;
clock-output-names = "apb1";
};
...

View File

@@ -0,0 +1,61 @@
# SPDX-License-Identifier: GPL-2.0
%YAML 1.2
---
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-axi-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Allwinner A10 AXI Clock Device Tree Bindings
maintainers:
- Chen-Yu Tsai <wens@csie.org>
- Maxime Ripard <mripard@kernel.org>
deprecated: true
properties:
"#clock-cells":
const: 0
compatible:
enum:
- allwinner,sun4i-a10-axi-clk
- allwinner,sun8i-a23-axi-clk
reg:
maxItems: 1
clocks:
maxItems: 1
clock-output-names:
maxItems: 1
required:
- "#clock-cells"
- compatible
- reg
- clocks
- clock-output-names
additionalProperties: false
examples:
- |
axi@1c20054 {
#clock-cells = <0>;
compatible = "allwinner,sun4i-a10-axi-clk";
reg = <0x01c20054 0x4>;
clocks = <&cpu>;
clock-output-names = "axi";
};
- |
axi_clk@1c20050 {
#clock-cells = <0>;
compatible = "allwinner,sun8i-a23-axi-clk";
reg = <0x01c20050 0x4>;
clocks = <&cpu>;
clock-output-names = "axi";
};
...

View File

@@ -0,0 +1,52 @@
# SPDX-License-Identifier: GPL-2.0
%YAML 1.2
---
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-cpu-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Allwinner A10 CPU Clock Device Tree Bindings
maintainers:
- Chen-Yu Tsai <wens@csie.org>
- Maxime Ripard <mripard@kernel.org>
deprecated: true
properties:
"#clock-cells":
const: 0
compatible:
const: allwinner,sun4i-a10-cpu-clk
reg:
maxItems: 1
clocks:
maxItems: 4
description: >
The parent order must match the hardware programming order.
clock-output-names:
maxItems: 1
required:
- "#clock-cells"
- compatible
- reg
- clocks
- clock-output-names
additionalProperties: false
examples:
- |
cpu@1c20054 {
#clock-cells = <0>;
compatible = "allwinner,sun4i-a10-cpu-clk";
reg = <0x01c20054 0x4>;
clocks = <&osc32k>, <&osc24M>, <&pll1>, <&dummy>;
clock-output-names = "cpu";
};
...

Some files were not shown because too many files have changed in this diff Show More