mirror of
https://github.com/tbsdtv/linux_media.git
synced 2025-07-23 12:43:29 +02:00
Merge branch 'for-5.7/appleir' into for-linus
- small code cleanups in hid-appleir from Lucas Tanure
This commit is contained in:
4
.gitignore
vendored
4
.gitignore
vendored
@@ -100,6 +100,10 @@ modules.order
|
|||||||
/include/ksym/
|
/include/ksym/
|
||||||
/arch/*/include/generated/
|
/arch/*/include/generated/
|
||||||
|
|
||||||
|
# Generated lkdtm tests
|
||||||
|
/tools/testing/selftests/lkdtm/*.sh
|
||||||
|
!/tools/testing/selftests/lkdtm/run.sh
|
||||||
|
|
||||||
# stgit generated dirs
|
# stgit generated dirs
|
||||||
patches-*
|
patches-*
|
||||||
|
|
||||||
|
11
.mailmap
11
.mailmap
@@ -18,6 +18,7 @@ Aleksey Gorelov <aleksey_gorelov@phoenix.com>
|
|||||||
Aleksandar Markovic <aleksandar.markovic@mips.com> <aleksandar.markovic@imgtec.com>
|
Aleksandar Markovic <aleksandar.markovic@mips.com> <aleksandar.markovic@imgtec.com>
|
||||||
Alex Shi <alex.shi@linux.alibaba.com> <alex.shi@intel.com>
|
Alex Shi <alex.shi@linux.alibaba.com> <alex.shi@intel.com>
|
||||||
Alex Shi <alex.shi@linux.alibaba.com> <alex.shi@linaro.org>
|
Alex Shi <alex.shi@linux.alibaba.com> <alex.shi@linaro.org>
|
||||||
|
Alexandre Belloni <alexandre.belloni@bootlin.com> <alexandre.belloni@free-electrons.com>
|
||||||
Alexei Starovoitov <ast@kernel.org> <ast@plumgrid.com>
|
Alexei Starovoitov <ast@kernel.org> <ast@plumgrid.com>
|
||||||
Alexei Starovoitov <ast@kernel.org> <alexei.starovoitov@gmail.com>
|
Alexei Starovoitov <ast@kernel.org> <alexei.starovoitov@gmail.com>
|
||||||
Alexei Starovoitov <ast@kernel.org> <ast@fb.com>
|
Alexei Starovoitov <ast@kernel.org> <ast@fb.com>
|
||||||
@@ -27,6 +28,8 @@ Andi Shyti <andi@etezian.org> <andi.shyti@samsung.com>
|
|||||||
Andreas Herrmann <aherrman@de.ibm.com>
|
Andreas Herrmann <aherrman@de.ibm.com>
|
||||||
Andrey Ryabinin <ryabinin.a.a@gmail.com> <a.ryabinin@samsung.com>
|
Andrey Ryabinin <ryabinin.a.a@gmail.com> <a.ryabinin@samsung.com>
|
||||||
Andrew Morton <akpm@linux-foundation.org>
|
Andrew Morton <akpm@linux-foundation.org>
|
||||||
|
Andrew Murray <amurray@thegoodpenguin.co.uk> <andrew.murray@arm.com>
|
||||||
|
Andrew Murray <amurray@thegoodpenguin.co.uk> <amurray@embedded-bits.co.uk>
|
||||||
Andrew Vasquez <andrew.vasquez@qlogic.com>
|
Andrew Vasquez <andrew.vasquez@qlogic.com>
|
||||||
Andy Adamson <andros@citi.umich.edu>
|
Andy Adamson <andros@citi.umich.edu>
|
||||||
Antoine Tenart <antoine.tenart@free-electrons.com>
|
Antoine Tenart <antoine.tenart@free-electrons.com>
|
||||||
@@ -74,6 +77,7 @@ Dmitry Safonov <0x7f454c46@gmail.com> <dima@arista.com>
|
|||||||
Domen Puncer <domen@coderock.org>
|
Domen Puncer <domen@coderock.org>
|
||||||
Douglas Gilbert <dougg@torque.net>
|
Douglas Gilbert <dougg@torque.net>
|
||||||
Ed L. Cashin <ecashin@coraid.com>
|
Ed L. Cashin <ecashin@coraid.com>
|
||||||
|
Erik Kaneda <erik.kaneda@intel.com> <erik.schmauss@intel.com>
|
||||||
Evgeniy Polyakov <johnpol@2ka.mipt.ru>
|
Evgeniy Polyakov <johnpol@2ka.mipt.ru>
|
||||||
Felipe W Damasio <felipewd@terra.com.br>
|
Felipe W Damasio <felipewd@terra.com.br>
|
||||||
Felix Kuhling <fxkuehl@gmx.de>
|
Felix Kuhling <fxkuehl@gmx.de>
|
||||||
@@ -138,6 +142,7 @@ Juha Yrjola <at solidboot.com>
|
|||||||
Juha Yrjola <juha.yrjola@nokia.com>
|
Juha Yrjola <juha.yrjola@nokia.com>
|
||||||
Juha Yrjola <juha.yrjola@solidboot.com>
|
Juha Yrjola <juha.yrjola@solidboot.com>
|
||||||
Julien Thierry <julien.thierry.kdev@gmail.com> <julien.thierry@arm.com>
|
Julien Thierry <julien.thierry.kdev@gmail.com> <julien.thierry@arm.com>
|
||||||
|
Kamil Konieczny <k.konieczny@samsung.com> <k.konieczny@partner.samsung.com>
|
||||||
Kay Sievers <kay.sievers@vrfy.org>
|
Kay Sievers <kay.sievers@vrfy.org>
|
||||||
Kenneth W Chen <kenneth.w.chen@intel.com>
|
Kenneth W Chen <kenneth.w.chen@intel.com>
|
||||||
Konstantin Khlebnikov <koct9i@gmail.com> <k.khlebnikov@samsung.com>
|
Konstantin Khlebnikov <koct9i@gmail.com> <k.khlebnikov@samsung.com>
|
||||||
@@ -209,6 +214,10 @@ Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
|
|||||||
Patrick Mochel <mochel@digitalimplant.org>
|
Patrick Mochel <mochel@digitalimplant.org>
|
||||||
Paul Burton <paulburton@kernel.org> <paul.burton@imgtec.com>
|
Paul Burton <paulburton@kernel.org> <paul.burton@imgtec.com>
|
||||||
Paul Burton <paulburton@kernel.org> <paul.burton@mips.com>
|
Paul Burton <paulburton@kernel.org> <paul.burton@mips.com>
|
||||||
|
Paul E. McKenney <paulmck@kernel.org> <paulmck@linux.ibm.com>
|
||||||
|
Paul E. McKenney <paulmck@kernel.org> <paulmck@linux.vnet.ibm.com>
|
||||||
|
Paul E. McKenney <paulmck@kernel.org> <paul.mckenney@linaro.org>
|
||||||
|
Paul E. McKenney <paulmck@kernel.org> <paulmck@us.ibm.com>
|
||||||
Peter A Jonsson <pj@ludd.ltu.se>
|
Peter A Jonsson <pj@ludd.ltu.se>
|
||||||
Peter Oruba <peter@oruba.de>
|
Peter Oruba <peter@oruba.de>
|
||||||
Peter Oruba <peter.oruba@amd.com>
|
Peter Oruba <peter.oruba@amd.com>
|
||||||
@@ -217,6 +226,7 @@ Praveen BP <praveenbp@ti.com>
|
|||||||
Punit Agrawal <punitagrawal@gmail.com> <punit.agrawal@arm.com>
|
Punit Agrawal <punitagrawal@gmail.com> <punit.agrawal@arm.com>
|
||||||
Qais Yousef <qsyousef@gmail.com> <qais.yousef@imgtec.com>
|
Qais Yousef <qsyousef@gmail.com> <qais.yousef@imgtec.com>
|
||||||
Quentin Perret <qperret@qperret.net> <quentin.perret@arm.com>
|
Quentin Perret <qperret@qperret.net> <quentin.perret@arm.com>
|
||||||
|
Rafael J. Wysocki <rjw@rjwysocki.net> <rjw@sisk.pl>
|
||||||
Rajesh Shah <rajesh.shah@intel.com>
|
Rajesh Shah <rajesh.shah@intel.com>
|
||||||
Ralf Baechle <ralf@linux-mips.org>
|
Ralf Baechle <ralf@linux-mips.org>
|
||||||
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
|
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
|
||||||
@@ -252,6 +262,7 @@ Sumit Semwal <sumit.semwal@ti.com>
|
|||||||
Tejun Heo <htejun@gmail.com>
|
Tejun Heo <htejun@gmail.com>
|
||||||
Thomas Graf <tgraf@suug.ch>
|
Thomas Graf <tgraf@suug.ch>
|
||||||
Thomas Pedersen <twp@codeaurora.org>
|
Thomas Pedersen <twp@codeaurora.org>
|
||||||
|
Tiezhu Yang <yangtiezhu@loongson.cn> <kernelpatch@126.com>
|
||||||
Todor Tomov <todor.too@gmail.com> <todor.tomov@linaro.org>
|
Todor Tomov <todor.too@gmail.com> <todor.tomov@linaro.org>
|
||||||
Tony Luck <tony.luck@intel.com>
|
Tony Luck <tony.luck@intel.com>
|
||||||
TripleX Chung <xxx.phy@gmail.com> <zhongyu@18mail.cn>
|
TripleX Chung <xxx.phy@gmail.com> <zhongyu@18mail.cn>
|
||||||
|
2
COPYING
2
COPYING
@@ -16,3 +16,5 @@ In addition, other licenses may also apply. Please see:
|
|||||||
Documentation/process/license-rules.rst
|
Documentation/process/license-rules.rst
|
||||||
|
|
||||||
for more details.
|
for more details.
|
||||||
|
|
||||||
|
All contributions to the Linux Kernel are subject to this COPYING file.
|
||||||
|
9
CREDITS
9
CREDITS
@@ -567,6 +567,11 @@ D: Original author of Amiga FFS filesystem
|
|||||||
S: Orlando, Florida
|
S: Orlando, Florida
|
||||||
S: USA
|
S: USA
|
||||||
|
|
||||||
|
N: Paul Burton
|
||||||
|
E: paulburton@kernel.org
|
||||||
|
W: https://pburton.com
|
||||||
|
D: MIPS maintainer 2018-2020
|
||||||
|
|
||||||
N: Lennert Buytenhek
|
N: Lennert Buytenhek
|
||||||
E: kernel@wantstofly.org
|
E: kernel@wantstofly.org
|
||||||
D: Original (2.4) rewrite of the ethernet bridging code
|
D: Original (2.4) rewrite of the ethernet bridging code
|
||||||
@@ -3302,7 +3307,9 @@ S: France
|
|||||||
N: Aleksa Sarai
|
N: Aleksa Sarai
|
||||||
E: cyphar@cyphar.com
|
E: cyphar@cyphar.com
|
||||||
W: https://www.cyphar.com/
|
W: https://www.cyphar.com/
|
||||||
D: `pids` cgroup subsystem
|
D: /sys/fs/cgroup/pids
|
||||||
|
D: openat2(2)
|
||||||
|
S: Sydney, Australia
|
||||||
|
|
||||||
N: Dipankar Sarma
|
N: Dipankar Sarma
|
||||||
E: dipankar@in.ibm.com
|
E: dipankar@in.ibm.com
|
||||||
|
26
Documentation/ABI/obsolete/sysfs-selinux-disable
Normal file
26
Documentation/ABI/obsolete/sysfs-selinux-disable
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
What: /sys/fs/selinux/disable
|
||||||
|
Date: April 2005 (predates git)
|
||||||
|
KernelVersion: 2.6.12-rc2 (predates git)
|
||||||
|
Contact: selinux@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
|
||||||
|
The selinuxfs "disable" node allows SELinux to be disabled at runtime
|
||||||
|
prior to a policy being loaded into the kernel. If disabled via this
|
||||||
|
mechanism, SELinux will remain disabled until the system is rebooted.
|
||||||
|
|
||||||
|
The preferred method of disabling SELinux is via the "selinux=0" boot
|
||||||
|
parameter, but the selinuxfs "disable" node was created to make it
|
||||||
|
easier for systems with primitive bootloaders that did not allow for
|
||||||
|
easy modification of the kernel command line. Unfortunately, allowing
|
||||||
|
for SELinux to be disabled at runtime makes it difficult to secure the
|
||||||
|
kernel's LSM hooks using the "__ro_after_init" feature.
|
||||||
|
|
||||||
|
Thankfully, the need for the SELinux runtime disable appears to be
|
||||||
|
gone, the default Kconfig configuration disables this selinuxfs node,
|
||||||
|
and only one of the major distributions, Fedora, supports disabling
|
||||||
|
SELinux at runtime. Fedora is in the process of removing the
|
||||||
|
selinuxfs "disable" node and once that is complete we will start the
|
||||||
|
slow process of removing this code from the kernel.
|
||||||
|
|
||||||
|
More information on /sys/fs/selinux/disable can be found under the
|
||||||
|
CONFIG_SECURITY_SELINUX_DISABLE Kconfig option.
|
171
Documentation/ABI/stable/sysfs-driver-dma-idxd
Normal file
171
Documentation/ABI/stable/sysfs-driver-dma-idxd
Normal file
@@ -0,0 +1,171 @@
|
|||||||
|
What: sys/bus/dsa/devices/dsa<m>/cdev_major
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The major number that the character device driver assigned to
|
||||||
|
this device.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/dsa<m>/errors
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The error information for this device.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/dsa<m>/max_batch_size
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The largest number of work descriptors in a batch.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/dsa<m>/max_work_queues_size
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The maximum work queue size supported by this device.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/dsa<m>/max_engines
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The maximum number of engines supported by this device.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/dsa<m>/max_groups
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The maximum number of groups that can be created under this device.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/dsa<m>/max_tokens
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The total number of bandwidth tokens supported by this device.
|
||||||
|
The bandwidth tokens represent resources within the DSA
|
||||||
|
implementation, and these resources are allocated by engines to
|
||||||
|
support operations.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/dsa<m>/max_transfer_size
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The number of bytes to be read from the source address to
|
||||||
|
perform the operation. The maximum transfer size is dependent on
|
||||||
|
the workqueue the descriptor was submitted to.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/dsa<m>/max_work_queues
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The maximum work queue number that this device supports.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/dsa<m>/numa_node
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The numa node number for this device.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/dsa<m>/op_cap
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The operation capability bit mask specifies the operation types
|
||||||
|
supported by this device.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/dsa<m>/state
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The state information of this device. It can be either enabled
|
||||||
|
or disabled.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/dsa<m>/group<m>.<n>
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The assigned group under this device.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/dsa<m>/engine<m>.<n>
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The assigned engine under this device.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/dsa<m>/wq<m>.<n>
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The assigned work queue under this device.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/dsa<m>/configurable
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: To indicate if this device is configurable or not.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/dsa<m>/token_limit
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The maximum number of bandwidth tokens that may be in use at
|
||||||
|
one time by operations that access low bandwidth memory in the
|
||||||
|
device.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/wq<m>.<n>/group_id
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The group id that this work queue belongs to.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/wq<m>.<n>/size
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The work queue size for this work queue.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/wq<m>.<n>/type
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The type of this work queue, it can be "kernel" type for work
|
||||||
|
queue usages in the kernel space or "user" type for work queue
|
||||||
|
usages by applications in user space.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/wq<m>.<n>/cdev_minor
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The minor number assigned to this work queue by the character
|
||||||
|
device driver.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/wq<m>.<n>/mode
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The work queue mode type for this work queue.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/wq<m>.<n>/priority
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The priority value of this work queue, it is a value relative to
|
||||||
|
other work queues in the same group to control quality of service
|
||||||
|
for dispatching work from multiple workqueues in the same group.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/wq<m>.<n>/state
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The current state of the work queue.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/wq<m>.<n>/threshold
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The number of entries in this work queue that may be filled
|
||||||
|
via a limited portal.
|
||||||
|
|
||||||
|
What: sys/bus/dsa/devices/engine<m>.<n>/group_id
|
||||||
|
Date: Oct 25, 2019
|
||||||
|
KernelVersion: 5.6.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: The group that this engine belongs to.
|
@@ -16,6 +16,10 @@ Description:
|
|||||||
write UDC's name found in /sys/class/udc/*
|
write UDC's name found in /sys/class/udc/*
|
||||||
to bind a gadget, empty string "" to unbind.
|
to bind a gadget, empty string "" to unbind.
|
||||||
|
|
||||||
|
max_speed - maximum speed the driver supports. Valid
|
||||||
|
names are super-speed-plus, super-speed,
|
||||||
|
high-speed, full-speed, and low-speed.
|
||||||
|
|
||||||
bDeviceClass - USB device class code
|
bDeviceClass - USB device class code
|
||||||
bDeviceSubClass - USB device subclass code
|
bDeviceSubClass - USB device subclass code
|
||||||
bDeviceProtocol - USB device protocol code
|
bDeviceProtocol - USB device protocol code
|
||||||
|
@@ -25,11 +25,11 @@ Description:
|
|||||||
lsm: [[subj_user=] [subj_role=] [subj_type=]
|
lsm: [[subj_user=] [subj_role=] [subj_type=]
|
||||||
[obj_user=] [obj_role=] [obj_type=]]
|
[obj_user=] [obj_role=] [obj_type=]]
|
||||||
option: [[appraise_type=]] [template=] [permit_directio]
|
option: [[appraise_type=]] [template=] [permit_directio]
|
||||||
[appraise_flag=]
|
[appraise_flag=] [keyrings=]
|
||||||
base: func:= [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK]
|
base: func:= [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK]
|
||||||
[FIRMWARE_CHECK]
|
[FIRMWARE_CHECK]
|
||||||
[KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK]
|
[KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK]
|
||||||
[KEXEC_CMDLINE]
|
[KEXEC_CMDLINE] [KEY_CHECK]
|
||||||
mask:= [[^]MAY_READ] [[^]MAY_WRITE] [[^]MAY_APPEND]
|
mask:= [[^]MAY_READ] [[^]MAY_WRITE] [[^]MAY_APPEND]
|
||||||
[[^]MAY_EXEC]
|
[[^]MAY_EXEC]
|
||||||
fsmagic:= hex value
|
fsmagic:= hex value
|
||||||
@@ -42,6 +42,9 @@ Description:
|
|||||||
appraise_flag:= [check_blacklist]
|
appraise_flag:= [check_blacklist]
|
||||||
Currently, blacklist check is only for files signed with appended
|
Currently, blacklist check is only for files signed with appended
|
||||||
signature.
|
signature.
|
||||||
|
keyrings:= list of keyrings
|
||||||
|
(eg, .builtin_trusted_keys|.ima). Only valid
|
||||||
|
when action is "measure" and func is KEY_CHECK.
|
||||||
template:= name of a defined IMA template type
|
template:= name of a defined IMA template type
|
||||||
(eg, ima-ng). Only valid when action is "measure".
|
(eg, ima-ng). Only valid when action is "measure".
|
||||||
pcr:= decimal value
|
pcr:= decimal value
|
||||||
@@ -113,3 +116,12 @@ Description:
|
|||||||
Example of appraise rule allowing modsig appended signatures:
|
Example of appraise rule allowing modsig appended signatures:
|
||||||
|
|
||||||
appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig
|
appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig
|
||||||
|
|
||||||
|
Example of measure rule using KEY_CHECK to measure all keys:
|
||||||
|
|
||||||
|
measure func=KEY_CHECK
|
||||||
|
|
||||||
|
Example of measure rule using KEY_CHECK to only measure
|
||||||
|
keys added to .builtin_trusted_keys or .ima keyring:
|
||||||
|
|
||||||
|
measure func=KEY_CHECK keyrings=.builtin_trusted_keys|.ima
|
||||||
|
@@ -33,6 +33,14 @@ Description:
|
|||||||
Requires a separate RTC_PIE_ON call to enable the periodic
|
Requires a separate RTC_PIE_ON call to enable the periodic
|
||||||
interrupts.
|
interrupts.
|
||||||
|
|
||||||
|
* RTC_VL_READ: Read the voltage inputs status of the RTC when
|
||||||
|
supported. The value is a bit field of RTC_VL_*, giving the
|
||||||
|
status of the main and backup voltages.
|
||||||
|
|
||||||
|
* RTC_VL_CLEAR: Clear the voltage status of the RTC. Some RTCs
|
||||||
|
need user interaction when the backup power provider is
|
||||||
|
replaced or charged to be able to clear the status.
|
||||||
|
|
||||||
The ioctl() calls supported by the older /dev/rtc interface are
|
The ioctl() calls supported by the older /dev/rtc interface are
|
||||||
also supported by the newer RTC class framework. However,
|
also supported by the newer RTC class framework. However,
|
||||||
because the chips and systems are not standardized, some PC/AT
|
because the chips and systems are not standardized, some PC/AT
|
||||||
|
@@ -1726,3 +1726,16 @@ Contact: linux-iio@vger.kernel.org
|
|||||||
Description:
|
Description:
|
||||||
List of valid periods (in seconds) for which the light intensity
|
List of valid periods (in seconds) for which the light intensity
|
||||||
must be above the threshold level before interrupt is asserted.
|
must be above the threshold level before interrupt is asserted.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_filter_notch_center_frequency
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Center frequency in Hz for a notch filter. Used e.g. for line
|
||||||
|
noise suppression.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_temp_thermocouple_type
|
||||||
|
KernelVersion: 5.5
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
One of the following thermocouple types: B, E, J, K, N, R, S, T.
|
||||||
|
19
Documentation/ABI/testing/sysfs-bus-iio-dma-buffer
Normal file
19
Documentation/ABI/testing/sysfs-bus-iio-dma-buffer
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
What: /sys/bus/iio/devices/iio:deviceX/buffer/length_align_bytes
|
||||||
|
KernelVersion: 5.4
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
DMA buffers tend to have an alignment requirement for the
|
||||||
|
buffers. If this alignment requirement is not met samples might
|
||||||
|
be dropped from the buffer.
|
||||||
|
|
||||||
|
This property reports the alignment requirements in bytes.
|
||||||
|
This means that the buffer size in bytes needs to be an integer
|
||||||
|
multiple of the number reported by this file.
|
||||||
|
|
||||||
|
The alignment requirements in number of sample sets will depend
|
||||||
|
on the enabled channels and the bytes per channel. This means
|
||||||
|
that the alignment requirement in sample sets might change
|
||||||
|
depending on which and how many channels are enabled. Whereas
|
||||||
|
the alignment requirement reported in bytes by this property
|
||||||
|
will remain static and does not depend on which channels are
|
||||||
|
enabled.
|
63
Documentation/ABI/testing/sysfs-bus-mdio
Normal file
63
Documentation/ABI/testing/sysfs-bus-mdio
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
What: /sys/bus/mdio_bus/devices/.../statistics/
|
||||||
|
Date: January 2020
|
||||||
|
KernelVersion: 5.6
|
||||||
|
Contact: netdev@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
This folder contains global statistics and per
|
||||||
|
MDIO bus address statistics.
|
||||||
|
|
||||||
|
What: /sys/bus/mdio_bus/devices/.../statistics/transfers
|
||||||
|
Date: January 2020
|
||||||
|
KernelVersion: 5.6
|
||||||
|
Contact: netdev@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Total number of transfers for this MDIO bus.
|
||||||
|
|
||||||
|
What: /sys/bus/mdio_bus/devices/.../statistics/errors
|
||||||
|
Date: January 2020
|
||||||
|
KernelVersion: 5.6
|
||||||
|
Contact: netdev@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Total number of transfer errors for this MDIO bus.
|
||||||
|
|
||||||
|
What: /sys/bus/mdio_bus/devices/.../statistics/writes
|
||||||
|
Date: January 2020
|
||||||
|
KernelVersion: 5.6
|
||||||
|
Contact: netdev@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Total number of write transactions for this MDIO bus.
|
||||||
|
|
||||||
|
What: /sys/bus/mdio_bus/devices/.../statistics/reads
|
||||||
|
Date: January 2020
|
||||||
|
KernelVersion: 5.6
|
||||||
|
Contact: netdev@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Total number of read transactions for this MDIO bus.
|
||||||
|
|
||||||
|
What: /sys/bus/mdio_bus/devices/.../statistics/transfers_<addr>
|
||||||
|
Date: January 2020
|
||||||
|
KernelVersion: 5.6
|
||||||
|
Contact: netdev@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Total number of transfers for this MDIO bus address.
|
||||||
|
|
||||||
|
What: /sys/bus/mdio_bus/devices/.../statistics/errors_<addr>
|
||||||
|
Date: January 2020
|
||||||
|
KernelVersion: 5.6
|
||||||
|
Contact: netdev@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Total number of transfer errors for this MDIO bus address.
|
||||||
|
|
||||||
|
What: /sys/bus/mdio_bus/devices/.../statistics/writes_<addr>
|
||||||
|
Date: January 2020
|
||||||
|
KernelVersion: 5.6
|
||||||
|
Contact: netdev@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Total number of write transactions for this MDIO bus address.
|
||||||
|
|
||||||
|
What: /sys/bus/mdio_bus/devices/.../statistics/reads_<addr>
|
||||||
|
Date: January 2020
|
||||||
|
KernelVersion: 5.6
|
||||||
|
Contact: netdev@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Total number of read transactions for this MDIO bus address.
|
@@ -7,6 +7,13 @@ Description:
|
|||||||
The name of devfreq object denoted as ... is same as the
|
The name of devfreq object denoted as ... is same as the
|
||||||
name of device using devfreq.
|
name of device using devfreq.
|
||||||
|
|
||||||
|
What: /sys/class/devfreq/.../name
|
||||||
|
Date: November 2019
|
||||||
|
Contact: Chanwoo Choi <cw00.choi@samsung.com>
|
||||||
|
Description:
|
||||||
|
The /sys/class/devfreq/.../name shows the name of device
|
||||||
|
of the corresponding devfreq object.
|
||||||
|
|
||||||
What: /sys/class/devfreq/.../governor
|
What: /sys/class/devfreq/.../governor
|
||||||
Date: September 2011
|
Date: September 2011
|
||||||
Contact: MyungJoo Ham <myungjoo.ham@samsung.com>
|
Contact: MyungJoo Ham <myungjoo.ham@samsung.com>
|
||||||
@@ -48,12 +55,15 @@ What: /sys/class/devfreq/.../trans_stat
|
|||||||
Date: October 2012
|
Date: October 2012
|
||||||
Contact: MyungJoo Ham <myungjoo.ham@samsung.com>
|
Contact: MyungJoo Ham <myungjoo.ham@samsung.com>
|
||||||
Description:
|
Description:
|
||||||
This ABI shows the statistics of devfreq behavior on a
|
This ABI shows or clears the statistics of devfreq behavior
|
||||||
specific device. It shows the time spent in each state and
|
on a specific device. It shows the time spent in each state
|
||||||
the number of transitions between states.
|
and the number of transitions between states.
|
||||||
In order to activate this ABI, the devfreq target device
|
In order to activate this ABI, the devfreq target device
|
||||||
driver should provide the list of available frequencies
|
driver should provide the list of available frequencies
|
||||||
with its profile.
|
with its profile. To reset the statistics of devfreq
|
||||||
|
behavior on a specific device, write 0 (zero) to 'trans_stat'
|
||||||
|
as follows:
|
||||||
|
echo 0 > /sys/class/devfreq/.../trans_stat
|
||||||
|
|
||||||
What: /sys/class/devfreq/.../userspace/set_freq
|
What: /sys/class/devfreq/.../userspace/set_freq
|
||||||
Date: September 2011
|
Date: September 2011
|
||||||
|
@@ -189,7 +189,8 @@ Description:
|
|||||||
Access: Read
|
Access: Read
|
||||||
Valid values: "Unknown", "Good", "Overheat", "Dead",
|
Valid values: "Unknown", "Good", "Overheat", "Dead",
|
||||||
"Over voltage", "Unspecified failure", "Cold",
|
"Over voltage", "Unspecified failure", "Cold",
|
||||||
"Watchdog timer expire", "Safety timer expire"
|
"Watchdog timer expire", "Safety timer expire",
|
||||||
|
"Over current"
|
||||||
|
|
||||||
What: /sys/class/power_supply/<supply_name>/precharge_current
|
What: /sys/class/power_supply/<supply_name>/precharge_current
|
||||||
Date: June 2017
|
Date: June 2017
|
||||||
|
@@ -196,6 +196,12 @@ Description:
|
|||||||
does not reflect it. Likewise, if one enables a deep state but a
|
does not reflect it. Likewise, if one enables a deep state but a
|
||||||
lighter state still is disabled, then this has no effect.
|
lighter state still is disabled, then this has no effect.
|
||||||
|
|
||||||
|
What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/default_status
|
||||||
|
Date: December 2019
|
||||||
|
KernelVersion: v5.6
|
||||||
|
Contact: Linux power management list <linux-pm@vger.kernel.org>
|
||||||
|
Description:
|
||||||
|
(RO) The default status of this state, "enabled" or "disabled".
|
||||||
|
|
||||||
What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/residency
|
What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/residency
|
||||||
Date: March 2014
|
Date: March 2014
|
||||||
|
@@ -11,3 +11,16 @@ Description:
|
|||||||
#echo 00:19.0-E0:2:FF > /sys/bus/pci/drivers/pciback/quirks
|
#echo 00:19.0-E0:2:FF > /sys/bus/pci/drivers/pciback/quirks
|
||||||
will allow the guest to read and write to the configuration
|
will allow the guest to read and write to the configuration
|
||||||
register 0x0E.
|
register 0x0E.
|
||||||
|
|
||||||
|
What: /sys/bus/pci/drivers/pciback/allow_interrupt_control
|
||||||
|
Date: Jan 2020
|
||||||
|
KernelVersion: 5.6
|
||||||
|
Contact: xen-devel@lists.xenproject.org
|
||||||
|
Description:
|
||||||
|
List of devices which can have interrupt control flag (INTx,
|
||||||
|
MSI, MSI-X) set by a connected guest. It is meant to be set
|
||||||
|
only when the guest is a stubdomain hosting device model (qemu)
|
||||||
|
and the actual device is assigned to a HVM. It is not safe
|
||||||
|
(similar to permissive attribute) to set for devices assigned
|
||||||
|
to a PV guest. The device is automatically removed from this
|
||||||
|
list when the connected pcifront terminates.
|
||||||
|
@@ -25,3 +25,13 @@ Description:
|
|||||||
allocated without being in use. The time is in
|
allocated without being in use. The time is in
|
||||||
seconds, 0 means indefinitely long.
|
seconds, 0 means indefinitely long.
|
||||||
The default is 60 seconds.
|
The default is 60 seconds.
|
||||||
|
|
||||||
|
What: /sys/module/xen_blkback/parameters/buffer_squeeze_duration_ms
|
||||||
|
Date: December 2019
|
||||||
|
KernelVersion: 5.6
|
||||||
|
Contact: SeongJae Park <sjpark@amazon.de>
|
||||||
|
Description:
|
||||||
|
When memory pressure is reported to blkback this option
|
||||||
|
controls the duration in milliseconds that blkback will not
|
||||||
|
cache any page not backed by a grant mapping.
|
||||||
|
The default is 10ms.
|
||||||
|
@@ -1,37 +1,40 @@
|
|||||||
What: /sys/fs/f2fs/<disk>/gc_max_sleep_time
|
What: /sys/fs/f2fs/<disk>/gc_max_sleep_time
|
||||||
Date: July 2013
|
Date: July 2013
|
||||||
Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
|
Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
|
||||||
Description:
|
Description: Controls the maximum sleep time for gc_thread. Time
|
||||||
Controls the maximun sleep time for gc_thread. Time
|
is in milliseconds.
|
||||||
is in milliseconds.
|
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/gc_min_sleep_time
|
What: /sys/fs/f2fs/<disk>/gc_min_sleep_time
|
||||||
Date: July 2013
|
Date: July 2013
|
||||||
Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
|
Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
|
||||||
Description:
|
Description: Controls the minimum sleep time for gc_thread. Time
|
||||||
Controls the minimum sleep time for gc_thread. Time
|
is in milliseconds.
|
||||||
is in milliseconds.
|
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/gc_no_gc_sleep_time
|
What: /sys/fs/f2fs/<disk>/gc_no_gc_sleep_time
|
||||||
Date: July 2013
|
Date: July 2013
|
||||||
Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
|
Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
|
||||||
Description:
|
Description: Controls the default sleep time for gc_thread. Time
|
||||||
Controls the default sleep time for gc_thread. Time
|
is in milliseconds.
|
||||||
is in milliseconds.
|
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/gc_idle
|
What: /sys/fs/f2fs/<disk>/gc_idle
|
||||||
Date: July 2013
|
Date: July 2013
|
||||||
Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
|
Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
|
||||||
Description:
|
Description: Controls the victim selection policy for garbage collection.
|
||||||
Controls the victim selection policy for garbage collection.
|
Setting gc_idle = 0(default) will disable this option. Setting
|
||||||
|
gc_idle = 1 will select the Cost Benefit approach & setting
|
||||||
|
gc_idle = 2 will select the greedy approach.
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/reclaim_segments
|
What: /sys/fs/f2fs/<disk>/reclaim_segments
|
||||||
Date: October 2013
|
Date: October 2013
|
||||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||||
Description:
|
Description: This parameter controls the number of prefree segments to be
|
||||||
Controls the issue rate of segment discard commands.
|
reclaimed. If the number of prefree segments is larger than
|
||||||
|
the number of segments in the proportion to the percentage
|
||||||
|
over total volume size, f2fs tries to conduct checkpoint to
|
||||||
|
reclaim the prefree segments to free segments.
|
||||||
|
By default, 5% over total # of segments.
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/max_blkaddr
|
What: /sys/fs/f2fs/<disk>/main_blkaddr
|
||||||
Date: November 2019
|
Date: November 2019
|
||||||
Contact: "Ramon Pantin" <pantin@google.com>
|
Contact: "Ramon Pantin" <pantin@google.com>
|
||||||
Description:
|
Description:
|
||||||
@@ -40,227 +43,278 @@ Description:
|
|||||||
What: /sys/fs/f2fs/<disk>/ipu_policy
|
What: /sys/fs/f2fs/<disk>/ipu_policy
|
||||||
Date: November 2013
|
Date: November 2013
|
||||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||||
Description:
|
Description: Controls the in-place-update policy.
|
||||||
Controls the in-place-update policy.
|
It governs in-place updates in f2fs. User can set:
|
||||||
|
0x01: F2FS_IPU_FORCE, 0x02: F2FS_IPU_SSR,
|
||||||
|
0x04: F2FS_IPU_UTIL, 0x08: F2FS_IPU_SSR_UTIL,
|
||||||
|
0x10: F2FS_IPU_FSYNC, 0x20: F2FS_IPU_ASYNC,
|
||||||
|
0x40: F2FS_IPU_NOCACHE.
|
||||||
|
Refer to segment.h for details.
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/min_ipu_util
|
What: /sys/fs/f2fs/<disk>/min_ipu_util
|
||||||
Date: November 2013
|
Date: November 2013
|
||||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||||
Description:
|
Description: Controls the FS utilization condition for the in-place-update
|
||||||
Controls the FS utilization condition for the in-place-update
|
policies. It is used by F2FS_IPU_UTIL and F2FS_IPU_SSR_UTIL policies.
|
||||||
policies.
|
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/min_fsync_blocks
|
What: /sys/fs/f2fs/<disk>/min_fsync_blocks
|
||||||
Date: September 2014
|
Date: September 2014
|
||||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||||
Description:
|
Description: Controls the dirty page count condition for the in-place-update
|
||||||
Controls the dirty page count condition for the in-place-update
|
policies.
|
||||||
policies.
|
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/min_seq_blocks
|
What: /sys/fs/f2fs/<disk>/min_seq_blocks
|
||||||
Date: August 2018
|
Date: August 2018
|
||||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||||
Description:
|
Description: Controls the dirty page count condition for batched sequential
|
||||||
Controls the dirty page count condition for batched sequential
|
writes in writepages.
|
||||||
writes in ->writepages.
|
|
||||||
|
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/min_hot_blocks
|
What: /sys/fs/f2fs/<disk>/min_hot_blocks
|
||||||
Date: March 2017
|
Date: March 2017
|
||||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||||
Description:
|
Description: Controls the dirty page count condition for redefining hot data.
|
||||||
Controls the dirty page count condition for redefining hot data.
|
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/min_ssr_sections
|
What: /sys/fs/f2fs/<disk>/min_ssr_sections
|
||||||
Date: October 2017
|
Date: October 2017
|
||||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||||
Description:
|
Description: Controls the free section threshold to trigger SSR allocation.
|
||||||
Controls the fee section threshold to trigger SSR allocation.
|
If this is large, SSR mode will be enabled early.
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/max_small_discards
|
What: /sys/fs/f2fs/<disk>/max_small_discards
|
||||||
Date: November 2013
|
Date: November 2013
|
||||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||||
Description:
|
Description: Controls the issue rate of discard commands that consist of small
|
||||||
Controls the issue rate of small discard commands.
|
blocks less than 2MB. The candidates to be discarded are cached until
|
||||||
|
checkpoint is triggered, and issued during the checkpoint.
|
||||||
|
By default, it is disabled with 0.
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/discard_granularity
|
What: /sys/fs/f2fs/<disk>/discard_granularity
|
||||||
Date: July 2017
|
Date: July 2017
|
||||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||||
Description:
|
Description: Controls discard granularity of inner discard thread. Inner thread
|
||||||
Controls discard granularity of inner discard thread, inner thread
|
|
||||||
will not issue discards with size that is smaller than granularity.
|
will not issue discards with size that is smaller than granularity.
|
||||||
The unit size is one block, now only support configuring in range
|
The unit size is one block(4KB), now only support configuring
|
||||||
of [1, 512].
|
in range of [1, 512]. Default value is 4(=16KB).
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/umount_discard_timeout
|
What: /sys/fs/f2fs/<disk>/umount_discard_timeout
|
||||||
Date: January 2019
|
Date: January 2019
|
||||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||||
Description:
|
Description: Set timeout to issue discard commands during umount.
|
||||||
Set timeout to issue discard commands during umount.
|
Default: 5 secs
|
||||||
Default: 5 secs
|
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/max_victim_search
|
What: /sys/fs/f2fs/<disk>/max_victim_search
|
||||||
Date: January 2014
|
Date: January 2014
|
||||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||||
Description:
|
Description: Controls the number of trials to find a victim segment
|
||||||
Controls the number of trials to find a victim segment.
|
when conducting SSR and cleaning operations. The default value
|
||||||
|
is 4096 which covers 8GB block address range.
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/migration_granularity
|
What: /sys/fs/f2fs/<disk>/migration_granularity
|
||||||
Date: October 2018
|
Date: October 2018
|
||||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||||
Description:
|
Description: Controls migration granularity of garbage collection on large
|
||||||
Controls migration granularity of garbage collection on large
|
section, it can let GC move partial segment{s} of one section
|
||||||
section, it can let GC move partial segment{s} of one section
|
in one GC cycle, so that dispersing heavy overhead GC to
|
||||||
in one GC cycle, so that dispersing heavy overhead GC to
|
multiple lightweight one.
|
||||||
multiple lightweight one.
|
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/dir_level
|
What: /sys/fs/f2fs/<disk>/dir_level
|
||||||
Date: March 2014
|
Date: March 2014
|
||||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||||
Description:
|
Description: Controls the directory level for large directory. If a
|
||||||
Controls the directory level for large directory.
|
directory has a number of files, it can reduce the file lookup
|
||||||
|
latency by increasing this dir_level value. Otherwise, it
|
||||||
|
needs to decrease this value to reduce the space overhead.
|
||||||
|
The default value is 0.
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/ram_thresh
|
What: /sys/fs/f2fs/<disk>/ram_thresh
|
||||||
Date: March 2014
|
Date: March 2014
|
||||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||||
Description:
|
Description: Controls the memory footprint used by free nids and cached
|
||||||
Controls the memory footprint used by f2fs.
|
nat entries. By default, 1 is set, which indicates
|
||||||
|
10 MB / 1 GB RAM.
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/batched_trim_sections
|
What: /sys/fs/f2fs/<disk>/batched_trim_sections
|
||||||
Date: February 2015
|
Date: February 2015
|
||||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||||
Description:
|
Description: Controls the trimming rate in batch mode.
|
||||||
Controls the trimming rate in batch mode.
|
<deprecated>
|
||||||
<deprecated>
|
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/cp_interval
|
What: /sys/fs/f2fs/<disk>/cp_interval
|
||||||
Date: October 2015
|
Date: October 2015
|
||||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||||
Description:
|
Description: Controls the checkpoint timing, set to 60 seconds by default.
|
||||||
Controls the checkpoint timing.
|
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/idle_interval
|
What: /sys/fs/f2fs/<disk>/idle_interval
|
||||||
Date: January 2016
|
Date: January 2016
|
||||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||||
Description:
|
Description: Controls the idle timing of system, if there is no FS operation
|
||||||
Controls the idle timing for all paths other than
|
during given interval.
|
||||||
discard and gc path.
|
Set to 5 seconds by default.
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/discard_idle_interval
|
What: /sys/fs/f2fs/<disk>/discard_idle_interval
|
||||||
Date: September 2018
|
Date: September 2018
|
||||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||||
Contact: "Sahitya Tummala" <stummala@codeaurora.org>
|
Contact: "Sahitya Tummala" <stummala@codeaurora.org>
|
||||||
Description:
|
Description: Controls the idle timing of discard thread given
|
||||||
Controls the idle timing for discard path.
|
this time interval.
|
||||||
|
Default is 5 secs.
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/gc_idle_interval
|
What: /sys/fs/f2fs/<disk>/gc_idle_interval
|
||||||
Date: September 2018
|
Date: September 2018
|
||||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||||
Contact: "Sahitya Tummala" <stummala@codeaurora.org>
|
Contact: "Sahitya Tummala" <stummala@codeaurora.org>
|
||||||
Description:
|
Description: Controls the idle timing for gc path. Set to 5 seconds by default.
|
||||||
Controls the idle timing for gc path.
|
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/iostat_enable
|
What: /sys/fs/f2fs/<disk>/iostat_enable
|
||||||
Date: August 2017
|
Date: August 2017
|
||||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||||
Description:
|
Description: Controls to enable/disable IO stat.
|
||||||
Controls to enable/disable IO stat.
|
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/ra_nid_pages
|
What: /sys/fs/f2fs/<disk>/ra_nid_pages
|
||||||
Date: October 2015
|
Date: October 2015
|
||||||
Contact: "Chao Yu" <chao2.yu@samsung.com>
|
Contact: "Chao Yu" <chao2.yu@samsung.com>
|
||||||
Description:
|
Description: Controls the count of nid pages to be readaheaded.
|
||||||
Controls the count of nid pages to be readaheaded.
|
When building free nids, F2FS reads NAT blocks ahead for
|
||||||
|
speed up. Default is 0.
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/dirty_nats_ratio
|
What: /sys/fs/f2fs/<disk>/dirty_nats_ratio
|
||||||
Date: January 2016
|
Date: January 2016
|
||||||
Contact: "Chao Yu" <chao2.yu@samsung.com>
|
Contact: "Chao Yu" <chao2.yu@samsung.com>
|
||||||
Description:
|
Description: Controls dirty nat entries ratio threshold, if current
|
||||||
Controls dirty nat entries ratio threshold, if current
|
ratio exceeds configured threshold, checkpoint will
|
||||||
ratio exceeds configured threshold, checkpoint will
|
be triggered for flushing dirty nat entries.
|
||||||
be triggered for flushing dirty nat entries.
|
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/lifetime_write_kbytes
|
What: /sys/fs/f2fs/<disk>/lifetime_write_kbytes
|
||||||
Date: January 2016
|
Date: January 2016
|
||||||
Contact: "Shuoran Liu" <liushuoran@huawei.com>
|
Contact: "Shuoran Liu" <liushuoran@huawei.com>
|
||||||
Description:
|
Description: Shows total written kbytes issued to disk.
|
||||||
Shows total written kbytes issued to disk.
|
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/features
|
What: /sys/fs/f2fs/<disk>/features
|
||||||
Date: July 2017
|
Date: July 2017
|
||||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||||
Description:
|
Description: Shows all enabled features in current device.
|
||||||
Shows all enabled features in current device.
|
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/inject_rate
|
What: /sys/fs/f2fs/<disk>/inject_rate
|
||||||
Date: May 2016
|
Date: May 2016
|
||||||
Contact: "Sheng Yong" <shengyong1@huawei.com>
|
Contact: "Sheng Yong" <shengyong1@huawei.com>
|
||||||
Description:
|
Description: Controls the injection rate of arbitrary faults.
|
||||||
Controls the injection rate.
|
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/inject_type
|
What: /sys/fs/f2fs/<disk>/inject_type
|
||||||
Date: May 2016
|
Date: May 2016
|
||||||
Contact: "Sheng Yong" <shengyong1@huawei.com>
|
Contact: "Sheng Yong" <shengyong1@huawei.com>
|
||||||
Description:
|
Description: Controls the injection type of arbitrary faults.
|
||||||
Controls the injection type.
|
|
||||||
|
What: /sys/fs/f2fs/<disk>/dirty_segments
|
||||||
|
Date: October 2017
|
||||||
|
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||||
|
Description: Shows the number of dirty segments.
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/reserved_blocks
|
What: /sys/fs/f2fs/<disk>/reserved_blocks
|
||||||
Date: June 2017
|
Date: June 2017
|
||||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||||
Description:
|
Description: Controls target reserved blocks in system, the threshold
|
||||||
Controls target reserved blocks in system, the threshold
|
is soft, it could exceed current available user space.
|
||||||
is soft, it could exceed current available user space.
|
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/current_reserved_blocks
|
What: /sys/fs/f2fs/<disk>/current_reserved_blocks
|
||||||
Date: October 2017
|
Date: October 2017
|
||||||
Contact: "Yunlong Song" <yunlong.song@huawei.com>
|
Contact: "Yunlong Song" <yunlong.song@huawei.com>
|
||||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||||
Description:
|
Description: Shows current reserved blocks in system, it may be temporarily
|
||||||
Shows current reserved blocks in system, it may be temporarily
|
smaller than target_reserved_blocks, but will gradually
|
||||||
smaller than target_reserved_blocks, but will gradually
|
increase to target_reserved_blocks when more free blocks are
|
||||||
increase to target_reserved_blocks when more free blocks are
|
freed by user later.
|
||||||
freed by user later.
|
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/gc_urgent
|
What: /sys/fs/f2fs/<disk>/gc_urgent
|
||||||
Date: August 2017
|
Date: August 2017
|
||||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||||
Description:
|
Description: Do background GC aggressively when set. When gc_urgent = 1,
|
||||||
Do background GC aggressively
|
background thread starts to do GC by given gc_urgent_sleep_time
|
||||||
|
interval. It is set to 0 by default.
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/gc_urgent_sleep_time
|
What: /sys/fs/f2fs/<disk>/gc_urgent_sleep_time
|
||||||
Date: August 2017
|
Date: August 2017
|
||||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||||
Description:
|
Description: Controls sleep time of GC urgent mode. Set to 500ms by default.
|
||||||
Controls sleep time of GC urgent mode
|
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/readdir_ra
|
What: /sys/fs/f2fs/<disk>/readdir_ra
|
||||||
Date: November 2017
|
Date: November 2017
|
||||||
Contact: "Sheng Yong" <shengyong1@huawei.com>
|
Contact: "Sheng Yong" <shengyong1@huawei.com>
|
||||||
Description:
|
Description: Controls readahead inode block in readdir. Enabled by default.
|
||||||
Controls readahead inode block in readdir.
|
|
||||||
|
What: /sys/fs/f2fs/<disk>/gc_pin_file_thresh
|
||||||
|
Date: January 2018
|
||||||
|
Contact: Jaegeuk Kim <jaegeuk@kernel.org>
|
||||||
|
Description: This indicates how many GC can be failed for the pinned
|
||||||
|
file. If it exceeds this, F2FS doesn't guarantee its pinning
|
||||||
|
state. 2048 trials is set by default.
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/extension_list
|
What: /sys/fs/f2fs/<disk>/extension_list
|
||||||
Date: February 2018
|
Date: February 2018
|
||||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||||
Description:
|
Description: Used to control configure extension list:
|
||||||
Used to control configure extension list:
|
- Query: cat /sys/fs/f2fs/<disk>/extension_list
|
||||||
- Query: cat /sys/fs/f2fs/<disk>/extension_list
|
- Add: echo '[h/c]extension' > /sys/fs/f2fs/<disk>/extension_list
|
||||||
- Add: echo '[h/c]extension' > /sys/fs/f2fs/<disk>/extension_list
|
- Del: echo '[h/c]!extension' > /sys/fs/f2fs/<disk>/extension_list
|
||||||
- Del: echo '[h/c]!extension' > /sys/fs/f2fs/<disk>/extension_list
|
- [h] means add/del hot file extension
|
||||||
- [h] means add/del hot file extension
|
- [c] means add/del cold file extension
|
||||||
- [c] means add/del cold file extension
|
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/unusable
|
What: /sys/fs/f2fs/<disk>/unusable
|
||||||
Date: April 2019
|
Date: April 2019
|
||||||
Contact: "Daniel Rosenberg" <drosen@google.com>
|
Contact: "Daniel Rosenberg" <drosen@google.com>
|
||||||
Description:
|
Description: If checkpoint=disable, it displays the number of blocks that
|
||||||
If checkpoint=disable, it displays the number of blocks that are unusable.
|
are unusable.
|
||||||
If checkpoint=enable it displays the number of blocks that would be unusable
|
If checkpoint=enable it displays the number of blocks that
|
||||||
if checkpoint=disable were to be set.
|
would be unusable if checkpoint=disable were to be set.
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/encoding
|
What: /sys/fs/f2fs/<disk>/encoding
|
||||||
Date: July 2019
|
Date: July 2019
|
||||||
Contact: "Daniel Rosenberg" <drosen@google.com>
|
Contact: "Daniel Rosenberg" <drosen@google.com>
|
||||||
Description:
|
Description: Displays name and version of the encoding set for the filesystem.
|
||||||
Displays name and version of the encoding set for the filesystem.
|
If no encoding is set, displays (none)
|
||||||
If no encoding is set, displays (none)
|
|
||||||
|
What: /sys/fs/f2fs/<disk>/free_segments
|
||||||
|
Date: September 2019
|
||||||
|
Contact: "Hridya Valsaraju" <hridya@google.com>
|
||||||
|
Description: Number of free segments in disk.
|
||||||
|
|
||||||
|
What: /sys/fs/f2fs/<disk>/cp_foreground_calls
|
||||||
|
Date: September 2019
|
||||||
|
Contact: "Hridya Valsaraju" <hridya@google.com>
|
||||||
|
Description: Number of checkpoint operations performed on demand. Available when
|
||||||
|
CONFIG_F2FS_STAT_FS=y.
|
||||||
|
|
||||||
|
What: /sys/fs/f2fs/<disk>/cp_background_calls
|
||||||
|
Date: September 2019
|
||||||
|
Contact: "Hridya Valsaraju" <hridya@google.com>
|
||||||
|
Description: Number of checkpoint operations performed in the background to
|
||||||
|
free segments. Available when CONFIG_F2FS_STAT_FS=y.
|
||||||
|
|
||||||
|
What: /sys/fs/f2fs/<disk>/gc_foreground_calls
|
||||||
|
Date: September 2019
|
||||||
|
Contact: "Hridya Valsaraju" <hridya@google.com>
|
||||||
|
Description: Number of garbage collection operations performed on demand.
|
||||||
|
Available when CONFIG_F2FS_STAT_FS=y.
|
||||||
|
|
||||||
|
What: /sys/fs/f2fs/<disk>/gc_background_calls
|
||||||
|
Date: September 2019
|
||||||
|
Contact: "Hridya Valsaraju" <hridya@google.com>
|
||||||
|
Description: Number of garbage collection operations triggered in background.
|
||||||
|
Available when CONFIG_F2FS_STAT_FS=y.
|
||||||
|
|
||||||
|
What: /sys/fs/f2fs/<disk>/moved_blocks_foreground
|
||||||
|
Date: September 2019
|
||||||
|
Contact: "Hridya Valsaraju" <hridya@google.com>
|
||||||
|
Description: Number of blocks moved by garbage collection in foreground.
|
||||||
|
Available when CONFIG_F2FS_STAT_FS=y.
|
||||||
|
|
||||||
|
What: /sys/fs/f2fs/<disk>/moved_blocks_background
|
||||||
|
Date: September 2019
|
||||||
|
Contact: "Hridya Valsaraju" <hridya@google.com>
|
||||||
|
Description: Number of blocks moved by garbage collection in background.
|
||||||
|
Available when CONFIG_F2FS_STAT_FS=y.
|
||||||
|
|
||||||
|
What: /sys/fs/f2fs/<disk>/avg_vblocks
|
||||||
|
Date: September 2019
|
||||||
|
Contact: "Hridya Valsaraju" <hridya@google.com>
|
||||||
|
Description: Average number of valid blocks.
|
||||||
|
Available when CONFIG_F2FS_STAT_FS=y.
|
||||||
|
@@ -407,3 +407,16 @@ Contact: Kalesh Singh <kaleshsingh96@gmail.com>
|
|||||||
Description:
|
Description:
|
||||||
The /sys/power/suspend_stats/last_failed_step file contains
|
The /sys/power/suspend_stats/last_failed_step file contains
|
||||||
the last failed step in the suspend/resume path.
|
the last failed step in the suspend/resume path.
|
||||||
|
|
||||||
|
What: /sys/power/sync_on_suspend
|
||||||
|
Date: October 2019
|
||||||
|
Contact: Jonas Meurer <jonas@freesources.org>
|
||||||
|
Description:
|
||||||
|
This file controls whether or not the kernel will sync()
|
||||||
|
filesystems during system suspend (after freezing user space
|
||||||
|
and before suspending devices).
|
||||||
|
|
||||||
|
Writing a "1" to this file enables the sync() and writing a "0"
|
||||||
|
disables it. Reads from the file return the current value.
|
||||||
|
The default is "1" if the build-time "SUSPEND_SKIP_SYNC" config
|
||||||
|
flag is unset, or "0" otherwise.
|
||||||
|
46
Documentation/ABI/testing/usb-charger-uevent
Normal file
46
Documentation/ABI/testing/usb-charger-uevent
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
What: Raise a uevent when a USB charger is inserted or removed
|
||||||
|
Date: 2020-01-14
|
||||||
|
KernelVersion: 5.6
|
||||||
|
Contact: linux-usb@vger.kernel.org
|
||||||
|
Description: There are two USB charger states:
|
||||||
|
USB_CHARGER_ABSENT
|
||||||
|
USB_CHARGER_PRESENT
|
||||||
|
There are five USB charger types:
|
||||||
|
USB_CHARGER_UNKNOWN_TYPE: Charger type is unknown
|
||||||
|
USB_CHARGER_SDP_TYPE: Standard Downstream Port
|
||||||
|
USB_CHARGER_CDP_TYPE: Charging Downstream Port
|
||||||
|
USB_CHARGER_DCP_TYPE: Dedicated Charging Port
|
||||||
|
USB_CHARGER_ACA_TYPE: Accessory Charging Adapter
|
||||||
|
https://www.usb.org/document-library/battery-charging-v12-spec-and-adopters-agreement
|
||||||
|
|
||||||
|
Here are two examples taken using udevadm monitor -p when
|
||||||
|
USB charger is online:
|
||||||
|
UDEV change /devices/soc0/usbphynop1 (platform)
|
||||||
|
ACTION=change
|
||||||
|
DEVPATH=/devices/soc0/usbphynop1
|
||||||
|
DRIVER=usb_phy_generic
|
||||||
|
MODALIAS=of:Nusbphynop1T(null)Cusb-nop-xceiv
|
||||||
|
OF_COMPATIBLE_0=usb-nop-xceiv
|
||||||
|
OF_COMPATIBLE_N=1
|
||||||
|
OF_FULLNAME=/usbphynop1
|
||||||
|
OF_NAME=usbphynop1
|
||||||
|
SEQNUM=2493
|
||||||
|
SUBSYSTEM=platform
|
||||||
|
USB_CHARGER_STATE=USB_CHARGER_PRESENT
|
||||||
|
USB_CHARGER_TYPE=USB_CHARGER_SDP_TYPE
|
||||||
|
USEC_INITIALIZED=227422826
|
||||||
|
|
||||||
|
USB charger is offline:
|
||||||
|
KERNEL change /devices/soc0/usbphynop1 (platform)
|
||||||
|
ACTION=change
|
||||||
|
DEVPATH=/devices/soc0/usbphynop1
|
||||||
|
DRIVER=usb_phy_generic
|
||||||
|
MODALIAS=of:Nusbphynop1T(null)Cusb-nop-xceiv
|
||||||
|
OF_COMPATIBLE_0=usb-nop-xceiv
|
||||||
|
OF_COMPATIBLE_N=1
|
||||||
|
OF_FULLNAME=/usbphynop1
|
||||||
|
OF_NAME=usbphynop1
|
||||||
|
SEQNUM=2494
|
||||||
|
SUBSYSTEM=platform
|
||||||
|
USB_CHARGER_STATE=USB_CHARGER_ABSENT
|
||||||
|
USB_CHARGER_TYPE=USB_CHARGER_UNKNOWN_TYPE
|
@@ -283,5 +283,5 @@ or disabled (0). If 0 is found in any of the msi_bus files belonging
|
|||||||
to bridges between the PCI root and the device, MSIs are disabled.
|
to bridges between the PCI root and the device, MSIs are disabled.
|
||||||
|
|
||||||
It is also worth checking the device driver to see whether it supports MSIs.
|
It is also worth checking the device driver to see whether it supports MSIs.
|
||||||
For example, it may contain calls to pci_irq_alloc_vectors() with the
|
For example, it may contain calls to pci_alloc_irq_vectors() with the
|
||||||
PCI_IRQ_MSI or PCI_IRQ_MSIX flags.
|
PCI_IRQ_MSI or PCI_IRQ_MSIX flags.
|
||||||
|
@@ -1,4 +1,7 @@
|
|||||||
|
.. _NMI_rcu_doc:
|
||||||
|
|
||||||
Using RCU to Protect Dynamic NMI Handlers
|
Using RCU to Protect Dynamic NMI Handlers
|
||||||
|
=========================================
|
||||||
|
|
||||||
|
|
||||||
Although RCU is usually used to protect read-mostly data structures,
|
Although RCU is usually used to protect read-mostly data structures,
|
||||||
@@ -9,7 +12,7 @@ work in "arch/x86/oprofile/nmi_timer_int.c" and in
|
|||||||
"arch/x86/kernel/traps.c".
|
"arch/x86/kernel/traps.c".
|
||||||
|
|
||||||
The relevant pieces of code are listed below, each followed by a
|
The relevant pieces of code are listed below, each followed by a
|
||||||
brief explanation.
|
brief explanation::
|
||||||
|
|
||||||
static int dummy_nmi_callback(struct pt_regs *regs, int cpu)
|
static int dummy_nmi_callback(struct pt_regs *regs, int cpu)
|
||||||
{
|
{
|
||||||
@@ -18,12 +21,12 @@ brief explanation.
|
|||||||
|
|
||||||
The dummy_nmi_callback() function is a "dummy" NMI handler that does
|
The dummy_nmi_callback() function is a "dummy" NMI handler that does
|
||||||
nothing, but returns zero, thus saying that it did nothing, allowing
|
nothing, but returns zero, thus saying that it did nothing, allowing
|
||||||
the NMI handler to take the default machine-specific action.
|
the NMI handler to take the default machine-specific action::
|
||||||
|
|
||||||
static nmi_callback_t nmi_callback = dummy_nmi_callback;
|
static nmi_callback_t nmi_callback = dummy_nmi_callback;
|
||||||
|
|
||||||
This nmi_callback variable is a global function pointer to the current
|
This nmi_callback variable is a global function pointer to the current
|
||||||
NMI handler.
|
NMI handler::
|
||||||
|
|
||||||
void do_nmi(struct pt_regs * regs, long error_code)
|
void do_nmi(struct pt_regs * regs, long error_code)
|
||||||
{
|
{
|
||||||
@@ -53,11 +56,12 @@ anyway. However, in practice it is a good documentation aid, particularly
|
|||||||
for anyone attempting to do something similar on Alpha or on systems
|
for anyone attempting to do something similar on Alpha or on systems
|
||||||
with aggressive optimizing compilers.
|
with aggressive optimizing compilers.
|
||||||
|
|
||||||
Quick Quiz: Why might the rcu_dereference_sched() be necessary on Alpha,
|
Quick Quiz:
|
||||||
given that the code referenced by the pointer is read-only?
|
Why might the rcu_dereference_sched() be necessary on Alpha, given that the code referenced by the pointer is read-only?
|
||||||
|
|
||||||
|
:ref:`Answer to Quick Quiz <answer_quick_quiz_NMI>`
|
||||||
|
|
||||||
Back to the discussion of NMI and RCU...
|
Back to the discussion of NMI and RCU::
|
||||||
|
|
||||||
void set_nmi_callback(nmi_callback_t callback)
|
void set_nmi_callback(nmi_callback_t callback)
|
||||||
{
|
{
|
||||||
@@ -68,7 +72,7 @@ The set_nmi_callback() function registers an NMI handler. Note that any
|
|||||||
data that is to be used by the callback must be initialized up -before-
|
data that is to be used by the callback must be initialized up -before-
|
||||||
the call to set_nmi_callback(). On architectures that do not order
|
the call to set_nmi_callback(). On architectures that do not order
|
||||||
writes, the rcu_assign_pointer() ensures that the NMI handler sees the
|
writes, the rcu_assign_pointer() ensures that the NMI handler sees the
|
||||||
initialized values.
|
initialized values::
|
||||||
|
|
||||||
void unset_nmi_callback(void)
|
void unset_nmi_callback(void)
|
||||||
{
|
{
|
||||||
@@ -82,7 +86,7 @@ up any data structures used by the old NMI handler until execution
|
|||||||
of it completes on all other CPUs.
|
of it completes on all other CPUs.
|
||||||
|
|
||||||
One way to accomplish this is via synchronize_rcu(), perhaps as
|
One way to accomplish this is via synchronize_rcu(), perhaps as
|
||||||
follows:
|
follows::
|
||||||
|
|
||||||
unset_nmi_callback();
|
unset_nmi_callback();
|
||||||
synchronize_rcu();
|
synchronize_rcu();
|
||||||
@@ -98,24 +102,23 @@ to free up the handler's data as soon as synchronize_rcu() returns.
|
|||||||
Important note: for this to work, the architecture in question must
|
Important note: for this to work, the architecture in question must
|
||||||
invoke nmi_enter() and nmi_exit() on NMI entry and exit, respectively.
|
invoke nmi_enter() and nmi_exit() on NMI entry and exit, respectively.
|
||||||
|
|
||||||
|
.. _answer_quick_quiz_NMI:
|
||||||
|
|
||||||
Answer to Quick Quiz
|
Answer to Quick Quiz:
|
||||||
|
Why might the rcu_dereference_sched() be necessary on Alpha, given that the code referenced by the pointer is read-only?
|
||||||
|
|
||||||
Why might the rcu_dereference_sched() be necessary on Alpha, given
|
The caller to set_nmi_callback() might well have
|
||||||
that the code referenced by the pointer is read-only?
|
initialized some data that is to be used by the new NMI
|
||||||
|
handler. In this case, the rcu_dereference_sched() would
|
||||||
|
be needed, because otherwise a CPU that received an NMI
|
||||||
|
just after the new handler was set might see the pointer
|
||||||
|
to the new NMI handler, but the old pre-initialized
|
||||||
|
version of the handler's data.
|
||||||
|
|
||||||
Answer: The caller to set_nmi_callback() might well have
|
This same sad story can happen on other CPUs when using
|
||||||
initialized some data that is to be used by the new NMI
|
a compiler with aggressive pointer-value speculation
|
||||||
handler. In this case, the rcu_dereference_sched() would
|
optimizations.
|
||||||
be needed, because otherwise a CPU that received an NMI
|
|
||||||
just after the new handler was set might see the pointer
|
|
||||||
to the new NMI handler, but the old pre-initialized
|
|
||||||
version of the handler's data.
|
|
||||||
|
|
||||||
This same sad story can happen on other CPUs when using
|
More important, the rcu_dereference_sched() makes it
|
||||||
a compiler with aggressive pointer-value speculation
|
clear to someone reading the code that the pointer is
|
||||||
optimizations.
|
being protected by RCU-sched.
|
||||||
|
|
||||||
More important, the rcu_dereference_sched() makes it
|
|
||||||
clear to someone reading the code that the pointer is
|
|
||||||
being protected by RCU-sched.
|
|
@@ -1,19 +1,21 @@
|
|||||||
Using RCU to Protect Read-Mostly Arrays
|
.. _array_rcu_doc:
|
||||||
|
|
||||||
|
Using RCU to Protect Read-Mostly Arrays
|
||||||
|
=======================================
|
||||||
|
|
||||||
Although RCU is more commonly used to protect linked lists, it can
|
Although RCU is more commonly used to protect linked lists, it can
|
||||||
also be used to protect arrays. Three situations are as follows:
|
also be used to protect arrays. Three situations are as follows:
|
||||||
|
|
||||||
1. Hash Tables
|
1. :ref:`Hash Tables <hash_tables>`
|
||||||
|
|
||||||
2. Static Arrays
|
2. :ref:`Static Arrays <static_arrays>`
|
||||||
|
|
||||||
3. Resizeable Arrays
|
3. :ref:`Resizable Arrays <resizable_arrays>`
|
||||||
|
|
||||||
Each of these three situations involves an RCU-protected pointer to an
|
Each of these three situations involves an RCU-protected pointer to an
|
||||||
array that is separately indexed. It might be tempting to consider use
|
array that is separately indexed. It might be tempting to consider use
|
||||||
of RCU to instead protect the index into an array, however, this use
|
of RCU to instead protect the index into an array, however, this use
|
||||||
case is -not- supported. The problem with RCU-protected indexes into
|
case is **not** supported. The problem with RCU-protected indexes into
|
||||||
arrays is that compilers can play way too many optimization games with
|
arrays is that compilers can play way too many optimization games with
|
||||||
integers, which means that the rules governing handling of these indexes
|
integers, which means that the rules governing handling of these indexes
|
||||||
are far more trouble than they are worth. If RCU-protected indexes into
|
are far more trouble than they are worth. If RCU-protected indexes into
|
||||||
@@ -24,16 +26,20 @@ to be safely used.
|
|||||||
That aside, each of the three RCU-protected pointer situations is
|
That aside, each of the three RCU-protected pointer situations is
|
||||||
described in the following sections.
|
described in the following sections.
|
||||||
|
|
||||||
|
.. _hash_tables:
|
||||||
|
|
||||||
Situation 1: Hash Tables
|
Situation 1: Hash Tables
|
||||||
|
------------------------
|
||||||
|
|
||||||
Hash tables are often implemented as an array, where each array entry
|
Hash tables are often implemented as an array, where each array entry
|
||||||
has a linked-list hash chain. Each hash chain can be protected by RCU
|
has a linked-list hash chain. Each hash chain can be protected by RCU
|
||||||
as described in the listRCU.rst document. This approach also applies
|
as described in the listRCU.rst document. This approach also applies
|
||||||
to other array-of-list situations, such as radix trees.
|
to other array-of-list situations, such as radix trees.
|
||||||
|
|
||||||
|
.. _static_arrays:
|
||||||
|
|
||||||
Situation 2: Static Arrays
|
Situation 2: Static Arrays
|
||||||
|
--------------------------
|
||||||
|
|
||||||
Static arrays, where the data (rather than a pointer to the data) is
|
Static arrays, where the data (rather than a pointer to the data) is
|
||||||
located in each array element, and where the array is never resized,
|
located in each array element, and where the array is never resized,
|
||||||
@@ -41,13 +47,17 @@ have not been used with RCU. Rik van Riel recommends using seqlock in
|
|||||||
this situation, which would also have minimal read-side overhead as long
|
this situation, which would also have minimal read-side overhead as long
|
||||||
as updates are rare.
|
as updates are rare.
|
||||||
|
|
||||||
Quick Quiz: Why is it so important that updates be rare when
|
Quick Quiz:
|
||||||
using seqlock?
|
Why is it so important that updates be rare when using seqlock?
|
||||||
|
|
||||||
|
:ref:`Answer to Quick Quiz <answer_quick_quiz_seqlock>`
|
||||||
|
|
||||||
Situation 3: Resizeable Arrays
|
.. _resizable_arrays:
|
||||||
|
|
||||||
Use of RCU for resizeable arrays is demonstrated by the grow_ary()
|
Situation 3: Resizable Arrays
|
||||||
|
------------------------------
|
||||||
|
|
||||||
|
Use of RCU for resizable arrays is demonstrated by the grow_ary()
|
||||||
function formerly used by the System V IPC code. The array is used
|
function formerly used by the System V IPC code. The array is used
|
||||||
to map from semaphore, message-queue, and shared-memory IDs to the data
|
to map from semaphore, message-queue, and shared-memory IDs to the data
|
||||||
structure that represents the corresponding IPC construct. The grow_ary()
|
structure that represents the corresponding IPC construct. The grow_ary()
|
||||||
@@ -60,7 +70,7 @@ the remainder of the new, updates the ids->entries pointer to point to
|
|||||||
the new array, and invokes ipc_rcu_putref() to free up the old array.
|
the new array, and invokes ipc_rcu_putref() to free up the old array.
|
||||||
Note that rcu_assign_pointer() is used to update the ids->entries pointer,
|
Note that rcu_assign_pointer() is used to update the ids->entries pointer,
|
||||||
which includes any memory barriers required on whatever architecture
|
which includes any memory barriers required on whatever architecture
|
||||||
you are running on.
|
you are running on::
|
||||||
|
|
||||||
static int grow_ary(struct ipc_ids* ids, int newsize)
|
static int grow_ary(struct ipc_ids* ids, int newsize)
|
||||||
{
|
{
|
||||||
@@ -112,7 +122,7 @@ a simple check suffices. The pointer to the structure corresponding
|
|||||||
to the desired IPC object is placed in "out", with NULL indicating
|
to the desired IPC object is placed in "out", with NULL indicating
|
||||||
a non-existent entry. After acquiring "out->lock", the "out->deleted"
|
a non-existent entry. After acquiring "out->lock", the "out->deleted"
|
||||||
flag indicates whether the IPC object is in the process of being
|
flag indicates whether the IPC object is in the process of being
|
||||||
deleted, and, if not, the pointer is returned.
|
deleted, and, if not, the pointer is returned::
|
||||||
|
|
||||||
struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id)
|
struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id)
|
||||||
{
|
{
|
||||||
@@ -144,8 +154,10 @@ deleted, and, if not, the pointer is returned.
|
|||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.. _answer_quick_quiz_seqlock:
|
||||||
|
|
||||||
Answer to Quick Quiz:
|
Answer to Quick Quiz:
|
||||||
|
Why is it so important that updates be rare when using seqlock?
|
||||||
|
|
||||||
The reason that it is important that updates be rare when
|
The reason that it is important that updates be rare when
|
||||||
using seqlock is that frequent updates can livelock readers.
|
using seqlock is that frequent updates can livelock readers.
|
@@ -7,8 +7,13 @@ RCU concepts
|
|||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 3
|
:maxdepth: 3
|
||||||
|
|
||||||
|
arrayRCU
|
||||||
|
rcubarrier
|
||||||
|
rcu_dereference
|
||||||
|
whatisRCU
|
||||||
rcu
|
rcu
|
||||||
listRCU
|
listRCU
|
||||||
|
NMI-RCU
|
||||||
UP
|
UP
|
||||||
|
|
||||||
Design/Memory-Ordering/Tree-RCU-Memory-Ordering
|
Design/Memory-Ordering/Tree-RCU-Memory-Ordering
|
||||||
|
@@ -99,7 +99,7 @@ With this change, the rcu_dereference() is always within an RCU
|
|||||||
read-side critical section, which again would have suppressed the
|
read-side critical section, which again would have suppressed the
|
||||||
above lockdep-RCU splat.
|
above lockdep-RCU splat.
|
||||||
|
|
||||||
But in this particular case, we don't actually deference the pointer
|
But in this particular case, we don't actually dereference the pointer
|
||||||
returned from rcu_dereference(). Instead, that pointer is just compared
|
returned from rcu_dereference(). Instead, that pointer is just compared
|
||||||
to the cic pointer, which means that the rcu_dereference() can be replaced
|
to the cic pointer, which means that the rcu_dereference() can be replaced
|
||||||
by rcu_access_pointer() as follows:
|
by rcu_access_pointer() as follows:
|
||||||
|
@@ -1,4 +1,7 @@
|
|||||||
|
.. _rcu_dereference_doc:
|
||||||
|
|
||||||
PROPER CARE AND FEEDING OF RETURN VALUES FROM rcu_dereference()
|
PROPER CARE AND FEEDING OF RETURN VALUES FROM rcu_dereference()
|
||||||
|
===============================================================
|
||||||
|
|
||||||
Most of the time, you can use values from rcu_dereference() or one of
|
Most of the time, you can use values from rcu_dereference() or one of
|
||||||
the similar primitives without worries. Dereferencing (prefix "*"),
|
the similar primitives without worries. Dereferencing (prefix "*"),
|
||||||
@@ -8,7 +11,7 @@ subtraction of constants, and casts all work quite naturally and safely.
|
|||||||
It is nevertheless possible to get into trouble with other operations.
|
It is nevertheless possible to get into trouble with other operations.
|
||||||
Follow these rules to keep your RCU code working properly:
|
Follow these rules to keep your RCU code working properly:
|
||||||
|
|
||||||
o You must use one of the rcu_dereference() family of primitives
|
- You must use one of the rcu_dereference() family of primitives
|
||||||
to load an RCU-protected pointer, otherwise CONFIG_PROVE_RCU
|
to load an RCU-protected pointer, otherwise CONFIG_PROVE_RCU
|
||||||
will complain. Worse yet, your code can see random memory-corruption
|
will complain. Worse yet, your code can see random memory-corruption
|
||||||
bugs due to games that compilers and DEC Alpha can play.
|
bugs due to games that compilers and DEC Alpha can play.
|
||||||
@@ -25,24 +28,24 @@ o You must use one of the rcu_dereference() family of primitives
|
|||||||
for an example where the compiler can in fact deduce the exact
|
for an example where the compiler can in fact deduce the exact
|
||||||
value of the pointer, and thus cause misordering.
|
value of the pointer, and thus cause misordering.
|
||||||
|
|
||||||
o You are only permitted to use rcu_dereference on pointer values.
|
- You are only permitted to use rcu_dereference on pointer values.
|
||||||
The compiler simply knows too much about integral values to
|
The compiler simply knows too much about integral values to
|
||||||
trust it to carry dependencies through integer operations.
|
trust it to carry dependencies through integer operations.
|
||||||
There are a very few exceptions, namely that you can temporarily
|
There are a very few exceptions, namely that you can temporarily
|
||||||
cast the pointer to uintptr_t in order to:
|
cast the pointer to uintptr_t in order to:
|
||||||
|
|
||||||
o Set bits and clear bits down in the must-be-zero low-order
|
- Set bits and clear bits down in the must-be-zero low-order
|
||||||
bits of that pointer. This clearly means that the pointer
|
bits of that pointer. This clearly means that the pointer
|
||||||
must have alignment constraints, for example, this does
|
must have alignment constraints, for example, this does
|
||||||
-not- work in general for char* pointers.
|
-not- work in general for char* pointers.
|
||||||
|
|
||||||
o XOR bits to translate pointers, as is done in some
|
- XOR bits to translate pointers, as is done in some
|
||||||
classic buddy-allocator algorithms.
|
classic buddy-allocator algorithms.
|
||||||
|
|
||||||
It is important to cast the value back to pointer before
|
It is important to cast the value back to pointer before
|
||||||
doing much of anything else with it.
|
doing much of anything else with it.
|
||||||
|
|
||||||
o Avoid cancellation when using the "+" and "-" infix arithmetic
|
- Avoid cancellation when using the "+" and "-" infix arithmetic
|
||||||
operators. For example, for a given variable "x", avoid
|
operators. For example, for a given variable "x", avoid
|
||||||
"(x-(uintptr_t)x)" for char* pointers. The compiler is within its
|
"(x-(uintptr_t)x)" for char* pointers. The compiler is within its
|
||||||
rights to substitute zero for this sort of expression, so that
|
rights to substitute zero for this sort of expression, so that
|
||||||
@@ -54,16 +57,16 @@ o Avoid cancellation when using the "+" and "-" infix arithmetic
|
|||||||
"p+a-b" is safe because its value still necessarily depends on
|
"p+a-b" is safe because its value still necessarily depends on
|
||||||
the rcu_dereference(), thus maintaining proper ordering.
|
the rcu_dereference(), thus maintaining proper ordering.
|
||||||
|
|
||||||
o If you are using RCU to protect JITed functions, so that the
|
- If you are using RCU to protect JITed functions, so that the
|
||||||
"()" function-invocation operator is applied to a value obtained
|
"()" function-invocation operator is applied to a value obtained
|
||||||
(directly or indirectly) from rcu_dereference(), you may need to
|
(directly or indirectly) from rcu_dereference(), you may need to
|
||||||
interact directly with the hardware to flush instruction caches.
|
interact directly with the hardware to flush instruction caches.
|
||||||
This issue arises on some systems when a newly JITed function is
|
This issue arises on some systems when a newly JITed function is
|
||||||
using the same memory that was used by an earlier JITed function.
|
using the same memory that was used by an earlier JITed function.
|
||||||
|
|
||||||
o Do not use the results from relational operators ("==", "!=",
|
- Do not use the results from relational operators ("==", "!=",
|
||||||
">", ">=", "<", or "<=") when dereferencing. For example,
|
">", ">=", "<", or "<=") when dereferencing. For example,
|
||||||
the following (quite strange) code is buggy:
|
the following (quite strange) code is buggy::
|
||||||
|
|
||||||
int *p;
|
int *p;
|
||||||
int *q;
|
int *q;
|
||||||
@@ -81,11 +84,11 @@ o Do not use the results from relational operators ("==", "!=",
|
|||||||
after such branches, but can speculate loads, which can again
|
after such branches, but can speculate loads, which can again
|
||||||
result in misordering bugs.
|
result in misordering bugs.
|
||||||
|
|
||||||
o Be very careful about comparing pointers obtained from
|
- Be very careful about comparing pointers obtained from
|
||||||
rcu_dereference() against non-NULL values. As Linus Torvalds
|
rcu_dereference() against non-NULL values. As Linus Torvalds
|
||||||
explained, if the two pointers are equal, the compiler could
|
explained, if the two pointers are equal, the compiler could
|
||||||
substitute the pointer you are comparing against for the pointer
|
substitute the pointer you are comparing against for the pointer
|
||||||
obtained from rcu_dereference(). For example:
|
obtained from rcu_dereference(). For example::
|
||||||
|
|
||||||
p = rcu_dereference(gp);
|
p = rcu_dereference(gp);
|
||||||
if (p == &default_struct)
|
if (p == &default_struct)
|
||||||
@@ -93,7 +96,7 @@ o Be very careful about comparing pointers obtained from
|
|||||||
|
|
||||||
Because the compiler now knows that the value of "p" is exactly
|
Because the compiler now knows that the value of "p" is exactly
|
||||||
the address of the variable "default_struct", it is free to
|
the address of the variable "default_struct", it is free to
|
||||||
transform this code into the following:
|
transform this code into the following::
|
||||||
|
|
||||||
p = rcu_dereference(gp);
|
p = rcu_dereference(gp);
|
||||||
if (p == &default_struct)
|
if (p == &default_struct)
|
||||||
@@ -105,14 +108,14 @@ o Be very careful about comparing pointers obtained from
|
|||||||
|
|
||||||
However, comparisons are OK in the following cases:
|
However, comparisons are OK in the following cases:
|
||||||
|
|
||||||
o The comparison was against the NULL pointer. If the
|
- The comparison was against the NULL pointer. If the
|
||||||
compiler knows that the pointer is NULL, you had better
|
compiler knows that the pointer is NULL, you had better
|
||||||
not be dereferencing it anyway. If the comparison is
|
not be dereferencing it anyway. If the comparison is
|
||||||
non-equal, the compiler is none the wiser. Therefore,
|
non-equal, the compiler is none the wiser. Therefore,
|
||||||
it is safe to compare pointers from rcu_dereference()
|
it is safe to compare pointers from rcu_dereference()
|
||||||
against NULL pointers.
|
against NULL pointers.
|
||||||
|
|
||||||
o The pointer is never dereferenced after being compared.
|
- The pointer is never dereferenced after being compared.
|
||||||
Since there are no subsequent dereferences, the compiler
|
Since there are no subsequent dereferences, the compiler
|
||||||
cannot use anything it learned from the comparison
|
cannot use anything it learned from the comparison
|
||||||
to reorder the non-existent subsequent dereferences.
|
to reorder the non-existent subsequent dereferences.
|
||||||
@@ -124,31 +127,31 @@ o Be very careful about comparing pointers obtained from
|
|||||||
dereferenced, rcu_access_pointer() should be used in place
|
dereferenced, rcu_access_pointer() should be used in place
|
||||||
of rcu_dereference().
|
of rcu_dereference().
|
||||||
|
|
||||||
o The comparison is against a pointer that references memory
|
- The comparison is against a pointer that references memory
|
||||||
that was initialized "a long time ago." The reason
|
that was initialized "a long time ago." The reason
|
||||||
this is safe is that even if misordering occurs, the
|
this is safe is that even if misordering occurs, the
|
||||||
misordering will not affect the accesses that follow
|
misordering will not affect the accesses that follow
|
||||||
the comparison. So exactly how long ago is "a long
|
the comparison. So exactly how long ago is "a long
|
||||||
time ago"? Here are some possibilities:
|
time ago"? Here are some possibilities:
|
||||||
|
|
||||||
o Compile time.
|
- Compile time.
|
||||||
|
|
||||||
o Boot time.
|
- Boot time.
|
||||||
|
|
||||||
o Module-init time for module code.
|
- Module-init time for module code.
|
||||||
|
|
||||||
o Prior to kthread creation for kthread code.
|
- Prior to kthread creation for kthread code.
|
||||||
|
|
||||||
o During some prior acquisition of the lock that
|
- During some prior acquisition of the lock that
|
||||||
we now hold.
|
we now hold.
|
||||||
|
|
||||||
o Before mod_timer() time for a timer handler.
|
- Before mod_timer() time for a timer handler.
|
||||||
|
|
||||||
There are many other possibilities involving the Linux
|
There are many other possibilities involving the Linux
|
||||||
kernel's wide array of primitives that cause code to
|
kernel's wide array of primitives that cause code to
|
||||||
be invoked at a later time.
|
be invoked at a later time.
|
||||||
|
|
||||||
o The pointer being compared against also came from
|
- The pointer being compared against also came from
|
||||||
rcu_dereference(). In this case, both pointers depend
|
rcu_dereference(). In this case, both pointers depend
|
||||||
on one rcu_dereference() or another, so you get proper
|
on one rcu_dereference() or another, so you get proper
|
||||||
ordering either way.
|
ordering either way.
|
||||||
@@ -159,13 +162,13 @@ o Be very careful about comparing pointers obtained from
|
|||||||
of such an RCU usage bug is shown in the section titled
|
of such an RCU usage bug is shown in the section titled
|
||||||
"EXAMPLE OF AMPLIFIED RCU-USAGE BUG".
|
"EXAMPLE OF AMPLIFIED RCU-USAGE BUG".
|
||||||
|
|
||||||
o All of the accesses following the comparison are stores,
|
- All of the accesses following the comparison are stores,
|
||||||
so that a control dependency preserves the needed ordering.
|
so that a control dependency preserves the needed ordering.
|
||||||
That said, it is easy to get control dependencies wrong.
|
That said, it is easy to get control dependencies wrong.
|
||||||
Please see the "CONTROL DEPENDENCIES" section of
|
Please see the "CONTROL DEPENDENCIES" section of
|
||||||
Documentation/memory-barriers.txt for more details.
|
Documentation/memory-barriers.txt for more details.
|
||||||
|
|
||||||
o The pointers are not equal -and- the compiler does
|
- The pointers are not equal -and- the compiler does
|
||||||
not have enough information to deduce the value of the
|
not have enough information to deduce the value of the
|
||||||
pointer. Note that the volatile cast in rcu_dereference()
|
pointer. Note that the volatile cast in rcu_dereference()
|
||||||
will normally prevent the compiler from knowing too much.
|
will normally prevent the compiler from knowing too much.
|
||||||
@@ -175,7 +178,7 @@ o Be very careful about comparing pointers obtained from
|
|||||||
comparison will provide exactly the information that the
|
comparison will provide exactly the information that the
|
||||||
compiler needs to deduce the value of the pointer.
|
compiler needs to deduce the value of the pointer.
|
||||||
|
|
||||||
o Disable any value-speculation optimizations that your compiler
|
- Disable any value-speculation optimizations that your compiler
|
||||||
might provide, especially if you are making use of feedback-based
|
might provide, especially if you are making use of feedback-based
|
||||||
optimizations that take data collected from prior runs. Such
|
optimizations that take data collected from prior runs. Such
|
||||||
value-speculation optimizations reorder operations by design.
|
value-speculation optimizations reorder operations by design.
|
||||||
@@ -188,11 +191,12 @@ o Disable any value-speculation optimizations that your compiler
|
|||||||
|
|
||||||
|
|
||||||
EXAMPLE OF AMPLIFIED RCU-USAGE BUG
|
EXAMPLE OF AMPLIFIED RCU-USAGE BUG
|
||||||
|
----------------------------------
|
||||||
|
|
||||||
Because updaters can run concurrently with RCU readers, RCU readers can
|
Because updaters can run concurrently with RCU readers, RCU readers can
|
||||||
see stale and/or inconsistent values. If RCU readers need fresh or
|
see stale and/or inconsistent values. If RCU readers need fresh or
|
||||||
consistent values, which they sometimes do, they need to take proper
|
consistent values, which they sometimes do, they need to take proper
|
||||||
precautions. To see this, consider the following code fragment:
|
precautions. To see this, consider the following code fragment::
|
||||||
|
|
||||||
struct foo {
|
struct foo {
|
||||||
int a;
|
int a;
|
||||||
@@ -244,7 +248,7 @@ to some reordering from the compiler and CPUs is beside the point.
|
|||||||
|
|
||||||
But suppose that the reader needs a consistent view?
|
But suppose that the reader needs a consistent view?
|
||||||
|
|
||||||
Then one approach is to use locking, for example, as follows:
|
Then one approach is to use locking, for example, as follows::
|
||||||
|
|
||||||
struct foo {
|
struct foo {
|
||||||
int a;
|
int a;
|
||||||
@@ -299,6 +303,7 @@ As always, use the right tool for the job!
|
|||||||
|
|
||||||
|
|
||||||
EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH
|
EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH
|
||||||
|
-----------------------------------------
|
||||||
|
|
||||||
If a pointer obtained from rcu_dereference() compares not-equal to some
|
If a pointer obtained from rcu_dereference() compares not-equal to some
|
||||||
other pointer, the compiler normally has no clue what the value of the
|
other pointer, the compiler normally has no clue what the value of the
|
||||||
@@ -308,7 +313,7 @@ guarantees that RCU depends on. And the volatile cast in rcu_dereference()
|
|||||||
should prevent the compiler from guessing the value.
|
should prevent the compiler from guessing the value.
|
||||||
|
|
||||||
But without rcu_dereference(), the compiler knows more than you might
|
But without rcu_dereference(), the compiler knows more than you might
|
||||||
expect. Consider the following code fragment:
|
expect. Consider the following code fragment::
|
||||||
|
|
||||||
struct foo {
|
struct foo {
|
||||||
int a;
|
int a;
|
||||||
@@ -354,6 +359,7 @@ dereference the resulting pointer.
|
|||||||
|
|
||||||
|
|
||||||
WHICH MEMBER OF THE rcu_dereference() FAMILY SHOULD YOU USE?
|
WHICH MEMBER OF THE rcu_dereference() FAMILY SHOULD YOU USE?
|
||||||
|
------------------------------------------------------------
|
||||||
|
|
||||||
First, please avoid using rcu_dereference_raw() and also please avoid
|
First, please avoid using rcu_dereference_raw() and also please avoid
|
||||||
using rcu_dereference_check() and rcu_dereference_protected() with a
|
using rcu_dereference_check() and rcu_dereference_protected() with a
|
||||||
@@ -370,7 +376,7 @@ member of the rcu_dereference() to use in various situations:
|
|||||||
|
|
||||||
2. If the access might be within an RCU read-side critical section
|
2. If the access might be within an RCU read-side critical section
|
||||||
on the one hand, or protected by (say) my_lock on the other,
|
on the one hand, or protected by (say) my_lock on the other,
|
||||||
use rcu_dereference_check(), for example:
|
use rcu_dereference_check(), for example::
|
||||||
|
|
||||||
p1 = rcu_dereference_check(p->rcu_protected_pointer,
|
p1 = rcu_dereference_check(p->rcu_protected_pointer,
|
||||||
lockdep_is_held(&my_lock));
|
lockdep_is_held(&my_lock));
|
||||||
@@ -378,14 +384,14 @@ member of the rcu_dereference() to use in various situations:
|
|||||||
|
|
||||||
3. If the access might be within an RCU read-side critical section
|
3. If the access might be within an RCU read-side critical section
|
||||||
on the one hand, or protected by either my_lock or your_lock on
|
on the one hand, or protected by either my_lock or your_lock on
|
||||||
the other, again use rcu_dereference_check(), for example:
|
the other, again use rcu_dereference_check(), for example::
|
||||||
|
|
||||||
p1 = rcu_dereference_check(p->rcu_protected_pointer,
|
p1 = rcu_dereference_check(p->rcu_protected_pointer,
|
||||||
lockdep_is_held(&my_lock) ||
|
lockdep_is_held(&my_lock) ||
|
||||||
lockdep_is_held(&your_lock));
|
lockdep_is_held(&your_lock));
|
||||||
|
|
||||||
4. If the access is on the update side, so that it is always protected
|
4. If the access is on the update side, so that it is always protected
|
||||||
by my_lock, use rcu_dereference_protected():
|
by my_lock, use rcu_dereference_protected()::
|
||||||
|
|
||||||
p1 = rcu_dereference_protected(p->rcu_protected_pointer,
|
p1 = rcu_dereference_protected(p->rcu_protected_pointer,
|
||||||
lockdep_is_held(&my_lock));
|
lockdep_is_held(&my_lock));
|
||||||
@@ -410,18 +416,19 @@ member of the rcu_dereference() to use in various situations:
|
|||||||
|
|
||||||
|
|
||||||
SPARSE CHECKING OF RCU-PROTECTED POINTERS
|
SPARSE CHECKING OF RCU-PROTECTED POINTERS
|
||||||
|
-----------------------------------------
|
||||||
|
|
||||||
The sparse static-analysis tool checks for direct access to RCU-protected
|
The sparse static-analysis tool checks for direct access to RCU-protected
|
||||||
pointers, which can result in "interesting" bugs due to compiler
|
pointers, which can result in "interesting" bugs due to compiler
|
||||||
optimizations involving invented loads and perhaps also load tearing.
|
optimizations involving invented loads and perhaps also load tearing.
|
||||||
For example, suppose someone mistakenly does something like this:
|
For example, suppose someone mistakenly does something like this::
|
||||||
|
|
||||||
p = q->rcu_protected_pointer;
|
p = q->rcu_protected_pointer;
|
||||||
do_something_with(p->a);
|
do_something_with(p->a);
|
||||||
do_something_else_with(p->b);
|
do_something_else_with(p->b);
|
||||||
|
|
||||||
If register pressure is high, the compiler might optimize "p" out
|
If register pressure is high, the compiler might optimize "p" out
|
||||||
of existence, transforming the code to something like this:
|
of existence, transforming the code to something like this::
|
||||||
|
|
||||||
do_something_with(q->rcu_protected_pointer->a);
|
do_something_with(q->rcu_protected_pointer->a);
|
||||||
do_something_else_with(q->rcu_protected_pointer->b);
|
do_something_else_with(q->rcu_protected_pointer->b);
|
||||||
@@ -435,7 +442,7 @@ Load tearing could of course result in dereferencing a mashup of a pair
|
|||||||
of pointers, which also might fatally disappoint your code.
|
of pointers, which also might fatally disappoint your code.
|
||||||
|
|
||||||
These problems could have been avoided simply by making the code instead
|
These problems could have been avoided simply by making the code instead
|
||||||
read as follows:
|
read as follows::
|
||||||
|
|
||||||
p = rcu_dereference(q->rcu_protected_pointer);
|
p = rcu_dereference(q->rcu_protected_pointer);
|
||||||
do_something_with(p->a);
|
do_something_with(p->a);
|
||||||
@@ -448,7 +455,7 @@ or as a formal parameter, with "__rcu", which tells sparse to complain if
|
|||||||
this pointer is accessed directly. It will also cause sparse to complain
|
this pointer is accessed directly. It will also cause sparse to complain
|
||||||
if a pointer not marked with "__rcu" is accessed using rcu_dereference()
|
if a pointer not marked with "__rcu" is accessed using rcu_dereference()
|
||||||
and friends. For example, ->rcu_protected_pointer might be declared as
|
and friends. For example, ->rcu_protected_pointer might be declared as
|
||||||
follows:
|
follows::
|
||||||
|
|
||||||
struct foo __rcu *rcu_protected_pointer;
|
struct foo __rcu *rcu_protected_pointer;
|
||||||
|
|
@@ -1,4 +1,7 @@
|
|||||||
|
.. _rcu_barrier:
|
||||||
|
|
||||||
RCU and Unloadable Modules
|
RCU and Unloadable Modules
|
||||||
|
==========================
|
||||||
|
|
||||||
[Originally published in LWN Jan. 14, 2007: http://lwn.net/Articles/217484/]
|
[Originally published in LWN Jan. 14, 2007: http://lwn.net/Articles/217484/]
|
||||||
|
|
||||||
@@ -21,7 +24,7 @@ given that readers might well leave absolutely no trace of their
|
|||||||
presence? There is a synchronize_rcu() primitive that blocks until all
|
presence? There is a synchronize_rcu() primitive that blocks until all
|
||||||
pre-existing readers have completed. An updater wishing to delete an
|
pre-existing readers have completed. An updater wishing to delete an
|
||||||
element p from a linked list might do the following, while holding an
|
element p from a linked list might do the following, while holding an
|
||||||
appropriate lock, of course:
|
appropriate lock, of course::
|
||||||
|
|
||||||
list_del_rcu(p);
|
list_del_rcu(p);
|
||||||
synchronize_rcu();
|
synchronize_rcu();
|
||||||
@@ -32,13 +35,13 @@ primitive must be used instead. This primitive takes a pointer to an
|
|||||||
rcu_head struct placed within the RCU-protected data structure and
|
rcu_head struct placed within the RCU-protected data structure and
|
||||||
another pointer to a function that may be invoked later to free that
|
another pointer to a function that may be invoked later to free that
|
||||||
structure. Code to delete an element p from the linked list from IRQ
|
structure. Code to delete an element p from the linked list from IRQ
|
||||||
context might then be as follows:
|
context might then be as follows::
|
||||||
|
|
||||||
list_del_rcu(p);
|
list_del_rcu(p);
|
||||||
call_rcu(&p->rcu, p_callback);
|
call_rcu(&p->rcu, p_callback);
|
||||||
|
|
||||||
Since call_rcu() never blocks, this code can safely be used from within
|
Since call_rcu() never blocks, this code can safely be used from within
|
||||||
IRQ context. The function p_callback() might be defined as follows:
|
IRQ context. The function p_callback() might be defined as follows::
|
||||||
|
|
||||||
static void p_callback(struct rcu_head *rp)
|
static void p_callback(struct rcu_head *rp)
|
||||||
{
|
{
|
||||||
@@ -49,6 +52,7 @@ IRQ context. The function p_callback() might be defined as follows:
|
|||||||
|
|
||||||
|
|
||||||
Unloading Modules That Use call_rcu()
|
Unloading Modules That Use call_rcu()
|
||||||
|
-------------------------------------
|
||||||
|
|
||||||
But what if p_callback is defined in an unloadable module?
|
But what if p_callback is defined in an unloadable module?
|
||||||
|
|
||||||
@@ -69,10 +73,11 @@ in realtime kernels in order to avoid excessive scheduling latencies.
|
|||||||
|
|
||||||
|
|
||||||
rcu_barrier()
|
rcu_barrier()
|
||||||
|
-------------
|
||||||
|
|
||||||
We instead need the rcu_barrier() primitive. Rather than waiting for
|
We instead need the rcu_barrier() primitive. Rather than waiting for
|
||||||
a grace period to elapse, rcu_barrier() waits for all outstanding RCU
|
a grace period to elapse, rcu_barrier() waits for all outstanding RCU
|
||||||
callbacks to complete. Please note that rcu_barrier() does -not- imply
|
callbacks to complete. Please note that rcu_barrier() does **not** imply
|
||||||
synchronize_rcu(), in particular, if there are no RCU callbacks queued
|
synchronize_rcu(), in particular, if there are no RCU callbacks queued
|
||||||
anywhere, rcu_barrier() is within its rights to return immediately,
|
anywhere, rcu_barrier() is within its rights to return immediately,
|
||||||
without waiting for a grace period to elapse.
|
without waiting for a grace period to elapse.
|
||||||
@@ -88,79 +93,79 @@ must match the flavor of rcu_barrier() with that of call_rcu(). If your
|
|||||||
module uses multiple flavors of call_rcu(), then it must also use multiple
|
module uses multiple flavors of call_rcu(), then it must also use multiple
|
||||||
flavors of rcu_barrier() when unloading that module. For example, if
|
flavors of rcu_barrier() when unloading that module. For example, if
|
||||||
it uses call_rcu(), call_srcu() on srcu_struct_1, and call_srcu() on
|
it uses call_rcu(), call_srcu() on srcu_struct_1, and call_srcu() on
|
||||||
srcu_struct_2(), then the following three lines of code will be required
|
srcu_struct_2, then the following three lines of code will be required
|
||||||
when unloading:
|
when unloading::
|
||||||
|
|
||||||
1 rcu_barrier();
|
1 rcu_barrier();
|
||||||
2 srcu_barrier(&srcu_struct_1);
|
2 srcu_barrier(&srcu_struct_1);
|
||||||
3 srcu_barrier(&srcu_struct_2);
|
3 srcu_barrier(&srcu_struct_2);
|
||||||
|
|
||||||
The rcutorture module makes use of rcu_barrier() in its exit function
|
The rcutorture module makes use of rcu_barrier() in its exit function
|
||||||
as follows:
|
as follows::
|
||||||
|
|
||||||
1 static void
|
1 static void
|
||||||
2 rcu_torture_cleanup(void)
|
2 rcu_torture_cleanup(void)
|
||||||
3 {
|
3 {
|
||||||
4 int i;
|
4 int i;
|
||||||
5
|
5
|
||||||
6 fullstop = 1;
|
6 fullstop = 1;
|
||||||
7 if (shuffler_task != NULL) {
|
7 if (shuffler_task != NULL) {
|
||||||
8 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
|
8 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
|
||||||
9 kthread_stop(shuffler_task);
|
9 kthread_stop(shuffler_task);
|
||||||
10 }
|
10 }
|
||||||
11 shuffler_task = NULL;
|
11 shuffler_task = NULL;
|
||||||
12
|
12
|
||||||
13 if (writer_task != NULL) {
|
13 if (writer_task != NULL) {
|
||||||
14 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
|
14 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
|
||||||
15 kthread_stop(writer_task);
|
15 kthread_stop(writer_task);
|
||||||
16 }
|
16 }
|
||||||
17 writer_task = NULL;
|
17 writer_task = NULL;
|
||||||
18
|
18
|
||||||
19 if (reader_tasks != NULL) {
|
19 if (reader_tasks != NULL) {
|
||||||
20 for (i = 0; i < nrealreaders; i++) {
|
20 for (i = 0; i < nrealreaders; i++) {
|
||||||
21 if (reader_tasks[i] != NULL) {
|
21 if (reader_tasks[i] != NULL) {
|
||||||
22 VERBOSE_PRINTK_STRING(
|
22 VERBOSE_PRINTK_STRING(
|
||||||
23 "Stopping rcu_torture_reader task");
|
23 "Stopping rcu_torture_reader task");
|
||||||
24 kthread_stop(reader_tasks[i]);
|
24 kthread_stop(reader_tasks[i]);
|
||||||
25 }
|
25 }
|
||||||
26 reader_tasks[i] = NULL;
|
26 reader_tasks[i] = NULL;
|
||||||
27 }
|
27 }
|
||||||
28 kfree(reader_tasks);
|
28 kfree(reader_tasks);
|
||||||
29 reader_tasks = NULL;
|
29 reader_tasks = NULL;
|
||||||
30 }
|
30 }
|
||||||
31 rcu_torture_current = NULL;
|
31 rcu_torture_current = NULL;
|
||||||
32
|
32
|
||||||
33 if (fakewriter_tasks != NULL) {
|
33 if (fakewriter_tasks != NULL) {
|
||||||
34 for (i = 0; i < nfakewriters; i++) {
|
34 for (i = 0; i < nfakewriters; i++) {
|
||||||
35 if (fakewriter_tasks[i] != NULL) {
|
35 if (fakewriter_tasks[i] != NULL) {
|
||||||
36 VERBOSE_PRINTK_STRING(
|
36 VERBOSE_PRINTK_STRING(
|
||||||
37 "Stopping rcu_torture_fakewriter task");
|
37 "Stopping rcu_torture_fakewriter task");
|
||||||
38 kthread_stop(fakewriter_tasks[i]);
|
38 kthread_stop(fakewriter_tasks[i]);
|
||||||
39 }
|
39 }
|
||||||
40 fakewriter_tasks[i] = NULL;
|
40 fakewriter_tasks[i] = NULL;
|
||||||
41 }
|
41 }
|
||||||
42 kfree(fakewriter_tasks);
|
42 kfree(fakewriter_tasks);
|
||||||
43 fakewriter_tasks = NULL;
|
43 fakewriter_tasks = NULL;
|
||||||
44 }
|
44 }
|
||||||
45
|
45
|
||||||
46 if (stats_task != NULL) {
|
46 if (stats_task != NULL) {
|
||||||
47 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
|
47 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
|
||||||
48 kthread_stop(stats_task);
|
48 kthread_stop(stats_task);
|
||||||
49 }
|
49 }
|
||||||
50 stats_task = NULL;
|
50 stats_task = NULL;
|
||||||
51
|
51
|
||||||
52 /* Wait for all RCU callbacks to fire. */
|
52 /* Wait for all RCU callbacks to fire. */
|
||||||
53 rcu_barrier();
|
53 rcu_barrier();
|
||||||
54
|
54
|
||||||
55 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
|
55 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
|
||||||
56
|
56
|
||||||
57 if (cur_ops->cleanup != NULL)
|
57 if (cur_ops->cleanup != NULL)
|
||||||
58 cur_ops->cleanup();
|
58 cur_ops->cleanup();
|
||||||
59 if (atomic_read(&n_rcu_torture_error))
|
59 if (atomic_read(&n_rcu_torture_error))
|
||||||
60 rcu_torture_print_module_parms("End of test: FAILURE");
|
60 rcu_torture_print_module_parms("End of test: FAILURE");
|
||||||
61 else
|
61 else
|
||||||
62 rcu_torture_print_module_parms("End of test: SUCCESS");
|
62 rcu_torture_print_module_parms("End of test: SUCCESS");
|
||||||
63 }
|
63 }
|
||||||
|
|
||||||
Line 6 sets a global variable that prevents any RCU callbacks from
|
Line 6 sets a global variable that prevents any RCU callbacks from
|
||||||
re-posting themselves. This will not be necessary in most cases, since
|
re-posting themselves. This will not be necessary in most cases, since
|
||||||
@@ -176,9 +181,14 @@ for any pre-existing callbacks to complete.
|
|||||||
Then lines 55-62 print status and do operation-specific cleanup, and
|
Then lines 55-62 print status and do operation-specific cleanup, and
|
||||||
then return, permitting the module-unload operation to be completed.
|
then return, permitting the module-unload operation to be completed.
|
||||||
|
|
||||||
Quick Quiz #1: Is there any other situation where rcu_barrier() might
|
.. _rcubarrier_quiz_1:
|
||||||
|
|
||||||
|
Quick Quiz #1:
|
||||||
|
Is there any other situation where rcu_barrier() might
|
||||||
be required?
|
be required?
|
||||||
|
|
||||||
|
:ref:`Answer to Quick Quiz #1 <answer_rcubarrier_quiz_1>`
|
||||||
|
|
||||||
Your module might have additional complications. For example, if your
|
Your module might have additional complications. For example, if your
|
||||||
module invokes call_rcu() from timers, you will need to first cancel all
|
module invokes call_rcu() from timers, you will need to first cancel all
|
||||||
the timers, and only then invoke rcu_barrier() to wait for any remaining
|
the timers, and only then invoke rcu_barrier() to wait for any remaining
|
||||||
@@ -188,11 +198,12 @@ Of course, if you module uses call_rcu(), you will need to invoke
|
|||||||
rcu_barrier() before unloading. Similarly, if your module uses
|
rcu_barrier() before unloading. Similarly, if your module uses
|
||||||
call_srcu(), you will need to invoke srcu_barrier() before unloading,
|
call_srcu(), you will need to invoke srcu_barrier() before unloading,
|
||||||
and on the same srcu_struct structure. If your module uses call_rcu()
|
and on the same srcu_struct structure. If your module uses call_rcu()
|
||||||
-and- call_srcu(), then you will need to invoke rcu_barrier() -and-
|
**and** call_srcu(), then you will need to invoke rcu_barrier() **and**
|
||||||
srcu_barrier().
|
srcu_barrier().
|
||||||
|
|
||||||
|
|
||||||
Implementing rcu_barrier()
|
Implementing rcu_barrier()
|
||||||
|
--------------------------
|
||||||
|
|
||||||
Dipankar Sarma's implementation of rcu_barrier() makes use of the fact
|
Dipankar Sarma's implementation of rcu_barrier() makes use of the fact
|
||||||
that RCU callbacks are never reordered once queued on one of the per-CPU
|
that RCU callbacks are never reordered once queued on one of the per-CPU
|
||||||
@@ -200,19 +211,19 @@ queues. His implementation queues an RCU callback on each of the per-CPU
|
|||||||
callback queues, and then waits until they have all started executing, at
|
callback queues, and then waits until they have all started executing, at
|
||||||
which point, all earlier RCU callbacks are guaranteed to have completed.
|
which point, all earlier RCU callbacks are guaranteed to have completed.
|
||||||
|
|
||||||
The original code for rcu_barrier() was as follows:
|
The original code for rcu_barrier() was as follows::
|
||||||
|
|
||||||
1 void rcu_barrier(void)
|
1 void rcu_barrier(void)
|
||||||
2 {
|
2 {
|
||||||
3 BUG_ON(in_interrupt());
|
3 BUG_ON(in_interrupt());
|
||||||
4 /* Take cpucontrol mutex to protect against CPU hotplug */
|
4 /* Take cpucontrol mutex to protect against CPU hotplug */
|
||||||
5 mutex_lock(&rcu_barrier_mutex);
|
5 mutex_lock(&rcu_barrier_mutex);
|
||||||
6 init_completion(&rcu_barrier_completion);
|
6 init_completion(&rcu_barrier_completion);
|
||||||
7 atomic_set(&rcu_barrier_cpu_count, 0);
|
7 atomic_set(&rcu_barrier_cpu_count, 0);
|
||||||
8 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
|
8 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
|
||||||
9 wait_for_completion(&rcu_barrier_completion);
|
9 wait_for_completion(&rcu_barrier_completion);
|
||||||
10 mutex_unlock(&rcu_barrier_mutex);
|
10 mutex_unlock(&rcu_barrier_mutex);
|
||||||
11 }
|
11 }
|
||||||
|
|
||||||
Line 3 verifies that the caller is in process context, and lines 5 and 10
|
Line 3 verifies that the caller is in process context, and lines 5 and 10
|
||||||
use rcu_barrier_mutex to ensure that only one rcu_barrier() is using the
|
use rcu_barrier_mutex to ensure that only one rcu_barrier() is using the
|
||||||
@@ -226,18 +237,18 @@ This code was rewritten in 2008 and several times thereafter, but this
|
|||||||
still gives the general idea.
|
still gives the general idea.
|
||||||
|
|
||||||
The rcu_barrier_func() runs on each CPU, where it invokes call_rcu()
|
The rcu_barrier_func() runs on each CPU, where it invokes call_rcu()
|
||||||
to post an RCU callback, as follows:
|
to post an RCU callback, as follows::
|
||||||
|
|
||||||
1 static void rcu_barrier_func(void *notused)
|
1 static void rcu_barrier_func(void *notused)
|
||||||
2 {
|
2 {
|
||||||
3 int cpu = smp_processor_id();
|
3 int cpu = smp_processor_id();
|
||||||
4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
|
4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
|
||||||
5 struct rcu_head *head;
|
5 struct rcu_head *head;
|
||||||
6
|
6
|
||||||
7 head = &rdp->barrier;
|
7 head = &rdp->barrier;
|
||||||
8 atomic_inc(&rcu_barrier_cpu_count);
|
8 atomic_inc(&rcu_barrier_cpu_count);
|
||||||
9 call_rcu(head, rcu_barrier_callback);
|
9 call_rcu(head, rcu_barrier_callback);
|
||||||
10 }
|
10 }
|
||||||
|
|
||||||
Lines 3 and 4 locate RCU's internal per-CPU rcu_data structure,
|
Lines 3 and 4 locate RCU's internal per-CPU rcu_data structure,
|
||||||
which contains the struct rcu_head that is needed for the later call to
|
which contains the struct rcu_head that is needed for the later call to
|
||||||
@@ -248,20 +259,25 @@ the current CPU's queue.
|
|||||||
|
|
||||||
The rcu_barrier_callback() function simply atomically decrements the
|
The rcu_barrier_callback() function simply atomically decrements the
|
||||||
rcu_barrier_cpu_count variable and finalizes the completion when it
|
rcu_barrier_cpu_count variable and finalizes the completion when it
|
||||||
reaches zero, as follows:
|
reaches zero, as follows::
|
||||||
|
|
||||||
1 static void rcu_barrier_callback(struct rcu_head *notused)
|
1 static void rcu_barrier_callback(struct rcu_head *notused)
|
||||||
2 {
|
2 {
|
||||||
3 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
|
3 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
|
||||||
4 complete(&rcu_barrier_completion);
|
4 complete(&rcu_barrier_completion);
|
||||||
5 }
|
5 }
|
||||||
|
|
||||||
Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes
|
.. _rcubarrier_quiz_2:
|
||||||
|
|
||||||
|
Quick Quiz #2:
|
||||||
|
What happens if CPU 0's rcu_barrier_func() executes
|
||||||
immediately (thus incrementing rcu_barrier_cpu_count to the
|
immediately (thus incrementing rcu_barrier_cpu_count to the
|
||||||
value one), but the other CPUs' rcu_barrier_func() invocations
|
value one), but the other CPUs' rcu_barrier_func() invocations
|
||||||
are delayed for a full grace period? Couldn't this result in
|
are delayed for a full grace period? Couldn't this result in
|
||||||
rcu_barrier() returning prematurely?
|
rcu_barrier() returning prematurely?
|
||||||
|
|
||||||
|
:ref:`Answer to Quick Quiz #2 <answer_rcubarrier_quiz_2>`
|
||||||
|
|
||||||
The current rcu_barrier() implementation is more complex, due to the need
|
The current rcu_barrier() implementation is more complex, due to the need
|
||||||
to avoid disturbing idle CPUs (especially on battery-powered systems)
|
to avoid disturbing idle CPUs (especially on battery-powered systems)
|
||||||
and the need to minimally disturb non-idle CPUs in real-time systems.
|
and the need to minimally disturb non-idle CPUs in real-time systems.
|
||||||
@@ -269,6 +285,7 @@ However, the code above illustrates the concepts.
|
|||||||
|
|
||||||
|
|
||||||
rcu_barrier() Summary
|
rcu_barrier() Summary
|
||||||
|
---------------------
|
||||||
|
|
||||||
The rcu_barrier() primitive has seen relatively little use, since most
|
The rcu_barrier() primitive has seen relatively little use, since most
|
||||||
code using RCU is in the core kernel rather than in modules. However, if
|
code using RCU is in the core kernel rather than in modules. However, if
|
||||||
@@ -277,8 +294,12 @@ so that your module may be safely unloaded.
|
|||||||
|
|
||||||
|
|
||||||
Answers to Quick Quizzes
|
Answers to Quick Quizzes
|
||||||
|
------------------------
|
||||||
|
|
||||||
Quick Quiz #1: Is there any other situation where rcu_barrier() might
|
.. _answer_rcubarrier_quiz_1:
|
||||||
|
|
||||||
|
Quick Quiz #1:
|
||||||
|
Is there any other situation where rcu_barrier() might
|
||||||
be required?
|
be required?
|
||||||
|
|
||||||
Answer: Interestingly enough, rcu_barrier() was not originally
|
Answer: Interestingly enough, rcu_barrier() was not originally
|
||||||
@@ -292,7 +313,12 @@ Answer: Interestingly enough, rcu_barrier() was not originally
|
|||||||
implementing rcutorture, and found that rcu_barrier() solves
|
implementing rcutorture, and found that rcu_barrier() solves
|
||||||
this problem as well.
|
this problem as well.
|
||||||
|
|
||||||
Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes
|
:ref:`Back to Quick Quiz #1 <rcubarrier_quiz_1>`
|
||||||
|
|
||||||
|
.. _answer_rcubarrier_quiz_2:
|
||||||
|
|
||||||
|
Quick Quiz #2:
|
||||||
|
What happens if CPU 0's rcu_barrier_func() executes
|
||||||
immediately (thus incrementing rcu_barrier_cpu_count to the
|
immediately (thus incrementing rcu_barrier_cpu_count to the
|
||||||
value one), but the other CPUs' rcu_barrier_func() invocations
|
value one), but the other CPUs' rcu_barrier_func() invocations
|
||||||
are delayed for a full grace period? Couldn't this result in
|
are delayed for a full grace period? Couldn't this result in
|
||||||
@@ -323,3 +349,5 @@ Answer: This cannot happen. The reason is that on_each_cpu() has its last
|
|||||||
is to add an rcu_read_lock() before line 8 of rcu_barrier()
|
is to add an rcu_read_lock() before line 8 of rcu_barrier()
|
||||||
and an rcu_read_unlock() after line 8 of this same function. If
|
and an rcu_read_unlock() after line 8 of this same function. If
|
||||||
you can think of a better change, please let me know!
|
you can think of a better change, please let me know!
|
||||||
|
|
||||||
|
:ref:`Back to Quick Quiz #2 <rcubarrier_quiz_2>`
|
@@ -225,18 +225,13 @@ an estimate of the total number of RCU callbacks queued across all CPUs
|
|||||||
In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed
|
In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed
|
||||||
for each CPU:
|
for each CPU:
|
||||||
|
|
||||||
0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 Nonlazy posted: ..D
|
0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 dyntick_enabled: 1
|
||||||
|
|
||||||
The "last_accelerate:" prints the low-order 16 bits (in hex) of the
|
The "last_accelerate:" prints the low-order 16 bits (in hex) of the
|
||||||
jiffies counter when this CPU last invoked rcu_try_advance_all_cbs()
|
jiffies counter when this CPU last invoked rcu_try_advance_all_cbs()
|
||||||
from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from
|
from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from
|
||||||
rcu_prepare_for_idle(). The "Nonlazy posted:" indicates lazy-callback
|
rcu_prepare_for_idle(). "dyntick_enabled: 1" indicates that dyntick-idle
|
||||||
status, so that an "l" indicates that all callbacks were lazy at the start
|
processing is enabled.
|
||||||
of the last idle period and an "L" indicates that there are currently
|
|
||||||
no non-lazy callbacks (in both cases, "." is printed otherwise, as
|
|
||||||
shown above) and "D" indicates that dyntick-idle processing is enabled
|
|
||||||
("." is printed otherwise, for example, if disabled via the "nohz="
|
|
||||||
kernel boot parameter).
|
|
||||||
|
|
||||||
If the grace period ends just as the stall warning starts printing,
|
If the grace period ends just as the stall warning starts printing,
|
||||||
there will be a spurious stall-warning message, which will include
|
there will be a spurious stall-warning message, which will include
|
||||||
|
@@ -1,15 +1,18 @@
|
|||||||
|
.. _whatisrcu_doc:
|
||||||
|
|
||||||
What is RCU? -- "Read, Copy, Update"
|
What is RCU? -- "Read, Copy, Update"
|
||||||
|
======================================
|
||||||
|
|
||||||
Please note that the "What is RCU?" LWN series is an excellent place
|
Please note that the "What is RCU?" LWN series is an excellent place
|
||||||
to start learning about RCU:
|
to start learning about RCU:
|
||||||
|
|
||||||
1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/
|
| 1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/
|
||||||
2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/
|
| 2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/
|
||||||
3. RCU part 3: the RCU API http://lwn.net/Articles/264090/
|
| 3. RCU part 3: the RCU API http://lwn.net/Articles/264090/
|
||||||
4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/
|
| 4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/
|
||||||
2010 Big API Table http://lwn.net/Articles/419086/
|
| 2010 Big API Table http://lwn.net/Articles/419086/
|
||||||
5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/
|
| 5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/
|
||||||
2014 Big API Table http://lwn.net/Articles/609973/
|
| 2014 Big API Table http://lwn.net/Articles/609973/
|
||||||
|
|
||||||
|
|
||||||
What is RCU?
|
What is RCU?
|
||||||
@@ -24,14 +27,21 @@ the experience has been that different people must take different paths
|
|||||||
to arrive at an understanding of RCU. This document provides several
|
to arrive at an understanding of RCU. This document provides several
|
||||||
different paths, as follows:
|
different paths, as follows:
|
||||||
|
|
||||||
1. RCU OVERVIEW
|
:ref:`1. RCU OVERVIEW <1_whatisRCU>`
|
||||||
2. WHAT IS RCU'S CORE API?
|
|
||||||
3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API?
|
:ref:`2. WHAT IS RCU'S CORE API? <2_whatisRCU>`
|
||||||
4. WHAT IF MY UPDATING THREAD CANNOT BLOCK?
|
|
||||||
5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU?
|
:ref:`3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API? <3_whatisRCU>`
|
||||||
6. ANALOGY WITH READER-WRITER LOCKING
|
|
||||||
7. FULL LIST OF RCU APIs
|
:ref:`4. WHAT IF MY UPDATING THREAD CANNOT BLOCK? <4_whatisRCU>`
|
||||||
8. ANSWERS TO QUICK QUIZZES
|
|
||||||
|
:ref:`5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU? <5_whatisRCU>`
|
||||||
|
|
||||||
|
:ref:`6. ANALOGY WITH READER-WRITER LOCKING <6_whatisRCU>`
|
||||||
|
|
||||||
|
:ref:`7. FULL LIST OF RCU APIs <7_whatisRCU>`
|
||||||
|
|
||||||
|
:ref:`8. ANSWERS TO QUICK QUIZZES <8_whatisRCU>`
|
||||||
|
|
||||||
People who prefer starting with a conceptual overview should focus on
|
People who prefer starting with a conceptual overview should focus on
|
||||||
Section 1, though most readers will profit by reading this section at
|
Section 1, though most readers will profit by reading this section at
|
||||||
@@ -49,8 +59,10 @@ everything, feel free to read the whole thing -- but if you are really
|
|||||||
that type of person, you have perused the source code and will therefore
|
that type of person, you have perused the source code and will therefore
|
||||||
never need this document anyway. ;-)
|
never need this document anyway. ;-)
|
||||||
|
|
||||||
|
.. _1_whatisRCU:
|
||||||
|
|
||||||
1. RCU OVERVIEW
|
1. RCU OVERVIEW
|
||||||
|
----------------
|
||||||
|
|
||||||
The basic idea behind RCU is to split updates into "removal" and
|
The basic idea behind RCU is to split updates into "removal" and
|
||||||
"reclamation" phases. The removal phase removes references to data items
|
"reclamation" phases. The removal phase removes references to data items
|
||||||
@@ -116,8 +128,10 @@ So how the heck can a reclaimer tell when a reader is done, given
|
|||||||
that readers are not doing any sort of synchronization operations???
|
that readers are not doing any sort of synchronization operations???
|
||||||
Read on to learn about how RCU's API makes this easy.
|
Read on to learn about how RCU's API makes this easy.
|
||||||
|
|
||||||
|
.. _2_whatisRCU:
|
||||||
|
|
||||||
2. WHAT IS RCU'S CORE API?
|
2. WHAT IS RCU'S CORE API?
|
||||||
|
---------------------------
|
||||||
|
|
||||||
The core RCU API is quite small:
|
The core RCU API is quite small:
|
||||||
|
|
||||||
@@ -136,7 +150,7 @@ later. See the kernel docbook documentation for more info, or look directly
|
|||||||
at the function header comments.
|
at the function header comments.
|
||||||
|
|
||||||
rcu_read_lock()
|
rcu_read_lock()
|
||||||
|
^^^^^^^^^^^^^^^
|
||||||
void rcu_read_lock(void);
|
void rcu_read_lock(void);
|
||||||
|
|
||||||
Used by a reader to inform the reclaimer that the reader is
|
Used by a reader to inform the reclaimer that the reader is
|
||||||
@@ -150,7 +164,7 @@ rcu_read_lock()
|
|||||||
longer-term references to data structures.
|
longer-term references to data structures.
|
||||||
|
|
||||||
rcu_read_unlock()
|
rcu_read_unlock()
|
||||||
|
^^^^^^^^^^^^^^^^^
|
||||||
void rcu_read_unlock(void);
|
void rcu_read_unlock(void);
|
||||||
|
|
||||||
Used by a reader to inform the reclaimer that the reader is
|
Used by a reader to inform the reclaimer that the reader is
|
||||||
@@ -158,15 +172,15 @@ rcu_read_unlock()
|
|||||||
read-side critical sections may be nested and/or overlapping.
|
read-side critical sections may be nested and/or overlapping.
|
||||||
|
|
||||||
synchronize_rcu()
|
synchronize_rcu()
|
||||||
|
^^^^^^^^^^^^^^^^^
|
||||||
void synchronize_rcu(void);
|
void synchronize_rcu(void);
|
||||||
|
|
||||||
Marks the end of updater code and the beginning of reclaimer
|
Marks the end of updater code and the beginning of reclaimer
|
||||||
code. It does this by blocking until all pre-existing RCU
|
code. It does this by blocking until all pre-existing RCU
|
||||||
read-side critical sections on all CPUs have completed.
|
read-side critical sections on all CPUs have completed.
|
||||||
Note that synchronize_rcu() will -not- necessarily wait for
|
Note that synchronize_rcu() will **not** necessarily wait for
|
||||||
any subsequent RCU read-side critical sections to complete.
|
any subsequent RCU read-side critical sections to complete.
|
||||||
For example, consider the following sequence of events:
|
For example, consider the following sequence of events::
|
||||||
|
|
||||||
CPU 0 CPU 1 CPU 2
|
CPU 0 CPU 1 CPU 2
|
||||||
----------------- ------------------------- ---------------
|
----------------- ------------------------- ---------------
|
||||||
@@ -182,7 +196,7 @@ synchronize_rcu()
|
|||||||
any that begin after synchronize_rcu() is invoked.
|
any that begin after synchronize_rcu() is invoked.
|
||||||
|
|
||||||
Of course, synchronize_rcu() does not necessarily return
|
Of course, synchronize_rcu() does not necessarily return
|
||||||
-immediately- after the last pre-existing RCU read-side critical
|
**immediately** after the last pre-existing RCU read-side critical
|
||||||
section completes. For one thing, there might well be scheduling
|
section completes. For one thing, there might well be scheduling
|
||||||
delays. For another thing, many RCU implementations process
|
delays. For another thing, many RCU implementations process
|
||||||
requests in batches in order to improve efficiencies, which can
|
requests in batches in order to improve efficiencies, which can
|
||||||
@@ -211,10 +225,10 @@ synchronize_rcu()
|
|||||||
checklist.txt for some approaches to limiting the update rate.
|
checklist.txt for some approaches to limiting the update rate.
|
||||||
|
|
||||||
rcu_assign_pointer()
|
rcu_assign_pointer()
|
||||||
|
^^^^^^^^^^^^^^^^^^^^
|
||||||
void rcu_assign_pointer(p, typeof(p) v);
|
void rcu_assign_pointer(p, typeof(p) v);
|
||||||
|
|
||||||
Yes, rcu_assign_pointer() -is- implemented as a macro, though it
|
Yes, rcu_assign_pointer() **is** implemented as a macro, though it
|
||||||
would be cool to be able to declare a function in this manner.
|
would be cool to be able to declare a function in this manner.
|
||||||
(Compiler experts will no doubt disagree.)
|
(Compiler experts will no doubt disagree.)
|
||||||
|
|
||||||
@@ -231,7 +245,7 @@ rcu_assign_pointer()
|
|||||||
the _rcu list-manipulation primitives such as list_add_rcu().
|
the _rcu list-manipulation primitives such as list_add_rcu().
|
||||||
|
|
||||||
rcu_dereference()
|
rcu_dereference()
|
||||||
|
^^^^^^^^^^^^^^^^^
|
||||||
typeof(p) rcu_dereference(p);
|
typeof(p) rcu_dereference(p);
|
||||||
|
|
||||||
Like rcu_assign_pointer(), rcu_dereference() must be implemented
|
Like rcu_assign_pointer(), rcu_dereference() must be implemented
|
||||||
@@ -248,13 +262,13 @@ rcu_dereference()
|
|||||||
|
|
||||||
Common coding practice uses rcu_dereference() to copy an
|
Common coding practice uses rcu_dereference() to copy an
|
||||||
RCU-protected pointer to a local variable, then dereferences
|
RCU-protected pointer to a local variable, then dereferences
|
||||||
this local variable, for example as follows:
|
this local variable, for example as follows::
|
||||||
|
|
||||||
p = rcu_dereference(head.next);
|
p = rcu_dereference(head.next);
|
||||||
return p->data;
|
return p->data;
|
||||||
|
|
||||||
However, in this case, one could just as easily combine these
|
However, in this case, one could just as easily combine these
|
||||||
into one statement:
|
into one statement::
|
||||||
|
|
||||||
return rcu_dereference(head.next)->data;
|
return rcu_dereference(head.next)->data;
|
||||||
|
|
||||||
@@ -266,8 +280,8 @@ rcu_dereference()
|
|||||||
unnecessary overhead on Alpha CPUs.
|
unnecessary overhead on Alpha CPUs.
|
||||||
|
|
||||||
Note that the value returned by rcu_dereference() is valid
|
Note that the value returned by rcu_dereference() is valid
|
||||||
only within the enclosing RCU read-side critical section [1].
|
only within the enclosing RCU read-side critical section [1]_.
|
||||||
For example, the following is -not- legal:
|
For example, the following is **not** legal::
|
||||||
|
|
||||||
rcu_read_lock();
|
rcu_read_lock();
|
||||||
p = rcu_dereference(head.next);
|
p = rcu_dereference(head.next);
|
||||||
@@ -290,9 +304,9 @@ rcu_dereference()
|
|||||||
at any time, including immediately after the rcu_dereference().
|
at any time, including immediately after the rcu_dereference().
|
||||||
And, again like rcu_assign_pointer(), rcu_dereference() is
|
And, again like rcu_assign_pointer(), rcu_dereference() is
|
||||||
typically used indirectly, via the _rcu list-manipulation
|
typically used indirectly, via the _rcu list-manipulation
|
||||||
primitives, such as list_for_each_entry_rcu() [2].
|
primitives, such as list_for_each_entry_rcu() [2]_.
|
||||||
|
|
||||||
[1] The variant rcu_dereference_protected() can be used outside
|
.. [1] The variant rcu_dereference_protected() can be used outside
|
||||||
of an RCU read-side critical section as long as the usage is
|
of an RCU read-side critical section as long as the usage is
|
||||||
protected by locks acquired by the update-side code. This variant
|
protected by locks acquired by the update-side code. This variant
|
||||||
avoids the lockdep warning that would happen when using (for
|
avoids the lockdep warning that would happen when using (for
|
||||||
@@ -305,7 +319,7 @@ rcu_dereference()
|
|||||||
a lockdep splat is emitted. See Documentation/RCU/Design/Requirements/Requirements.rst
|
a lockdep splat is emitted. See Documentation/RCU/Design/Requirements/Requirements.rst
|
||||||
and the API's code comments for more details and example usage.
|
and the API's code comments for more details and example usage.
|
||||||
|
|
||||||
[2] If the list_for_each_entry_rcu() instance might be used by
|
.. [2] If the list_for_each_entry_rcu() instance might be used by
|
||||||
update-side code as well as by RCU readers, then an additional
|
update-side code as well as by RCU readers, then an additional
|
||||||
lockdep expression can be added to its list of arguments.
|
lockdep expression can be added to its list of arguments.
|
||||||
For example, given an additional "lock_is_held(&mylock)" argument,
|
For example, given an additional "lock_is_held(&mylock)" argument,
|
||||||
@@ -315,6 +329,7 @@ rcu_dereference()
|
|||||||
|
|
||||||
The following diagram shows how each API communicates among the
|
The following diagram shows how each API communicates among the
|
||||||
reader, updater, and reclaimer.
|
reader, updater, and reclaimer.
|
||||||
|
::
|
||||||
|
|
||||||
|
|
||||||
rcu_assign_pointer()
|
rcu_assign_pointer()
|
||||||
@@ -375,12 +390,16 @@ c. RCU applied to scheduler and interrupt/NMI-handler tasks.
|
|||||||
Again, most uses will be of (a). The (b) and (c) cases are important
|
Again, most uses will be of (a). The (b) and (c) cases are important
|
||||||
for specialized uses, but are relatively uncommon.
|
for specialized uses, but are relatively uncommon.
|
||||||
|
|
||||||
|
.. _3_whatisRCU:
|
||||||
|
|
||||||
3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API?
|
3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API?
|
||||||
|
-----------------------------------------------
|
||||||
|
|
||||||
This section shows a simple use of the core RCU API to protect a
|
This section shows a simple use of the core RCU API to protect a
|
||||||
global pointer to a dynamically allocated structure. More-typical
|
global pointer to a dynamically allocated structure. More-typical
|
||||||
uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt.
|
uses of RCU may be found in :ref:`listRCU.rst <list_rcu_doc>`,
|
||||||
|
:ref:`arrayRCU.rst <array_rcu_doc>`, and :ref:`NMI-RCU.rst <NMI_rcu_doc>`.
|
||||||
|
::
|
||||||
|
|
||||||
struct foo {
|
struct foo {
|
||||||
int a;
|
int a;
|
||||||
@@ -440,40 +459,43 @@ uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt.
|
|||||||
|
|
||||||
So, to sum up:
|
So, to sum up:
|
||||||
|
|
||||||
o Use rcu_read_lock() and rcu_read_unlock() to guard RCU
|
- Use rcu_read_lock() and rcu_read_unlock() to guard RCU
|
||||||
read-side critical sections.
|
read-side critical sections.
|
||||||
|
|
||||||
o Within an RCU read-side critical section, use rcu_dereference()
|
- Within an RCU read-side critical section, use rcu_dereference()
|
||||||
to dereference RCU-protected pointers.
|
to dereference RCU-protected pointers.
|
||||||
|
|
||||||
o Use some solid scheme (such as locks or semaphores) to
|
- Use some solid scheme (such as locks or semaphores) to
|
||||||
keep concurrent updates from interfering with each other.
|
keep concurrent updates from interfering with each other.
|
||||||
|
|
||||||
o Use rcu_assign_pointer() to update an RCU-protected pointer.
|
- Use rcu_assign_pointer() to update an RCU-protected pointer.
|
||||||
This primitive protects concurrent readers from the updater,
|
This primitive protects concurrent readers from the updater,
|
||||||
-not- concurrent updates from each other! You therefore still
|
**not** concurrent updates from each other! You therefore still
|
||||||
need to use locking (or something similar) to keep concurrent
|
need to use locking (or something similar) to keep concurrent
|
||||||
rcu_assign_pointer() primitives from interfering with each other.
|
rcu_assign_pointer() primitives from interfering with each other.
|
||||||
|
|
||||||
o Use synchronize_rcu() -after- removing a data element from an
|
- Use synchronize_rcu() **after** removing a data element from an
|
||||||
RCU-protected data structure, but -before- reclaiming/freeing
|
RCU-protected data structure, but **before** reclaiming/freeing
|
||||||
the data element, in order to wait for the completion of all
|
the data element, in order to wait for the completion of all
|
||||||
RCU read-side critical sections that might be referencing that
|
RCU read-side critical sections that might be referencing that
|
||||||
data item.
|
data item.
|
||||||
|
|
||||||
See checklist.txt for additional rules to follow when using RCU.
|
See checklist.txt for additional rules to follow when using RCU.
|
||||||
And again, more-typical uses of RCU may be found in listRCU.txt,
|
And again, more-typical uses of RCU may be found in :ref:`listRCU.rst
|
||||||
arrayRCU.txt, and NMI-RCU.txt.
|
<list_rcu_doc>`, :ref:`arrayRCU.rst <array_rcu_doc>`, and :ref:`NMI-RCU.rst
|
||||||
|
<NMI_rcu_doc>`.
|
||||||
|
|
||||||
|
.. _4_whatisRCU:
|
||||||
|
|
||||||
4. WHAT IF MY UPDATING THREAD CANNOT BLOCK?
|
4. WHAT IF MY UPDATING THREAD CANNOT BLOCK?
|
||||||
|
--------------------------------------------
|
||||||
|
|
||||||
In the example above, foo_update_a() blocks until a grace period elapses.
|
In the example above, foo_update_a() blocks until a grace period elapses.
|
||||||
This is quite simple, but in some cases one cannot afford to wait so
|
This is quite simple, but in some cases one cannot afford to wait so
|
||||||
long -- there might be other high-priority work to be done.
|
long -- there might be other high-priority work to be done.
|
||||||
|
|
||||||
In such cases, one uses call_rcu() rather than synchronize_rcu().
|
In such cases, one uses call_rcu() rather than synchronize_rcu().
|
||||||
The call_rcu() API is as follows:
|
The call_rcu() API is as follows::
|
||||||
|
|
||||||
void call_rcu(struct rcu_head * head,
|
void call_rcu(struct rcu_head * head,
|
||||||
void (*func)(struct rcu_head *head));
|
void (*func)(struct rcu_head *head));
|
||||||
@@ -481,7 +503,7 @@ The call_rcu() API is as follows:
|
|||||||
This function invokes func(head) after a grace period has elapsed.
|
This function invokes func(head) after a grace period has elapsed.
|
||||||
This invocation might happen from either softirq or process context,
|
This invocation might happen from either softirq or process context,
|
||||||
so the function is not permitted to block. The foo struct needs to
|
so the function is not permitted to block. The foo struct needs to
|
||||||
have an rcu_head structure added, perhaps as follows:
|
have an rcu_head structure added, perhaps as follows::
|
||||||
|
|
||||||
struct foo {
|
struct foo {
|
||||||
int a;
|
int a;
|
||||||
@@ -490,7 +512,7 @@ have an rcu_head structure added, perhaps as follows:
|
|||||||
struct rcu_head rcu;
|
struct rcu_head rcu;
|
||||||
};
|
};
|
||||||
|
|
||||||
The foo_update_a() function might then be written as follows:
|
The foo_update_a() function might then be written as follows::
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Create a new struct foo that is the same as the one currently
|
* Create a new struct foo that is the same as the one currently
|
||||||
@@ -520,7 +542,7 @@ The foo_update_a() function might then be written as follows:
|
|||||||
call_rcu(&old_fp->rcu, foo_reclaim);
|
call_rcu(&old_fp->rcu, foo_reclaim);
|
||||||
}
|
}
|
||||||
|
|
||||||
The foo_reclaim() function might appear as follows:
|
The foo_reclaim() function might appear as follows::
|
||||||
|
|
||||||
void foo_reclaim(struct rcu_head *rp)
|
void foo_reclaim(struct rcu_head *rp)
|
||||||
{
|
{
|
||||||
@@ -544,7 +566,7 @@ namely foo_reclaim().
|
|||||||
The summary of advice is the same as for the previous section, except
|
The summary of advice is the same as for the previous section, except
|
||||||
that we are now using call_rcu() rather than synchronize_rcu():
|
that we are now using call_rcu() rather than synchronize_rcu():
|
||||||
|
|
||||||
o Use call_rcu() -after- removing a data element from an
|
- Use call_rcu() **after** removing a data element from an
|
||||||
RCU-protected data structure in order to register a callback
|
RCU-protected data structure in order to register a callback
|
||||||
function that will be invoked after the completion of all RCU
|
function that will be invoked after the completion of all RCU
|
||||||
read-side critical sections that might be referencing that
|
read-side critical sections that might be referencing that
|
||||||
@@ -552,14 +574,16 @@ o Use call_rcu() -after- removing a data element from an
|
|||||||
|
|
||||||
If the callback for call_rcu() is not doing anything more than calling
|
If the callback for call_rcu() is not doing anything more than calling
|
||||||
kfree() on the structure, you can use kfree_rcu() instead of call_rcu()
|
kfree() on the structure, you can use kfree_rcu() instead of call_rcu()
|
||||||
to avoid having to write your own callback:
|
to avoid having to write your own callback::
|
||||||
|
|
||||||
kfree_rcu(old_fp, rcu);
|
kfree_rcu(old_fp, rcu);
|
||||||
|
|
||||||
Again, see checklist.txt for additional rules governing the use of RCU.
|
Again, see checklist.txt for additional rules governing the use of RCU.
|
||||||
|
|
||||||
|
.. _5_whatisRCU:
|
||||||
|
|
||||||
5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU?
|
5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU?
|
||||||
|
------------------------------------------------
|
||||||
|
|
||||||
One of the nice things about RCU is that it has extremely simple "toy"
|
One of the nice things about RCU is that it has extremely simple "toy"
|
||||||
implementations that are a good first step towards understanding the
|
implementations that are a good first step towards understanding the
|
||||||
@@ -579,7 +603,7 @@ more details on the current implementation as of early 2004.
|
|||||||
|
|
||||||
|
|
||||||
5A. "TOY" IMPLEMENTATION #1: LOCKING
|
5A. "TOY" IMPLEMENTATION #1: LOCKING
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
This section presents a "toy" RCU implementation that is based on
|
This section presents a "toy" RCU implementation that is based on
|
||||||
familiar locking primitives. Its overhead makes it a non-starter for
|
familiar locking primitives. Its overhead makes it a non-starter for
|
||||||
real-life use, as does its lack of scalability. It is also unsuitable
|
real-life use, as does its lack of scalability. It is also unsuitable
|
||||||
@@ -591,7 +615,7 @@ you allow nested rcu_read_lock() calls, you can deadlock.
|
|||||||
However, it is probably the easiest implementation to relate to, so is
|
However, it is probably the easiest implementation to relate to, so is
|
||||||
a good starting point.
|
a good starting point.
|
||||||
|
|
||||||
It is extremely simple:
|
It is extremely simple::
|
||||||
|
|
||||||
static DEFINE_RWLOCK(rcu_gp_mutex);
|
static DEFINE_RWLOCK(rcu_gp_mutex);
|
||||||
|
|
||||||
@@ -614,7 +638,7 @@ It is extremely simple:
|
|||||||
|
|
||||||
[You can ignore rcu_assign_pointer() and rcu_dereference() without missing
|
[You can ignore rcu_assign_pointer() and rcu_dereference() without missing
|
||||||
much. But here are simplified versions anyway. And whatever you do,
|
much. But here are simplified versions anyway. And whatever you do,
|
||||||
don't forget about them when submitting patches making use of RCU!]
|
don't forget about them when submitting patches making use of RCU!]::
|
||||||
|
|
||||||
#define rcu_assign_pointer(p, v) \
|
#define rcu_assign_pointer(p, v) \
|
||||||
({ \
|
({ \
|
||||||
@@ -647,18 +671,23 @@ that the only thing that can block rcu_read_lock() is a synchronize_rcu().
|
|||||||
But synchronize_rcu() does not acquire any locks while holding rcu_gp_mutex,
|
But synchronize_rcu() does not acquire any locks while holding rcu_gp_mutex,
|
||||||
so there can be no deadlock cycle.
|
so there can be no deadlock cycle.
|
||||||
|
|
||||||
Quick Quiz #1: Why is this argument naive? How could a deadlock
|
.. _quiz_1:
|
||||||
|
|
||||||
|
Quick Quiz #1:
|
||||||
|
Why is this argument naive? How could a deadlock
|
||||||
occur when using this algorithm in a real-world Linux
|
occur when using this algorithm in a real-world Linux
|
||||||
kernel? How could this deadlock be avoided?
|
kernel? How could this deadlock be avoided?
|
||||||
|
|
||||||
|
:ref:`Answers to Quick Quiz <8_whatisRCU>`
|
||||||
|
|
||||||
5B. "TOY" EXAMPLE #2: CLASSIC RCU
|
5B. "TOY" EXAMPLE #2: CLASSIC RCU
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
This section presents a "toy" RCU implementation that is based on
|
This section presents a "toy" RCU implementation that is based on
|
||||||
"classic RCU". It is also short on performance (but only for updates) and
|
"classic RCU". It is also short on performance (but only for updates) and
|
||||||
on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT
|
on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT
|
||||||
kernels. The definitions of rcu_dereference() and rcu_assign_pointer()
|
kernels. The definitions of rcu_dereference() and rcu_assign_pointer()
|
||||||
are the same as those shown in the preceding section, so they are omitted.
|
are the same as those shown in the preceding section, so they are omitted.
|
||||||
|
::
|
||||||
|
|
||||||
void rcu_read_lock(void) { }
|
void rcu_read_lock(void) { }
|
||||||
|
|
||||||
@@ -683,14 +712,14 @@ CPU in turn. The run_on() primitive can be implemented straightforwardly
|
|||||||
in terms of the sched_setaffinity() primitive. Of course, a somewhat less
|
in terms of the sched_setaffinity() primitive. Of course, a somewhat less
|
||||||
"toy" implementation would restore the affinity upon completion rather
|
"toy" implementation would restore the affinity upon completion rather
|
||||||
than just leaving all tasks running on the last CPU, but when I said
|
than just leaving all tasks running on the last CPU, but when I said
|
||||||
"toy", I meant -toy-!
|
"toy", I meant **toy**!
|
||||||
|
|
||||||
So how the heck is this supposed to work???
|
So how the heck is this supposed to work???
|
||||||
|
|
||||||
Remember that it is illegal to block while in an RCU read-side critical
|
Remember that it is illegal to block while in an RCU read-side critical
|
||||||
section. Therefore, if a given CPU executes a context switch, we know
|
section. Therefore, if a given CPU executes a context switch, we know
|
||||||
that it must have completed all preceding RCU read-side critical sections.
|
that it must have completed all preceding RCU read-side critical sections.
|
||||||
Once -all- CPUs have executed a context switch, then -all- preceding
|
Once **all** CPUs have executed a context switch, then **all** preceding
|
||||||
RCU read-side critical sections will have completed.
|
RCU read-side critical sections will have completed.
|
||||||
|
|
||||||
So, suppose that we remove a data item from its structure and then invoke
|
So, suppose that we remove a data item from its structure and then invoke
|
||||||
@@ -698,19 +727,32 @@ synchronize_rcu(). Once synchronize_rcu() returns, we are guaranteed
|
|||||||
that there are no RCU read-side critical sections holding a reference
|
that there are no RCU read-side critical sections holding a reference
|
||||||
to that data item, so we can safely reclaim it.
|
to that data item, so we can safely reclaim it.
|
||||||
|
|
||||||
Quick Quiz #2: Give an example where Classic RCU's read-side
|
.. _quiz_2:
|
||||||
overhead is -negative-.
|
|
||||||
|
|
||||||
Quick Quiz #3: If it is illegal to block in an RCU read-side
|
Quick Quiz #2:
|
||||||
|
Give an example where Classic RCU's read-side
|
||||||
|
overhead is **negative**.
|
||||||
|
|
||||||
|
:ref:`Answers to Quick Quiz <8_whatisRCU>`
|
||||||
|
|
||||||
|
.. _quiz_3:
|
||||||
|
|
||||||
|
Quick Quiz #3:
|
||||||
|
If it is illegal to block in an RCU read-side
|
||||||
critical section, what the heck do you do in
|
critical section, what the heck do you do in
|
||||||
PREEMPT_RT, where normal spinlocks can block???
|
PREEMPT_RT, where normal spinlocks can block???
|
||||||
|
|
||||||
|
:ref:`Answers to Quick Quiz <8_whatisRCU>`
|
||||||
|
|
||||||
|
.. _6_whatisRCU:
|
||||||
|
|
||||||
6. ANALOGY WITH READER-WRITER LOCKING
|
6. ANALOGY WITH READER-WRITER LOCKING
|
||||||
|
--------------------------------------
|
||||||
|
|
||||||
Although RCU can be used in many different ways, a very common use of
|
Although RCU can be used in many different ways, a very common use of
|
||||||
RCU is analogous to reader-writer locking. The following unified
|
RCU is analogous to reader-writer locking. The following unified
|
||||||
diff shows how closely related RCU and reader-writer locking can be.
|
diff shows how closely related RCU and reader-writer locking can be.
|
||||||
|
::
|
||||||
|
|
||||||
@@ -5,5 +5,5 @@ struct el {
|
@@ -5,5 +5,5 @@ struct el {
|
||||||
int data;
|
int data;
|
||||||
@@ -762,7 +804,7 @@ diff shows how closely related RCU and reader-writer locking can be.
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
Or, for those who prefer a side-by-side listing:
|
Or, for those who prefer a side-by-side listing::
|
||||||
|
|
||||||
1 struct el { 1 struct el {
|
1 struct el { 1 struct el {
|
||||||
2 struct list_head list; 2 struct list_head list;
|
2 struct list_head list; 2 struct list_head list;
|
||||||
@@ -774,40 +816,44 @@ Or, for those who prefer a side-by-side listing:
|
|||||||
8 rwlock_t listmutex; 8 spinlock_t listmutex;
|
8 rwlock_t listmutex; 8 spinlock_t listmutex;
|
||||||
9 struct el head; 9 struct el head;
|
9 struct el head; 9 struct el head;
|
||||||
|
|
||||||
1 int search(long key, int *result) 1 int search(long key, int *result)
|
::
|
||||||
2 { 2 {
|
|
||||||
3 struct list_head *lp; 3 struct list_head *lp;
|
|
||||||
4 struct el *p; 4 struct el *p;
|
|
||||||
5 5
|
|
||||||
6 read_lock(&listmutex); 6 rcu_read_lock();
|
|
||||||
7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) {
|
|
||||||
8 if (p->key == key) { 8 if (p->key == key) {
|
|
||||||
9 *result = p->data; 9 *result = p->data;
|
|
||||||
10 read_unlock(&listmutex); 10 rcu_read_unlock();
|
|
||||||
11 return 1; 11 return 1;
|
|
||||||
12 } 12 }
|
|
||||||
13 } 13 }
|
|
||||||
14 read_unlock(&listmutex); 14 rcu_read_unlock();
|
|
||||||
15 return 0; 15 return 0;
|
|
||||||
16 } 16 }
|
|
||||||
|
|
||||||
1 int delete(long key) 1 int delete(long key)
|
1 int search(long key, int *result) 1 int search(long key, int *result)
|
||||||
2 { 2 {
|
2 { 2 {
|
||||||
3 struct el *p; 3 struct el *p;
|
3 struct list_head *lp; 3 struct list_head *lp;
|
||||||
4 4
|
4 struct el *p; 4 struct el *p;
|
||||||
5 write_lock(&listmutex); 5 spin_lock(&listmutex);
|
5 5
|
||||||
6 list_for_each_entry(p, head, lp) { 6 list_for_each_entry(p, head, lp) {
|
6 read_lock(&listmutex); 6 rcu_read_lock();
|
||||||
7 if (p->key == key) { 7 if (p->key == key) {
|
7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) {
|
||||||
8 list_del(&p->list); 8 list_del_rcu(&p->list);
|
8 if (p->key == key) { 8 if (p->key == key) {
|
||||||
9 write_unlock(&listmutex); 9 spin_unlock(&listmutex);
|
9 *result = p->data; 9 *result = p->data;
|
||||||
10 synchronize_rcu();
|
10 read_unlock(&listmutex); 10 rcu_read_unlock();
|
||||||
10 kfree(p); 11 kfree(p);
|
11 return 1; 11 return 1;
|
||||||
11 return 1; 12 return 1;
|
12 } 12 }
|
||||||
12 } 13 }
|
13 } 13 }
|
||||||
13 } 14 }
|
14 read_unlock(&listmutex); 14 rcu_read_unlock();
|
||||||
14 write_unlock(&listmutex); 15 spin_unlock(&listmutex);
|
15 return 0; 15 return 0;
|
||||||
15 return 0; 16 return 0;
|
16 } 16 }
|
||||||
16 } 17 }
|
|
||||||
|
::
|
||||||
|
|
||||||
|
1 int delete(long key) 1 int delete(long key)
|
||||||
|
2 { 2 {
|
||||||
|
3 struct el *p; 3 struct el *p;
|
||||||
|
4 4
|
||||||
|
5 write_lock(&listmutex); 5 spin_lock(&listmutex);
|
||||||
|
6 list_for_each_entry(p, head, lp) { 6 list_for_each_entry(p, head, lp) {
|
||||||
|
7 if (p->key == key) { 7 if (p->key == key) {
|
||||||
|
8 list_del(&p->list); 8 list_del_rcu(&p->list);
|
||||||
|
9 write_unlock(&listmutex); 9 spin_unlock(&listmutex);
|
||||||
|
10 synchronize_rcu();
|
||||||
|
10 kfree(p); 11 kfree(p);
|
||||||
|
11 return 1; 12 return 1;
|
||||||
|
12 } 13 }
|
||||||
|
13 } 14 }
|
||||||
|
14 write_unlock(&listmutex); 15 spin_unlock(&listmutex);
|
||||||
|
15 return 0; 16 return 0;
|
||||||
|
16 } 17 }
|
||||||
|
|
||||||
Either way, the differences are quite small. Read-side locking moves
|
Either way, the differences are quite small. Read-side locking moves
|
||||||
to rcu_read_lock() and rcu_read_unlock(), update-side locking moves from
|
to rcu_read_lock() and rcu_read_unlock(), update-side locking moves from
|
||||||
@@ -825,22 +871,27 @@ delete() can now block. If this is a problem, there is a callback-based
|
|||||||
mechanism that never blocks, namely call_rcu() or kfree_rcu(), that can
|
mechanism that never blocks, namely call_rcu() or kfree_rcu(), that can
|
||||||
be used in place of synchronize_rcu().
|
be used in place of synchronize_rcu().
|
||||||
|
|
||||||
|
.. _7_whatisRCU:
|
||||||
|
|
||||||
7. FULL LIST OF RCU APIs
|
7. FULL LIST OF RCU APIs
|
||||||
|
-------------------------
|
||||||
|
|
||||||
The RCU APIs are documented in docbook-format header comments in the
|
The RCU APIs are documented in docbook-format header comments in the
|
||||||
Linux-kernel source code, but it helps to have a full list of the
|
Linux-kernel source code, but it helps to have a full list of the
|
||||||
APIs, since there does not appear to be a way to categorize them
|
APIs, since there does not appear to be a way to categorize them
|
||||||
in docbook. Here is the list, by category.
|
in docbook. Here is the list, by category.
|
||||||
|
|
||||||
RCU list traversal:
|
RCU list traversal::
|
||||||
|
|
||||||
list_entry_rcu
|
list_entry_rcu
|
||||||
|
list_entry_lockless
|
||||||
list_first_entry_rcu
|
list_first_entry_rcu
|
||||||
list_next_rcu
|
list_next_rcu
|
||||||
list_for_each_entry_rcu
|
list_for_each_entry_rcu
|
||||||
list_for_each_entry_continue_rcu
|
list_for_each_entry_continue_rcu
|
||||||
list_for_each_entry_from_rcu
|
list_for_each_entry_from_rcu
|
||||||
|
list_first_or_null_rcu
|
||||||
|
list_next_or_null_rcu
|
||||||
hlist_first_rcu
|
hlist_first_rcu
|
||||||
hlist_next_rcu
|
hlist_next_rcu
|
||||||
hlist_pprev_rcu
|
hlist_pprev_rcu
|
||||||
@@ -854,7 +905,7 @@ RCU list traversal:
|
|||||||
hlist_bl_first_rcu
|
hlist_bl_first_rcu
|
||||||
hlist_bl_for_each_entry_rcu
|
hlist_bl_for_each_entry_rcu
|
||||||
|
|
||||||
RCU pointer/list update:
|
RCU pointer/list update::
|
||||||
|
|
||||||
rcu_assign_pointer
|
rcu_assign_pointer
|
||||||
list_add_rcu
|
list_add_rcu
|
||||||
@@ -864,10 +915,12 @@ RCU pointer/list update:
|
|||||||
hlist_add_behind_rcu
|
hlist_add_behind_rcu
|
||||||
hlist_add_before_rcu
|
hlist_add_before_rcu
|
||||||
hlist_add_head_rcu
|
hlist_add_head_rcu
|
||||||
|
hlist_add_tail_rcu
|
||||||
hlist_del_rcu
|
hlist_del_rcu
|
||||||
hlist_del_init_rcu
|
hlist_del_init_rcu
|
||||||
hlist_replace_rcu
|
hlist_replace_rcu
|
||||||
list_splice_init_rcu()
|
list_splice_init_rcu
|
||||||
|
list_splice_tail_init_rcu
|
||||||
hlist_nulls_del_init_rcu
|
hlist_nulls_del_init_rcu
|
||||||
hlist_nulls_del_rcu
|
hlist_nulls_del_rcu
|
||||||
hlist_nulls_add_head_rcu
|
hlist_nulls_add_head_rcu
|
||||||
@@ -876,7 +929,9 @@ RCU pointer/list update:
|
|||||||
hlist_bl_del_rcu
|
hlist_bl_del_rcu
|
||||||
hlist_bl_set_first_rcu
|
hlist_bl_set_first_rcu
|
||||||
|
|
||||||
RCU: Critical sections Grace period Barrier
|
RCU::
|
||||||
|
|
||||||
|
Critical sections Grace period Barrier
|
||||||
|
|
||||||
rcu_read_lock synchronize_net rcu_barrier
|
rcu_read_lock synchronize_net rcu_barrier
|
||||||
rcu_read_unlock synchronize_rcu
|
rcu_read_unlock synchronize_rcu
|
||||||
@@ -885,7 +940,9 @@ RCU: Critical sections Grace period Barrier
|
|||||||
rcu_dereference_check kfree_rcu
|
rcu_dereference_check kfree_rcu
|
||||||
rcu_dereference_protected
|
rcu_dereference_protected
|
||||||
|
|
||||||
bh: Critical sections Grace period Barrier
|
bh::
|
||||||
|
|
||||||
|
Critical sections Grace period Barrier
|
||||||
|
|
||||||
rcu_read_lock_bh call_rcu rcu_barrier
|
rcu_read_lock_bh call_rcu rcu_barrier
|
||||||
rcu_read_unlock_bh synchronize_rcu
|
rcu_read_unlock_bh synchronize_rcu
|
||||||
@@ -896,7 +953,9 @@ bh: Critical sections Grace period Barrier
|
|||||||
rcu_dereference_bh_protected
|
rcu_dereference_bh_protected
|
||||||
rcu_read_lock_bh_held
|
rcu_read_lock_bh_held
|
||||||
|
|
||||||
sched: Critical sections Grace period Barrier
|
sched::
|
||||||
|
|
||||||
|
Critical sections Grace period Barrier
|
||||||
|
|
||||||
rcu_read_lock_sched call_rcu rcu_barrier
|
rcu_read_lock_sched call_rcu rcu_barrier
|
||||||
rcu_read_unlock_sched synchronize_rcu
|
rcu_read_unlock_sched synchronize_rcu
|
||||||
@@ -910,7 +969,9 @@ sched: Critical sections Grace period Barrier
|
|||||||
rcu_read_lock_sched_held
|
rcu_read_lock_sched_held
|
||||||
|
|
||||||
|
|
||||||
SRCU: Critical sections Grace period Barrier
|
SRCU::
|
||||||
|
|
||||||
|
Critical sections Grace period Barrier
|
||||||
|
|
||||||
srcu_read_lock call_srcu srcu_barrier
|
srcu_read_lock call_srcu srcu_barrier
|
||||||
srcu_read_unlock synchronize_srcu
|
srcu_read_unlock synchronize_srcu
|
||||||
@@ -918,13 +979,14 @@ SRCU: Critical sections Grace period Barrier
|
|||||||
srcu_dereference_check
|
srcu_dereference_check
|
||||||
srcu_read_lock_held
|
srcu_read_lock_held
|
||||||
|
|
||||||
SRCU: Initialization/cleanup
|
SRCU: Initialization/cleanup::
|
||||||
|
|
||||||
DEFINE_SRCU
|
DEFINE_SRCU
|
||||||
DEFINE_STATIC_SRCU
|
DEFINE_STATIC_SRCU
|
||||||
init_srcu_struct
|
init_srcu_struct
|
||||||
cleanup_srcu_struct
|
cleanup_srcu_struct
|
||||||
|
|
||||||
All: lockdep-checked RCU-protected pointer access
|
All: lockdep-checked RCU-protected pointer access::
|
||||||
|
|
||||||
rcu_access_pointer
|
rcu_access_pointer
|
||||||
rcu_dereference_raw
|
rcu_dereference_raw
|
||||||
@@ -974,15 +1036,19 @@ g. Otherwise, use RCU.
|
|||||||
Of course, this all assumes that you have determined that RCU is in fact
|
Of course, this all assumes that you have determined that RCU is in fact
|
||||||
the right tool for your job.
|
the right tool for your job.
|
||||||
|
|
||||||
|
.. _8_whatisRCU:
|
||||||
|
|
||||||
8. ANSWERS TO QUICK QUIZZES
|
8. ANSWERS TO QUICK QUIZZES
|
||||||
|
----------------------------
|
||||||
|
|
||||||
Quick Quiz #1: Why is this argument naive? How could a deadlock
|
Quick Quiz #1:
|
||||||
|
Why is this argument naive? How could a deadlock
|
||||||
occur when using this algorithm in a real-world Linux
|
occur when using this algorithm in a real-world Linux
|
||||||
kernel? [Referring to the lock-based "toy" RCU
|
kernel? [Referring to the lock-based "toy" RCU
|
||||||
algorithm.]
|
algorithm.]
|
||||||
|
|
||||||
Answer: Consider the following sequence of events:
|
Answer:
|
||||||
|
Consider the following sequence of events:
|
||||||
|
|
||||||
1. CPU 0 acquires some unrelated lock, call it
|
1. CPU 0 acquires some unrelated lock, call it
|
||||||
"problematic_lock", disabling irq via
|
"problematic_lock", disabling irq via
|
||||||
@@ -1021,10 +1087,14 @@ Answer: Consider the following sequence of events:
|
|||||||
approach where tasks in RCU read-side critical sections
|
approach where tasks in RCU read-side critical sections
|
||||||
cannot be blocked by tasks executing synchronize_rcu().
|
cannot be blocked by tasks executing synchronize_rcu().
|
||||||
|
|
||||||
Quick Quiz #2: Give an example where Classic RCU's read-side
|
:ref:`Back to Quick Quiz #1 <quiz_1>`
|
||||||
overhead is -negative-.
|
|
||||||
|
|
||||||
Answer: Imagine a single-CPU system with a non-CONFIG_PREEMPT
|
Quick Quiz #2:
|
||||||
|
Give an example where Classic RCU's read-side
|
||||||
|
overhead is **negative**.
|
||||||
|
|
||||||
|
Answer:
|
||||||
|
Imagine a single-CPU system with a non-CONFIG_PREEMPT
|
||||||
kernel where a routing table is used by process-context
|
kernel where a routing table is used by process-context
|
||||||
code, but can be updated by irq-context code (for example,
|
code, but can be updated by irq-context code (for example,
|
||||||
by an "ICMP REDIRECT" packet). The usual way of handling
|
by an "ICMP REDIRECT" packet). The usual way of handling
|
||||||
@@ -1046,11 +1116,15 @@ Answer: Imagine a single-CPU system with a non-CONFIG_PREEMPT
|
|||||||
even the theoretical possibility of negative overhead for
|
even the theoretical possibility of negative overhead for
|
||||||
a synchronization primitive is a bit unexpected. ;-)
|
a synchronization primitive is a bit unexpected. ;-)
|
||||||
|
|
||||||
Quick Quiz #3: If it is illegal to block in an RCU read-side
|
:ref:`Back to Quick Quiz #2 <quiz_2>`
|
||||||
|
|
||||||
|
Quick Quiz #3:
|
||||||
|
If it is illegal to block in an RCU read-side
|
||||||
critical section, what the heck do you do in
|
critical section, what the heck do you do in
|
||||||
PREEMPT_RT, where normal spinlocks can block???
|
PREEMPT_RT, where normal spinlocks can block???
|
||||||
|
|
||||||
Answer: Just as PREEMPT_RT permits preemption of spinlock
|
Answer:
|
||||||
|
Just as PREEMPT_RT permits preemption of spinlock
|
||||||
critical sections, it permits preemption of RCU
|
critical sections, it permits preemption of RCU
|
||||||
read-side critical sections. It also permits
|
read-side critical sections. It also permits
|
||||||
spinlocks blocking while in RCU read-side critical
|
spinlocks blocking while in RCU read-side critical
|
||||||
@@ -1069,6 +1143,7 @@ Answer: Just as PREEMPT_RT permits preemption of spinlock
|
|||||||
Besides, how does the computer know what pizza parlor
|
Besides, how does the computer know what pizza parlor
|
||||||
the human being went to???
|
the human being went to???
|
||||||
|
|
||||||
|
:ref:`Back to Quick Quiz #3 <quiz_3>`
|
||||||
|
|
||||||
ACKNOWLEDGEMENTS
|
ACKNOWLEDGEMENTS
|
||||||
|
|
62
Documentation/admin-guide/acpi/fan_performance_states.rst
Normal file
62
Documentation/admin-guide/acpi/fan_performance_states.rst
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
===========================
|
||||||
|
ACPI Fan Performance States
|
||||||
|
===========================
|
||||||
|
|
||||||
|
When the optional _FPS object is present under an ACPI device representing a
|
||||||
|
fan (for example, PNP0C0B or INT3404), the ACPI fan driver creates additional
|
||||||
|
"state*" attributes in the sysfs directory of the ACPI device in question.
|
||||||
|
These attributes list properties of fan performance states.
|
||||||
|
|
||||||
|
For more information on _FPS refer to the ACPI specification at:
|
||||||
|
|
||||||
|
http://uefi.org/specifications
|
||||||
|
|
||||||
|
For instance, the contents of the INT3404 ACPI device sysfs directory
|
||||||
|
may look as follows::
|
||||||
|
|
||||||
|
$ ls -l /sys/bus/acpi/devices/INT3404:00/
|
||||||
|
total 0
|
||||||
|
...
|
||||||
|
-r--r--r-- 1 root root 4096 Dec 13 20:38 state0
|
||||||
|
-r--r--r-- 1 root root 4096 Dec 13 20:38 state1
|
||||||
|
-r--r--r-- 1 root root 4096 Dec 13 20:38 state10
|
||||||
|
-r--r--r-- 1 root root 4096 Dec 13 20:38 state11
|
||||||
|
-r--r--r-- 1 root root 4096 Dec 13 20:38 state2
|
||||||
|
-r--r--r-- 1 root root 4096 Dec 13 20:38 state3
|
||||||
|
-r--r--r-- 1 root root 4096 Dec 13 20:38 state4
|
||||||
|
-r--r--r-- 1 root root 4096 Dec 13 20:38 state5
|
||||||
|
-r--r--r-- 1 root root 4096 Dec 13 20:38 state6
|
||||||
|
-r--r--r-- 1 root root 4096 Dec 13 20:38 state7
|
||||||
|
-r--r--r-- 1 root root 4096 Dec 13 20:38 state8
|
||||||
|
-r--r--r-- 1 root root 4096 Dec 13 20:38 state9
|
||||||
|
-r--r--r-- 1 root root 4096 Dec 13 01:00 status
|
||||||
|
...
|
||||||
|
|
||||||
|
where each of the "state*" files represents one performance state of the fan
|
||||||
|
and contains a colon-separated list of 5 integer numbers (fields) with the
|
||||||
|
following interpretation::
|
||||||
|
|
||||||
|
control_percent:trip_point_index:speed_rpm:noise_level_mdb:power_mw
|
||||||
|
|
||||||
|
* ``control_percent``: The percent value to be used to set the fan speed to a
|
||||||
|
specific level using the _FSL object (0-100).
|
||||||
|
|
||||||
|
* ``trip_point_index``: The active cooling trip point number that corresponds
|
||||||
|
to this performance state (0-9).
|
||||||
|
|
||||||
|
* ``speed_rpm``: Speed of the fan in rotations per minute.
|
||||||
|
|
||||||
|
* ``noise_level_mdb``: Audible noise emitted by the fan in this state in
|
||||||
|
millidecibels.
|
||||||
|
|
||||||
|
* ``power_mw``: Power draw of the fan in this state in milliwatts.
|
||||||
|
|
||||||
|
For example::
|
||||||
|
|
||||||
|
$ cat /sys/bus/acpi/devices/INT3404:00/state1
|
||||||
|
25:0:3200:12500:1250
|
||||||
|
|
||||||
|
When a given field is not populated or its value provided by the platform
|
||||||
|
firmware is invalid, the "not-defined" string is shown instead of the value.
|
@@ -12,3 +12,4 @@ the Linux ACPI support.
|
|||||||
dsdt-override
|
dsdt-override
|
||||||
ssdt-overlays
|
ssdt-overlays
|
||||||
cppc_sysfs
|
cppc_sysfs
|
||||||
|
fan_performance_states
|
||||||
|
@@ -1,15 +1,15 @@
|
|||||||
========================================
|
========================================
|
||||||
zram: Compressed RAM based block devices
|
zram: Compressed RAM-based block devices
|
||||||
========================================
|
========================================
|
||||||
|
|
||||||
Introduction
|
Introduction
|
||||||
============
|
============
|
||||||
|
|
||||||
The zram module creates RAM based block devices named /dev/zram<id>
|
The zram module creates RAM-based block devices named /dev/zram<id>
|
||||||
(<id> = 0, 1, ...). Pages written to these disks are compressed and stored
|
(<id> = 0, 1, ...). Pages written to these disks are compressed and stored
|
||||||
in memory itself. These disks allow very fast I/O and compression provides
|
in memory itself. These disks allow very fast I/O and compression provides
|
||||||
good amounts of memory savings. Some of the usecases include /tmp storage,
|
good amounts of memory savings. Some of the use cases include /tmp storage,
|
||||||
use as swap disks, various caches under /var and maybe many more :)
|
use as swap disks, various caches under /var and maybe many more. :)
|
||||||
|
|
||||||
Statistics for individual zram devices are exported through sysfs nodes at
|
Statistics for individual zram devices are exported through sysfs nodes at
|
||||||
/sys/block/zram<id>/
|
/sys/block/zram<id>/
|
||||||
@@ -43,17 +43,17 @@ The list of possible return codes:
|
|||||||
|
|
||||||
======== =============================================================
|
======== =============================================================
|
||||||
-EBUSY an attempt to modify an attribute that cannot be changed once
|
-EBUSY an attempt to modify an attribute that cannot be changed once
|
||||||
the device has been initialised. Please reset device first;
|
the device has been initialised. Please reset device first.
|
||||||
-ENOMEM zram was not able to allocate enough memory to fulfil your
|
-ENOMEM zram was not able to allocate enough memory to fulfil your
|
||||||
needs;
|
needs.
|
||||||
-EINVAL invalid input has been provided.
|
-EINVAL invalid input has been provided.
|
||||||
======== =============================================================
|
======== =============================================================
|
||||||
|
|
||||||
If you use 'echo', the returned value that is changed by 'echo' utility,
|
If you use 'echo', the returned value is set by the 'echo' utility,
|
||||||
and, in general case, something like::
|
and, in general case, something like::
|
||||||
|
|
||||||
echo 3 > /sys/block/zram0/max_comp_streams
|
echo 3 > /sys/block/zram0/max_comp_streams
|
||||||
if [ $? -ne 0 ];
|
if [ $? -ne 0 ]; then
|
||||||
handle_error
|
handle_error
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -65,7 +65,8 @@ should suffice.
|
|||||||
::
|
::
|
||||||
|
|
||||||
modprobe zram num_devices=4
|
modprobe zram num_devices=4
|
||||||
This creates 4 devices: /dev/zram{0,1,2,3}
|
|
||||||
|
This creates 4 devices: /dev/zram{0,1,2,3}
|
||||||
|
|
||||||
num_devices parameter is optional and tells zram how many devices should be
|
num_devices parameter is optional and tells zram how many devices should be
|
||||||
pre-created. Default: 1.
|
pre-created. Default: 1.
|
||||||
@@ -73,12 +74,12 @@ pre-created. Default: 1.
|
|||||||
2) Set max number of compression streams
|
2) Set max number of compression streams
|
||||||
========================================
|
========================================
|
||||||
|
|
||||||
Regardless the value passed to this attribute, ZRAM will always
|
Regardless of the value passed to this attribute, ZRAM will always
|
||||||
allocate multiple compression streams - one per online CPUs - thus
|
allocate multiple compression streams - one per online CPU - thus
|
||||||
allowing several concurrent compression operations. The number of
|
allowing several concurrent compression operations. The number of
|
||||||
allocated compression streams goes down when some of the CPUs
|
allocated compression streams goes down when some of the CPUs
|
||||||
become offline. There is no single-compression-stream mode anymore,
|
become offline. There is no single-compression-stream mode anymore,
|
||||||
unless you are running a UP system or has only 1 CPU online.
|
unless you are running a UP system or have only 1 CPU online.
|
||||||
|
|
||||||
To find out how many streams are currently available::
|
To find out how many streams are currently available::
|
||||||
|
|
||||||
@@ -89,7 +90,7 @@ To find out how many streams are currently available::
|
|||||||
|
|
||||||
Using comp_algorithm device attribute one can see available and
|
Using comp_algorithm device attribute one can see available and
|
||||||
currently selected (shown in square brackets) compression algorithms,
|
currently selected (shown in square brackets) compression algorithms,
|
||||||
change selected compression algorithm (once the device is initialised
|
or change the selected compression algorithm (once the device is initialised
|
||||||
there is no way to change compression algorithm).
|
there is no way to change compression algorithm).
|
||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
@@ -167,9 +168,9 @@ Examples::
|
|||||||
zram provides a control interface, which enables dynamic (on-demand) device
|
zram provides a control interface, which enables dynamic (on-demand) device
|
||||||
addition and removal.
|
addition and removal.
|
||||||
|
|
||||||
In order to add a new /dev/zramX device, perform read operation on hot_add
|
In order to add a new /dev/zramX device, perform a read operation on the hot_add
|
||||||
attribute. This will return either new device's device id (meaning that you
|
attribute. This will return either the new device's device id (meaning that you
|
||||||
can use /dev/zram<id>) or error code.
|
can use /dev/zram<id>) or an error code.
|
||||||
|
|
||||||
Example::
|
Example::
|
||||||
|
|
||||||
@@ -186,8 +187,8 @@ execute::
|
|||||||
|
|
||||||
Per-device statistics are exported as various nodes under /sys/block/zram<id>/
|
Per-device statistics are exported as various nodes under /sys/block/zram<id>/
|
||||||
|
|
||||||
A brief description of exported device attributes. For more details please
|
A brief description of exported device attributes follows. For more details
|
||||||
read Documentation/ABI/testing/sysfs-block-zram.
|
please read Documentation/ABI/testing/sysfs-block-zram.
|
||||||
|
|
||||||
====================== ====== ===============================================
|
====================== ====== ===============================================
|
||||||
Name access description
|
Name access description
|
||||||
@@ -245,7 +246,7 @@ whitespace:
|
|||||||
|
|
||||||
File /sys/block/zram<id>/mm_stat
|
File /sys/block/zram<id>/mm_stat
|
||||||
|
|
||||||
The stat file represents device's mm statistics. It consists of a single
|
The mm_stat file represents the device's mm statistics. It consists of a single
|
||||||
line of text and contains the following stats separated by whitespace:
|
line of text and contains the following stats separated by whitespace:
|
||||||
|
|
||||||
================ =============================================================
|
================ =============================================================
|
||||||
@@ -261,7 +262,7 @@ line of text and contains the following stats separated by whitespace:
|
|||||||
Unit: bytes
|
Unit: bytes
|
||||||
mem_limit the maximum amount of memory ZRAM can use to store
|
mem_limit the maximum amount of memory ZRAM can use to store
|
||||||
the compressed data
|
the compressed data
|
||||||
mem_used_max the maximum amount of memory zram have consumed to
|
mem_used_max the maximum amount of memory zram has consumed to
|
||||||
store the data
|
store the data
|
||||||
same_pages the number of same element filled pages written to this disk.
|
same_pages the number of same element filled pages written to this disk.
|
||||||
No memory is allocated for such pages.
|
No memory is allocated for such pages.
|
||||||
@@ -271,7 +272,7 @@ line of text and contains the following stats separated by whitespace:
|
|||||||
|
|
||||||
File /sys/block/zram<id>/bd_stat
|
File /sys/block/zram<id>/bd_stat
|
||||||
|
|
||||||
The stat file represents device's backing device statistics. It consists of
|
The bd_stat file represents a device's backing device statistics. It consists of
|
||||||
a single line of text and contains the following stats separated by whitespace:
|
a single line of text and contains the following stats separated by whitespace:
|
||||||
|
|
||||||
============== =============================================================
|
============== =============================================================
|
||||||
@@ -316,9 +317,9 @@ To use the feature, admin should set up backing device via::
|
|||||||
echo /dev/sda5 > /sys/block/zramX/backing_dev
|
echo /dev/sda5 > /sys/block/zramX/backing_dev
|
||||||
|
|
||||||
before disksize setting. It supports only partition at this moment.
|
before disksize setting. It supports only partition at this moment.
|
||||||
If admin want to use incompressible page writeback, they could do via::
|
If admin wants to use incompressible page writeback, they could do via::
|
||||||
|
|
||||||
echo huge > /sys/block/zramX/write
|
echo huge > /sys/block/zramX/writeback
|
||||||
|
|
||||||
To use idle page writeback, first, user need to declare zram pages
|
To use idle page writeback, first, user need to declare zram pages
|
||||||
as idle::
|
as idle::
|
||||||
@@ -326,7 +327,7 @@ as idle::
|
|||||||
echo all > /sys/block/zramX/idle
|
echo all > /sys/block/zramX/idle
|
||||||
|
|
||||||
From now on, any pages on zram are idle pages. The idle mark
|
From now on, any pages on zram are idle pages. The idle mark
|
||||||
will be removed until someone request access of the block.
|
will be removed until someone requests access of the block.
|
||||||
IOW, unless there is access request, those pages are still idle pages.
|
IOW, unless there is access request, those pages are still idle pages.
|
||||||
|
|
||||||
Admin can request writeback of those idle pages at right timing via::
|
Admin can request writeback of those idle pages at right timing via::
|
||||||
@@ -341,16 +342,16 @@ to guarantee storage health for entire product life.
|
|||||||
|
|
||||||
To overcome the concern, zram supports "writeback_limit" feature.
|
To overcome the concern, zram supports "writeback_limit" feature.
|
||||||
The "writeback_limit_enable"'s default value is 0 so that it doesn't limit
|
The "writeback_limit_enable"'s default value is 0 so that it doesn't limit
|
||||||
any writeback. IOW, if admin want to apply writeback budget, he should
|
any writeback. IOW, if admin wants to apply writeback budget, he should
|
||||||
enable writeback_limit_enable via::
|
enable writeback_limit_enable via::
|
||||||
|
|
||||||
$ echo 1 > /sys/block/zramX/writeback_limit_enable
|
$ echo 1 > /sys/block/zramX/writeback_limit_enable
|
||||||
|
|
||||||
Once writeback_limit_enable is set, zram doesn't allow any writeback
|
Once writeback_limit_enable is set, zram doesn't allow any writeback
|
||||||
until admin set the budget via /sys/block/zramX/writeback_limit.
|
until admin sets the budget via /sys/block/zramX/writeback_limit.
|
||||||
|
|
||||||
(If admin doesn't enable writeback_limit_enable, writeback_limit's value
|
(If admin doesn't enable writeback_limit_enable, writeback_limit's value
|
||||||
assigned via /sys/block/zramX/writeback_limit is meaninless.)
|
assigned via /sys/block/zramX/writeback_limit is meaningless.)
|
||||||
|
|
||||||
If admin want to limit writeback as per-day 400M, he could do it
|
If admin want to limit writeback as per-day 400M, he could do it
|
||||||
like below::
|
like below::
|
||||||
@@ -361,13 +362,13 @@ like below::
|
|||||||
/sys/block/zram0/writeback_limit.
|
/sys/block/zram0/writeback_limit.
|
||||||
$ echo 1 > /sys/block/zram0/writeback_limit_enable
|
$ echo 1 > /sys/block/zram0/writeback_limit_enable
|
||||||
|
|
||||||
If admin want to allow further write again once the bugdet is exausted,
|
If admin wants to allow further write again once the budget is exhausted,
|
||||||
he could do it like below::
|
he could do it like below::
|
||||||
|
|
||||||
$ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
|
$ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
|
||||||
/sys/block/zram0/writeback_limit
|
/sys/block/zram0/writeback_limit
|
||||||
|
|
||||||
If admin want to see remaining writeback budget since he set::
|
If admin wants to see remaining writeback budget since last set::
|
||||||
|
|
||||||
$ cat /sys/block/zramX/writeback_limit
|
$ cat /sys/block/zramX/writeback_limit
|
||||||
|
|
||||||
@@ -375,12 +376,12 @@ If admin want to disable writeback limit, he could do::
|
|||||||
|
|
||||||
$ echo 0 > /sys/block/zramX/writeback_limit_enable
|
$ echo 0 > /sys/block/zramX/writeback_limit_enable
|
||||||
|
|
||||||
The writeback_limit count will reset whenever you reset zram(e.g.,
|
The writeback_limit count will reset whenever you reset zram (e.g.,
|
||||||
system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of
|
system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of
|
||||||
writeback happened until you reset the zram to allocate extra writeback
|
writeback happened until you reset the zram to allocate extra writeback
|
||||||
budget in next setting is user's job.
|
budget in next setting is user's job.
|
||||||
|
|
||||||
If admin want to measure writeback count in a certain period, he could
|
If admin wants to measure writeback count in a certain period, he could
|
||||||
know it via /sys/block/zram0/bd_stat's 3rd column.
|
know it via /sys/block/zram0/bd_stat's 3rd column.
|
||||||
|
|
||||||
memory tracking
|
memory tracking
|
||||||
|
218
Documentation/admin-guide/bootconfig.rst
Normal file
218
Documentation/admin-guide/bootconfig.rst
Normal file
@@ -0,0 +1,218 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
.. _bootconfig:
|
||||||
|
|
||||||
|
==================
|
||||||
|
Boot Configuration
|
||||||
|
==================
|
||||||
|
|
||||||
|
:Author: Masami Hiramatsu <mhiramat@kernel.org>
|
||||||
|
|
||||||
|
Overview
|
||||||
|
========
|
||||||
|
|
||||||
|
The boot configuration expands the current kernel command line to support
|
||||||
|
additional key-value data when booting the kernel in an efficient way.
|
||||||
|
This allows administrators to pass a structured-key config file.
|
||||||
|
|
||||||
|
Config File Syntax
|
||||||
|
==================
|
||||||
|
|
||||||
|
The boot config syntax is a simple structured key-value. Each key consists
|
||||||
|
of dot-connected-words, and key and value are connected by ``=``. The value
|
||||||
|
has to be terminated by semi-colon (``;``) or newline (``\n``).
|
||||||
|
For array value, array entries are separated by comma (``,``). ::
|
||||||
|
|
||||||
|
KEY[.WORD[...]] = VALUE[, VALUE2[...]][;]
|
||||||
|
|
||||||
|
Unlike the kernel command line syntax, spaces are OK around the comma and ``=``.
|
||||||
|
|
||||||
|
Each key word must contain only alphabets, numbers, dash (``-``) or underscore
|
||||||
|
(``_``). And each value only contains printable characters or spaces except
|
||||||
|
for delimiters such as semi-colon (``;``), new-line (``\n``), comma (``,``),
|
||||||
|
hash (``#``) and closing brace (``}``).
|
||||||
|
|
||||||
|
If you want to use those delimiters in a value, you can use either double-
|
||||||
|
quotes (``"VALUE"``) or single-quotes (``'VALUE'``) to quote it. Note that
|
||||||
|
you can not escape these quotes.
|
||||||
|
|
||||||
|
There can be a key which doesn't have value or has an empty value. Those keys
|
||||||
|
are used for checking if the key exists or not (like a boolean).
|
||||||
|
|
||||||
|
Key-Value Syntax
|
||||||
|
----------------
|
||||||
|
|
||||||
|
The boot config file syntax allows user to merge partially same word keys
|
||||||
|
by brace. For example::
|
||||||
|
|
||||||
|
foo.bar.baz = value1
|
||||||
|
foo.bar.qux.quux = value2
|
||||||
|
|
||||||
|
These can be written also in::
|
||||||
|
|
||||||
|
foo.bar {
|
||||||
|
baz = value1
|
||||||
|
qux.quux = value2
|
||||||
|
}
|
||||||
|
|
||||||
|
Or more concisely, written as follows::
|
||||||
|
|
||||||
|
foo.bar { baz = value1; qux.quux = value2 }
|
||||||
|
|
||||||
|
In both styles, same key words are automatically merged when parsing it
|
||||||
|
at boot time. So you can append similar trees or key-values.
|
||||||
|
|
||||||
|
Same-key Values
|
||||||
|
---------------
|
||||||
|
|
||||||
|
It is prohibited that two or more values or arrays share a same-key.
|
||||||
|
For example,::
|
||||||
|
|
||||||
|
foo = bar, baz
|
||||||
|
foo = qux # !ERROR! we can not re-define same key
|
||||||
|
|
||||||
|
If you want to append the value to existing key as an array member,
|
||||||
|
you can use ``+=`` operator. For example::
|
||||||
|
|
||||||
|
foo = bar, baz
|
||||||
|
foo += qux
|
||||||
|
|
||||||
|
In this case, the key ``foo`` has ``bar``, ``baz`` and ``qux``.
|
||||||
|
|
||||||
|
However, a sub-key and a value can not co-exist under a parent key.
|
||||||
|
For example, following config is NOT allowed.::
|
||||||
|
|
||||||
|
foo = value1
|
||||||
|
foo.bar = value2 # !ERROR! subkey "bar" and value "value1" can NOT co-exist
|
||||||
|
|
||||||
|
|
||||||
|
Comments
|
||||||
|
--------
|
||||||
|
|
||||||
|
The config syntax accepts shell-script style comments. The comments starting
|
||||||
|
with hash ("#") until newline ("\n") will be ignored.
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
# comment line
|
||||||
|
foo = value # value is set to foo.
|
||||||
|
bar = 1, # 1st element
|
||||||
|
2, # 2nd element
|
||||||
|
3 # 3rd element
|
||||||
|
|
||||||
|
This is parsed as below::
|
||||||
|
|
||||||
|
foo = value
|
||||||
|
bar = 1, 2, 3
|
||||||
|
|
||||||
|
Note that you can not put a comment between value and delimiter(``,`` or
|
||||||
|
``;``). This means following config has a syntax error ::
|
||||||
|
|
||||||
|
key = 1 # comment
|
||||||
|
,2
|
||||||
|
|
||||||
|
|
||||||
|
/proc/bootconfig
|
||||||
|
================
|
||||||
|
|
||||||
|
/proc/bootconfig is a user-space interface of the boot config.
|
||||||
|
Unlike /proc/cmdline, this file shows the key-value style list.
|
||||||
|
Each key-value pair is shown in each line with following style::
|
||||||
|
|
||||||
|
KEY[.WORDS...] = "[VALUE]"[,"VALUE2"...]
|
||||||
|
|
||||||
|
|
||||||
|
Boot Kernel With a Boot Config
|
||||||
|
==============================
|
||||||
|
|
||||||
|
Since the boot configuration file is loaded with initrd, it will be added
|
||||||
|
to the end of the initrd (initramfs) image file with size, checksum and
|
||||||
|
12-byte magic word as below.
|
||||||
|
|
||||||
|
[initrd][bootconfig][size(u32)][checksum(u32)][#BOOTCONFIG\n]
|
||||||
|
|
||||||
|
The Linux kernel decodes the last part of the initrd image in memory to
|
||||||
|
get the boot configuration data.
|
||||||
|
Because of this "piggyback" method, there is no need to change or
|
||||||
|
update the boot loader and the kernel image itself.
|
||||||
|
|
||||||
|
To do this operation, Linux kernel provides "bootconfig" command under
|
||||||
|
tools/bootconfig, which allows admin to apply or delete the config file
|
||||||
|
to/from initrd image. You can build it by the following command::
|
||||||
|
|
||||||
|
# make -C tools/bootconfig
|
||||||
|
|
||||||
|
To add your boot config file to initrd image, run bootconfig as below
|
||||||
|
(Old data is removed automatically if exists)::
|
||||||
|
|
||||||
|
# tools/bootconfig/bootconfig -a your-config /boot/initrd.img-X.Y.Z
|
||||||
|
|
||||||
|
To remove the config from the image, you can use -d option as below::
|
||||||
|
|
||||||
|
# tools/bootconfig/bootconfig -d /boot/initrd.img-X.Y.Z
|
||||||
|
|
||||||
|
Then add "bootconfig" on the normal kernel command line to tell the
|
||||||
|
kernel to look for the bootconfig at the end of the initrd file.
|
||||||
|
|
||||||
|
Config File Limitation
|
||||||
|
======================
|
||||||
|
|
||||||
|
Currently the maximum config size is 32KB and the total key-words (not
|
||||||
|
key-value entries) must be under 1024 nodes.
|
||||||
|
Note: this is not the number of entries but nodes, an entry must consume
|
||||||
|
more than 2 nodes (a key-word and a value). So theoretically, it will be
|
||||||
|
up to 512 key-value pairs. If keys contain 3 words on average, it can
|
||||||
|
contain 256 key-value pairs. In most cases, the number of config items
|
||||||
|
will be under 100 entries and smaller than 8KB, so it would be enough.
|
||||||
|
If the node number exceeds 1024, parser returns an error even if the file
|
||||||
|
size is smaller than 32KB.
|
||||||
|
Anyway, since bootconfig command verifies it when appending a boot config
|
||||||
|
to initrd image, user can notice it before boot.
|
||||||
|
|
||||||
|
|
||||||
|
Bootconfig APIs
|
||||||
|
===============
|
||||||
|
|
||||||
|
User can query or loop on key-value pairs, also it is possible to find
|
||||||
|
a root (prefix) key node and find key-values under that node.
|
||||||
|
|
||||||
|
If you have a key string, you can query the value directly with the key
|
||||||
|
using xbc_find_value(). If you want to know what keys exist in the boot
|
||||||
|
config, you can use xbc_for_each_key_value() to iterate key-value pairs.
|
||||||
|
Note that you need to use xbc_array_for_each_value() for accessing
|
||||||
|
each array's value, e.g.::
|
||||||
|
|
||||||
|
vnode = NULL;
|
||||||
|
xbc_find_value("key.word", &vnode);
|
||||||
|
if (vnode && xbc_node_is_array(vnode))
|
||||||
|
xbc_array_for_each_value(vnode, value) {
|
||||||
|
printk("%s ", value);
|
||||||
|
}
|
||||||
|
|
||||||
|
If you want to focus on keys which have a prefix string, you can use
|
||||||
|
xbc_find_node() to find a node by the prefix string, and iterate
|
||||||
|
keys under the prefix node with xbc_node_for_each_key_value().
|
||||||
|
|
||||||
|
But the most typical usage is to get the named value under prefix
|
||||||
|
or get the named array under prefix as below::
|
||||||
|
|
||||||
|
root = xbc_find_node("key.prefix");
|
||||||
|
value = xbc_node_find_value(root, "option", &vnode);
|
||||||
|
...
|
||||||
|
xbc_node_for_each_array_value(root, "array-option", value, anode) {
|
||||||
|
...
|
||||||
|
}
|
||||||
|
|
||||||
|
This accesses a value of "key.prefix.option" and an array of
|
||||||
|
"key.prefix.array-option".
|
||||||
|
|
||||||
|
Locking is not needed, since after initialization, the config becomes
|
||||||
|
read-only. All data and keys must be copied if you need to modify it.
|
||||||
|
|
||||||
|
|
||||||
|
Functions and structures
|
||||||
|
========================
|
||||||
|
|
||||||
|
.. kernel-doc:: include/linux/bootconfig.h
|
||||||
|
.. kernel-doc:: lib/bootconfig.c
|
||||||
|
|
@@ -61,6 +61,8 @@ v1 is available under Documentation/admin-guide/cgroup-v1/.
|
|||||||
5-6. Device
|
5-6. Device
|
||||||
5-7. RDMA
|
5-7. RDMA
|
||||||
5-7-1. RDMA Interface Files
|
5-7-1. RDMA Interface Files
|
||||||
|
5-8. HugeTLB
|
||||||
|
5-8-1. HugeTLB Interface Files
|
||||||
5-8. Misc
|
5-8. Misc
|
||||||
5-8-1. perf_event
|
5-8-1. perf_event
|
||||||
5-N. Non-normative information
|
5-N. Non-normative information
|
||||||
@@ -2056,6 +2058,33 @@ RDMA Interface Files
|
|||||||
mlx4_0 hca_handle=1 hca_object=20
|
mlx4_0 hca_handle=1 hca_object=20
|
||||||
ocrdma1 hca_handle=1 hca_object=23
|
ocrdma1 hca_handle=1 hca_object=23
|
||||||
|
|
||||||
|
HugeTLB
|
||||||
|
-------
|
||||||
|
|
||||||
|
The HugeTLB controller allows limiting the HugeTLB usage per control group and
|
||||||
|
enforces the controller limit during page fault.
|
||||||
|
|
||||||
|
HugeTLB Interface Files
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
hugetlb.<hugepagesize>.current
|
||||||
|
Show current usage for "hugepagesize" hugetlb. It exists for all
|
||||||
|
the cgroup except root.
|
||||||
|
|
||||||
|
hugetlb.<hugepagesize>.max
|
||||||
|
Set/show the hard limit of "hugepagesize" hugetlb usage.
|
||||||
|
The default value is "max". It exists for all the cgroup except root.
|
||||||
|
|
||||||
|
hugetlb.<hugepagesize>.events
|
||||||
|
A read-only flat-keyed file which exists on non-root cgroups.
|
||||||
|
|
||||||
|
max
|
||||||
|
The number of allocation failures due to HugeTLB limit
|
||||||
|
|
||||||
|
hugetlb.<hugepagesize>.events.local
|
||||||
|
Similar to hugetlb.<hugepagesize>.events but the fields in the file
|
||||||
|
are local to the cgroup i.e. not hierarchical. The file modified event
|
||||||
|
generated on this file reflects only the local events.
|
||||||
|
|
||||||
Misc
|
Misc
|
||||||
----
|
----
|
||||||
|
@@ -419,3 +419,5 @@ Version History
|
|||||||
rebuild errors.
|
rebuild errors.
|
||||||
1.15.0 Fix size extensions not being synchronized in case of new MD bitmap
|
1.15.0 Fix size extensions not being synchronized in case of new MD bitmap
|
||||||
pages allocated; also fix those not occuring after previous reductions
|
pages allocated; also fix those not occuring after previous reductions
|
||||||
|
1.15.1 Fix argument count and arguments for rebuild/write_mostly/journal_(dev|mode)
|
||||||
|
on the status line.
|
||||||
|
@@ -92,6 +92,8 @@ Currently Available
|
|||||||
* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
|
* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
|
||||||
the ordering)
|
the ordering)
|
||||||
* Case-insensitive file name lookups
|
* Case-insensitive file name lookups
|
||||||
|
* file-based encryption support (fscrypt)
|
||||||
|
* file-based verity support (fsverity)
|
||||||
|
|
||||||
[1] Filesystems with a block size of 1k may see a limit imposed by the
|
[1] Filesystems with a block size of 1k may see a limit imposed by the
|
||||||
directory hash tree having a maximum depth of two.
|
directory hash tree having a maximum depth of two.
|
||||||
|
@@ -64,6 +64,7 @@ configure specific aspects of kernel behavior to your liking.
|
|||||||
binderfs
|
binderfs
|
||||||
binfmt-misc
|
binfmt-misc
|
||||||
blockdev/index
|
blockdev/index
|
||||||
|
bootconfig
|
||||||
braille-console
|
braille-console
|
||||||
btmrvl
|
btmrvl
|
||||||
cgroup-v1/index
|
cgroup-v1/index
|
||||||
@@ -76,6 +77,7 @@ configure specific aspects of kernel behavior to your liking.
|
|||||||
device-mapper/index
|
device-mapper/index
|
||||||
efi-stub
|
efi-stub
|
||||||
ext4
|
ext4
|
||||||
|
nfs/index
|
||||||
gpio/index
|
gpio/index
|
||||||
highuid
|
highuid
|
||||||
hw_random
|
hw_random
|
||||||
|
@@ -437,6 +437,12 @@
|
|||||||
no delay (0).
|
no delay (0).
|
||||||
Format: integer
|
Format: integer
|
||||||
|
|
||||||
|
bootconfig [KNL]
|
||||||
|
Extended command line options can be added to an initrd
|
||||||
|
and this will cause the kernel to look for it.
|
||||||
|
|
||||||
|
See Documentation/admin-guide/bootconfig.rst
|
||||||
|
|
||||||
bert_disable [ACPI]
|
bert_disable [ACPI]
|
||||||
Disable BERT OS support on buggy BIOSes.
|
Disable BERT OS support on buggy BIOSes.
|
||||||
|
|
||||||
@@ -511,7 +517,7 @@
|
|||||||
1 -- check protection requested by application.
|
1 -- check protection requested by application.
|
||||||
Default value is set via a kernel config option.
|
Default value is set via a kernel config option.
|
||||||
Value can be changed at runtime via
|
Value can be changed at runtime via
|
||||||
/selinux/checkreqprot.
|
/sys/fs/selinux/checkreqprot.
|
||||||
|
|
||||||
cio_ignore= [S390]
|
cio_ignore= [S390]
|
||||||
See Documentation/s390/common_io.rst for details.
|
See Documentation/s390/common_io.rst for details.
|
||||||
@@ -834,6 +840,18 @@
|
|||||||
dump out devices still on the deferred probe list after
|
dump out devices still on the deferred probe list after
|
||||||
retrying.
|
retrying.
|
||||||
|
|
||||||
|
dfltcc= [HW,S390]
|
||||||
|
Format: { on | off | def_only | inf_only | always }
|
||||||
|
on: s390 zlib hardware support for compression on
|
||||||
|
level 1 and decompression (default)
|
||||||
|
off: No s390 zlib hardware support
|
||||||
|
def_only: s390 zlib hardware support for deflate
|
||||||
|
only (compression on level 1)
|
||||||
|
inf_only: s390 zlib hardware support for inflate
|
||||||
|
only (decompression)
|
||||||
|
always: Same as 'on' but ignores the selected compression
|
||||||
|
level always using hardware support (used for debugging)
|
||||||
|
|
||||||
dhash_entries= [KNL]
|
dhash_entries= [KNL]
|
||||||
Set number of hash buckets for dentry cache.
|
Set number of hash buckets for dentry cache.
|
||||||
|
|
||||||
@@ -1165,10 +1183,10 @@
|
|||||||
|
|
||||||
efi= [EFI]
|
efi= [EFI]
|
||||||
Format: { "old_map", "nochunk", "noruntime", "debug",
|
Format: { "old_map", "nochunk", "noruntime", "debug",
|
||||||
"nosoftreserve" }
|
"nosoftreserve", "disable_early_pci_dma",
|
||||||
|
"no_disable_early_pci_dma" }
|
||||||
old_map [X86-64]: switch to the old ioremap-based EFI
|
old_map [X86-64]: switch to the old ioremap-based EFI
|
||||||
runtime services mapping. 32-bit still uses this one by
|
runtime services mapping. [Needs CONFIG_X86_UV=y]
|
||||||
default.
|
|
||||||
nochunk: disable reading files in "chunks" in the EFI
|
nochunk: disable reading files in "chunks" in the EFI
|
||||||
boot stub, as chunking can cause problems with some
|
boot stub, as chunking can cause problems with some
|
||||||
firmware implementations.
|
firmware implementations.
|
||||||
@@ -1180,6 +1198,10 @@
|
|||||||
claim. Specify efi=nosoftreserve to disable this
|
claim. Specify efi=nosoftreserve to disable this
|
||||||
reservation and treat the memory by its base type
|
reservation and treat the memory by its base type
|
||||||
(i.e. EFI_CONVENTIONAL_MEMORY / "System RAM").
|
(i.e. EFI_CONVENTIONAL_MEMORY / "System RAM").
|
||||||
|
disable_early_pci_dma: Disable the busmaster bit on all
|
||||||
|
PCI bridges while in the EFI boot stub
|
||||||
|
no_disable_early_pci_dma: Leave the busmaster bit set
|
||||||
|
on all PCI bridges while in the EFI boot stub
|
||||||
|
|
||||||
efi_no_storage_paranoia [EFI; X86]
|
efi_no_storage_paranoia [EFI; X86]
|
||||||
Using this parameter you can use more than 50% of
|
Using this parameter you can use more than 50% of
|
||||||
@@ -1245,7 +1267,8 @@
|
|||||||
0 -- permissive (log only, no denials).
|
0 -- permissive (log only, no denials).
|
||||||
1 -- enforcing (deny and log).
|
1 -- enforcing (deny and log).
|
||||||
Default value is 0.
|
Default value is 0.
|
||||||
Value can be changed at runtime via /selinux/enforce.
|
Value can be changed at runtime via
|
||||||
|
/sys/fs/selinux/enforce.
|
||||||
|
|
||||||
erst_disable [ACPI]
|
erst_disable [ACPI]
|
||||||
Disable Error Record Serialization Table (ERST)
|
Disable Error Record Serialization Table (ERST)
|
||||||
@@ -1933,10 +1956,32 @@
|
|||||||
<cpu number> begins at 0 and the maximum value is
|
<cpu number> begins at 0 and the maximum value is
|
||||||
"number of CPUs in system - 1".
|
"number of CPUs in system - 1".
|
||||||
|
|
||||||
|
managed_irq
|
||||||
|
|
||||||
|
Isolate from being targeted by managed interrupts
|
||||||
|
which have an interrupt mask containing isolated
|
||||||
|
CPUs. The affinity of managed interrupts is
|
||||||
|
handled by the kernel and cannot be changed via
|
||||||
|
the /proc/irq/* interfaces.
|
||||||
|
|
||||||
|
This isolation is best effort and only effective
|
||||||
|
if the automatically assigned interrupt mask of a
|
||||||
|
device queue contains isolated and housekeeping
|
||||||
|
CPUs. If housekeeping CPUs are online then such
|
||||||
|
interrupts are directed to the housekeeping CPU
|
||||||
|
so that IO submitted on the housekeeping CPU
|
||||||
|
cannot disturb the isolated CPU.
|
||||||
|
|
||||||
|
If a queue's affinity mask contains only isolated
|
||||||
|
CPUs then this parameter has no effect on the
|
||||||
|
interrupt routing decision, though interrupts are
|
||||||
|
only delivered when tasks running on those
|
||||||
|
isolated CPUs submit IO. IO submitted on
|
||||||
|
housekeeping CPUs has no influence on those
|
||||||
|
queues.
|
||||||
|
|
||||||
The format of <cpu-list> is described above.
|
The format of <cpu-list> is described above.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
iucv= [HW,NET]
|
iucv= [HW,NET]
|
||||||
|
|
||||||
ivrs_ioapic [HW,X86_64]
|
ivrs_ioapic [HW,X86_64]
|
||||||
@@ -3978,6 +4023,19 @@
|
|||||||
test until boot completes in order to avoid
|
test until boot completes in order to avoid
|
||||||
interference.
|
interference.
|
||||||
|
|
||||||
|
rcuperf.kfree_rcu_test= [KNL]
|
||||||
|
Set to measure performance of kfree_rcu() flooding.
|
||||||
|
|
||||||
|
rcuperf.kfree_nthreads= [KNL]
|
||||||
|
The number of threads running loops of kfree_rcu().
|
||||||
|
|
||||||
|
rcuperf.kfree_alloc_num= [KNL]
|
||||||
|
Number of allocations and frees done in an iteration.
|
||||||
|
|
||||||
|
rcuperf.kfree_loops= [KNL]
|
||||||
|
Number of loops doing rcuperf.kfree_alloc_num number
|
||||||
|
of allocations and frees.
|
||||||
|
|
||||||
rcuperf.nreaders= [KNL]
|
rcuperf.nreaders= [KNL]
|
||||||
Set number of RCU readers. The value -1 selects
|
Set number of RCU readers. The value -1 selects
|
||||||
N, where N is the number of CPUs. A value
|
N, where N is the number of CPUs. A value
|
||||||
@@ -4348,9 +4406,7 @@
|
|||||||
See security/selinux/Kconfig help text.
|
See security/selinux/Kconfig help text.
|
||||||
0 -- disable.
|
0 -- disable.
|
||||||
1 -- enable.
|
1 -- enable.
|
||||||
Default value is set via kernel config option.
|
Default value is 1.
|
||||||
If enabled at boot time, /selinux/disable can be used
|
|
||||||
later to disable prior to initial policy load.
|
|
||||||
|
|
||||||
apparmor= [APPARMOR] Disable or enable AppArmor at boot time
|
apparmor= [APPARMOR] Disable or enable AppArmor at boot time
|
||||||
Format: { "0" | "1" }
|
Format: { "0" | "1" }
|
||||||
|
@@ -1,6 +1,7 @@
|
|||||||
|
===================
|
||||||
|
NFS Fault Injection
|
||||||
|
===================
|
||||||
|
|
||||||
Fault Injection
|
|
||||||
===============
|
|
||||||
Fault injection is a method for forcing errors that may not normally occur, or
|
Fault injection is a method for forcing errors that may not normally occur, or
|
||||||
may be difficult to reproduce. Forcing these errors in a controlled environment
|
may be difficult to reproduce. Forcing these errors in a controlled environment
|
||||||
can help the developer find and fix bugs before their code is shipped in a
|
can help the developer find and fix bugs before their code is shipped in a
|
15
Documentation/admin-guide/nfs/index.rst
Normal file
15
Documentation/admin-guide/nfs/index.rst
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
=============
|
||||||
|
NFS
|
||||||
|
=============
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
|
||||||
|
nfs-client
|
||||||
|
nfsroot
|
||||||
|
nfs-rdma
|
||||||
|
nfsd-admin-interfaces
|
||||||
|
nfs-idmapper
|
||||||
|
pnfs-block-server
|
||||||
|
pnfs-scsi-server
|
||||||
|
fault_injection
|
@@ -1,3 +1,6 @@
|
|||||||
|
==========
|
||||||
|
NFS Client
|
||||||
|
==========
|
||||||
|
|
||||||
The NFS client
|
The NFS client
|
||||||
==============
|
==============
|
||||||
@@ -59,10 +62,11 @@ The DNS resolver
|
|||||||
|
|
||||||
NFSv4 allows for one server to refer the NFS client to data that has been
|
NFSv4 allows for one server to refer the NFS client to data that has been
|
||||||
migrated onto another server by means of the special "fs_locations"
|
migrated onto another server by means of the special "fs_locations"
|
||||||
attribute. See
|
attribute. See `RFC3530 Section 6: Filesystem Migration and Replication`_ and
|
||||||
http://tools.ietf.org/html/rfc3530#section-6
|
`Implementation Guide for Referrals in NFSv4`_.
|
||||||
and
|
|
||||||
http://tools.ietf.org/html/draft-ietf-nfsv4-referrals-00
|
.. _RFC3530 Section 6\: Filesystem Migration and Replication: http://tools.ietf.org/html/rfc3530#section-6
|
||||||
|
.. _Implementation Guide for Referrals in NFSv4: http://tools.ietf.org/html/draft-ietf-nfsv4-referrals-00
|
||||||
|
|
||||||
The fs_locations information can take the form of either an ip address and
|
The fs_locations information can take the form of either an ip address and
|
||||||
a path, or a DNS hostname and a path. The latter requires the NFS client to
|
a path, or a DNS hostname and a path. The latter requires the NFS client to
|
||||||
@@ -78,8 +82,8 @@ Assuming that the user has the 'rpc_pipefs' filesystem mounted in the usual
|
|||||||
(2) If no valid entry exists, the helper script '/sbin/nfs_cache_getent'
|
(2) If no valid entry exists, the helper script '/sbin/nfs_cache_getent'
|
||||||
(may be changed using the 'nfs.cache_getent' kernel boot parameter)
|
(may be changed using the 'nfs.cache_getent' kernel boot parameter)
|
||||||
is run, with two arguments:
|
is run, with two arguments:
|
||||||
- the cache name, "dns_resolve"
|
- the cache name, "dns_resolve"
|
||||||
- the hostname to resolve
|
- the hostname to resolve
|
||||||
|
|
||||||
(3) After looking up the corresponding ip address, the helper script
|
(3) After looking up the corresponding ip address, the helper script
|
||||||
writes the result into the rpc_pipefs pseudo-file
|
writes the result into the rpc_pipefs pseudo-file
|
||||||
@@ -94,43 +98,44 @@ Assuming that the user has the 'rpc_pipefs' filesystem mounted in the usual
|
|||||||
script, and <ttl> is the 'time to live' of this cache entry (in
|
script, and <ttl> is the 'time to live' of this cache entry (in
|
||||||
units of seconds).
|
units of seconds).
|
||||||
|
|
||||||
Note: If <ip address> is invalid, say the string "0", then a negative
|
.. note::
|
||||||
entry is created, which will cause the kernel to treat the hostname
|
If <ip address> is invalid, say the string "0", then a negative
|
||||||
as having no valid DNS translation.
|
entry is created, which will cause the kernel to treat the hostname
|
||||||
|
as having no valid DNS translation.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
A basic sample /sbin/nfs_cache_getent
|
A basic sample /sbin/nfs_cache_getent
|
||||||
=====================================
|
=====================================
|
||||||
|
.. code-block:: sh
|
||||||
|
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
#
|
#
|
||||||
ttl=600
|
ttl=600
|
||||||
#
|
#
|
||||||
cut=/usr/bin/cut
|
cut=/usr/bin/cut
|
||||||
getent=/usr/bin/getent
|
getent=/usr/bin/getent
|
||||||
rpc_pipefs=/var/lib/nfs/rpc_pipefs
|
rpc_pipefs=/var/lib/nfs/rpc_pipefs
|
||||||
#
|
#
|
||||||
die()
|
die()
|
||||||
{
|
{
|
||||||
echo "Usage: $0 cache_name entry_name"
|
echo "Usage: $0 cache_name entry_name"
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
[ $# -lt 2 ] && die
|
[ $# -lt 2 ] && die
|
||||||
cachename="$1"
|
cachename="$1"
|
||||||
cache_path=${rpc_pipefs}/cache/${cachename}/channel
|
cache_path=${rpc_pipefs}/cache/${cachename}/channel
|
||||||
|
|
||||||
case "${cachename}" in
|
|
||||||
dns_resolve)
|
|
||||||
name="$2"
|
|
||||||
result="$(${getent} hosts ${name} | ${cut} -f1 -d\ )"
|
|
||||||
[ -z "${result}" ] && result="0"
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
die
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
echo "${result} ${name} ${ttl}" >${cache_path}
|
|
||||||
|
|
||||||
|
case "${cachename}" in
|
||||||
|
dns_resolve)
|
||||||
|
name="$2"
|
||||||
|
result="$(${getent} hosts ${name} | ${cut} -f1 -d\ )"
|
||||||
|
[ -z "${result}" ] && result="0"
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
die
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
echo "${result} ${name} ${ttl}" >${cache_path}
|
@@ -1,7 +1,7 @@
|
|||||||
|
=============
|
||||||
|
NFS ID Mapper
|
||||||
|
=============
|
||||||
|
|
||||||
=========
|
|
||||||
ID Mapper
|
|
||||||
=========
|
|
||||||
Id mapper is used by NFS to translate user and group ids into names, and to
|
Id mapper is used by NFS to translate user and group ids into names, and to
|
||||||
translate user and group names into ids. Part of this translation involves
|
translate user and group names into ids. Part of this translation involves
|
||||||
performing an upcall to userspace to request the information. There are two
|
performing an upcall to userspace to request the information. There are two
|
||||||
@@ -20,22 +20,24 @@ legacy rpc.idmap daemon for the id mapping. This result will be stored
|
|||||||
in a custom NFS idmap cache.
|
in a custom NFS idmap cache.
|
||||||
|
|
||||||
|
|
||||||
===========
|
|
||||||
Configuring
|
Configuring
|
||||||
===========
|
===========
|
||||||
|
|
||||||
The file /etc/request-key.conf will need to be modified so /sbin/request-key can
|
The file /etc/request-key.conf will need to be modified so /sbin/request-key can
|
||||||
direct the upcall. The following line should be added:
|
direct the upcall. The following line should be added:
|
||||||
|
|
||||||
#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ...
|
``#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ...``
|
||||||
#====== ======= =============== =============== ===============================
|
``#====== ======= =============== =============== ===============================``
|
||||||
create id_resolver * * /usr/sbin/nfs.idmap %k %d 600
|
``create id_resolver * * /usr/sbin/nfs.idmap %k %d 600``
|
||||||
|
|
||||||
|
|
||||||
This will direct all id_resolver requests to the program /usr/sbin/nfs.idmap.
|
This will direct all id_resolver requests to the program /usr/sbin/nfs.idmap.
|
||||||
The last parameter, 600, defines how many seconds into the future the key will
|
The last parameter, 600, defines how many seconds into the future the key will
|
||||||
expire. This parameter is optional for /usr/sbin/nfs.idmap. When the timeout
|
expire. This parameter is optional for /usr/sbin/nfs.idmap. When the timeout
|
||||||
is not specified, nfs.idmap will default to 600 seconds.
|
is not specified, nfs.idmap will default to 600 seconds.
|
||||||
|
|
||||||
id mapper uses for key descriptions:
|
id mapper uses for key descriptions::
|
||||||
|
|
||||||
uid: Find the UID for the given user
|
uid: Find the UID for the given user
|
||||||
gid: Find the GID for the given group
|
gid: Find the GID for the given group
|
||||||
user: Find the user name for the given UID
|
user: Find the user name for the given UID
|
||||||
@@ -45,23 +47,24 @@ You can handle any of these individually, rather than using the generic upcall
|
|||||||
program. If you would like to use your own program for a uid lookup then you
|
program. If you would like to use your own program for a uid lookup then you
|
||||||
would edit your request-key.conf so it looks similar to this:
|
would edit your request-key.conf so it looks similar to this:
|
||||||
|
|
||||||
#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ...
|
``#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ...``
|
||||||
#====== ======= =============== =============== ===============================
|
``#====== ======= =============== =============== ===============================``
|
||||||
create id_resolver uid:* * /some/other/program %k %d 600
|
``create id_resolver uid:* * /some/other/program %k %d 600``
|
||||||
create id_resolver * * /usr/sbin/nfs.idmap %k %d 600
|
``create id_resolver * * /usr/sbin/nfs.idmap %k %d 600``
|
||||||
|
|
||||||
|
|
||||||
Notice that the new line was added above the line for the generic program.
|
Notice that the new line was added above the line for the generic program.
|
||||||
request-key will find the first matching line and corresponding program. In
|
request-key will find the first matching line and corresponding program. In
|
||||||
this case, /some/other/program will handle all uid lookups and
|
this case, /some/other/program will handle all uid lookups and
|
||||||
/usr/sbin/nfs.idmap will handle gid, user, and group lookups.
|
/usr/sbin/nfs.idmap will handle gid, user, and group lookups.
|
||||||
|
|
||||||
See <file:Documentation/security/keys/request-key.rst> for more information
|
See Documentation/security/keys/request-key.rst for more information
|
||||||
about the request-key function.
|
about the request-key function.
|
||||||
|
|
||||||
|
|
||||||
=========
|
|
||||||
nfs.idmap
|
nfs.idmap
|
||||||
=========
|
=========
|
||||||
|
|
||||||
nfs.idmap is designed to be called by request-key, and should not be run "by
|
nfs.idmap is designed to be called by request-key, and should not be run "by
|
||||||
hand". This program takes two arguments, a serialized key and a key
|
hand". This program takes two arguments, a serialized key and a key
|
||||||
description. The serialized key is first converted into a key_serial_t, and
|
description. The serialized key is first converted into a key_serial_t, and
|
292
Documentation/admin-guide/nfs/nfs-rdma.rst
Normal file
292
Documentation/admin-guide/nfs/nfs-rdma.rst
Normal file
@@ -0,0 +1,292 @@
|
|||||||
|
===================
|
||||||
|
Setting up NFS/RDMA
|
||||||
|
===================
|
||||||
|
|
||||||
|
:Author:
|
||||||
|
NetApp and Open Grid Computing (May 29, 2008)
|
||||||
|
|
||||||
|
.. warning::
|
||||||
|
This document is probably obsolete.
|
||||||
|
|
||||||
|
Overview
|
||||||
|
========
|
||||||
|
|
||||||
|
This document describes how to install and set up the Linux NFS/RDMA client
|
||||||
|
and server software.
|
||||||
|
|
||||||
|
The NFS/RDMA client was first included in Linux 2.6.24. The NFS/RDMA server
|
||||||
|
was first included in the following release, Linux 2.6.25.
|
||||||
|
|
||||||
|
In our testing, we have obtained excellent performance results (full 10Gbit
|
||||||
|
wire bandwidth at minimal client CPU) under many workloads. The code passes
|
||||||
|
the full Connectathon test suite and operates over both Infiniband and iWARP
|
||||||
|
RDMA adapters.
|
||||||
|
|
||||||
|
Getting Help
|
||||||
|
============
|
||||||
|
|
||||||
|
If you get stuck, you can ask questions on the
|
||||||
|
nfs-rdma-devel@lists.sourceforge.net mailing list.
|
||||||
|
|
||||||
|
Installation
|
||||||
|
============
|
||||||
|
|
||||||
|
These instructions are a step by step guide to building a machine for
|
||||||
|
use with NFS/RDMA.
|
||||||
|
|
||||||
|
- Install an RDMA device
|
||||||
|
|
||||||
|
Any device supported by the drivers in drivers/infiniband/hw is acceptable.
|
||||||
|
|
||||||
|
Testing has been performed using several Mellanox-based IB cards, the
|
||||||
|
Ammasso AMS1100 iWARP adapter, and the Chelsio cxgb3 iWARP adapter.
|
||||||
|
|
||||||
|
- Install a Linux distribution and tools
|
||||||
|
|
||||||
|
The first kernel release to contain both the NFS/RDMA client and server was
|
||||||
|
Linux 2.6.25. Therefore, a distribution compatible with this and subsequent
|
||||||
|
Linux kernel releases should be installed.
|
||||||
|
|
||||||
|
The procedures described in this document have been tested with
|
||||||
|
distributions from Red Hat's Fedora Project (http://fedora.redhat.com/).
|
||||||
|
|
||||||
|
- Install nfs-utils-1.1.2 or greater on the client
|
||||||
|
|
||||||
|
An NFS/RDMA mount point can be obtained by using the mount.nfs command in
|
||||||
|
nfs-utils-1.1.2 or greater (nfs-utils-1.1.1 was the first nfs-utils
|
||||||
|
version with support for NFS/RDMA mounts, but for various reasons we
|
||||||
|
recommend using nfs-utils-1.1.2 or greater). To see which version of
|
||||||
|
mount.nfs you are using, type:
|
||||||
|
|
||||||
|
.. code-block:: sh
|
||||||
|
|
||||||
|
$ /sbin/mount.nfs -V
|
||||||
|
|
||||||
|
If the version is less than 1.1.2 or the command does not exist,
|
||||||
|
you should install the latest version of nfs-utils.
|
||||||
|
|
||||||
|
Download the latest package from: http://www.kernel.org/pub/linux/utils/nfs
|
||||||
|
|
||||||
|
Uncompress the package and follow the installation instructions.
|
||||||
|
|
||||||
|
If you will not need the idmapper and gssd executables (you do not need
|
||||||
|
these to create an NFS/RDMA enabled mount command), the installation
|
||||||
|
process can be simplified by disabling these features when running
|
||||||
|
configure:
|
||||||
|
|
||||||
|
.. code-block:: sh
|
||||||
|
|
||||||
|
$ ./configure --disable-gss --disable-nfsv4
|
||||||
|
|
||||||
|
To build nfs-utils you will need the tcp_wrappers package installed. For
|
||||||
|
more information on this see the package's README and INSTALL files.
|
||||||
|
|
||||||
|
After building the nfs-utils package, there will be a mount.nfs binary in
|
||||||
|
the utils/mount directory. This binary can be used to initiate NFS v2, v3,
|
||||||
|
or v4 mounts. To initiate a v4 mount, the binary must be called
|
||||||
|
mount.nfs4. The standard technique is to create a symlink called
|
||||||
|
mount.nfs4 to mount.nfs.
|
||||||
|
|
||||||
|
This mount.nfs binary should be installed at /sbin/mount.nfs as follows:
|
||||||
|
|
||||||
|
.. code-block:: sh
|
||||||
|
|
||||||
|
$ sudo cp utils/mount/mount.nfs /sbin/mount.nfs
|
||||||
|
|
||||||
|
In this location, mount.nfs will be invoked automatically for NFS mounts
|
||||||
|
by the system mount command.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
mount.nfs and therefore nfs-utils-1.1.2 or greater is only needed
|
||||||
|
on the NFS client machine. You do not need this specific version of
|
||||||
|
nfs-utils on the server. Furthermore, only the mount.nfs command from
|
||||||
|
nfs-utils-1.1.2 is needed on the client.
|
||||||
|
|
||||||
|
- Install a Linux kernel with NFS/RDMA
|
||||||
|
|
||||||
|
The NFS/RDMA client and server are both included in the mainline Linux
|
||||||
|
kernel version 2.6.25 and later. This and other versions of the Linux
|
||||||
|
kernel can be found at: https://www.kernel.org/pub/linux/kernel/
|
||||||
|
|
||||||
|
Download the sources and place them in an appropriate location.
|
||||||
|
|
||||||
|
- Configure the RDMA stack
|
||||||
|
|
||||||
|
Make sure your kernel configuration has RDMA support enabled. Under
|
||||||
|
Device Drivers -> InfiniBand support, update the kernel configuration
|
||||||
|
to enable InfiniBand support [NOTE: the option name is misleading. Enabling
|
||||||
|
InfiniBand support is required for all RDMA devices (IB, iWARP, etc.)].
|
||||||
|
|
||||||
|
Enable the appropriate IB HCA support (mlx4, mthca, ehca, ipath, etc.) or
|
||||||
|
iWARP adapter support (amso, cxgb3, etc.).
|
||||||
|
|
||||||
|
If you are using InfiniBand, be sure to enable IP-over-InfiniBand support.
|
||||||
|
|
||||||
|
- Configure the NFS client and server
|
||||||
|
|
||||||
|
Your kernel configuration must also have NFS file system support and/or
|
||||||
|
NFS server support enabled. These and other NFS related configuration
|
||||||
|
options can be found under File Systems -> Network File Systems.
|
||||||
|
|
||||||
|
- Build, install, reboot
|
||||||
|
|
||||||
|
The NFS/RDMA code will be enabled automatically if NFS and RDMA
|
||||||
|
are turned on. The NFS/RDMA client and server are configured via the hidden
|
||||||
|
SUNRPC_XPRT_RDMA config option that depends on SUNRPC and INFINIBAND. The
|
||||||
|
value of SUNRPC_XPRT_RDMA will be:
|
||||||
|
|
||||||
|
#. N if either SUNRPC or INFINIBAND are N, in this case the NFS/RDMA client
|
||||||
|
and server will not be built
|
||||||
|
|
||||||
|
#. M if both SUNRPC and INFINIBAND are on (M or Y) and at least one is M,
|
||||||
|
in this case the NFS/RDMA client and server will be built as modules
|
||||||
|
|
||||||
|
#. Y if both SUNRPC and INFINIBAND are Y, in this case the NFS/RDMA client
|
||||||
|
and server will be built into the kernel
|
||||||
|
|
||||||
|
Therefore, if you have followed the steps above and turned on NFS and RDMA,
|
||||||
|
the NFS/RDMA client and server will be built.
|
||||||
|
|
||||||
|
Build a new kernel, install it, boot it.
|
||||||
|
|
||||||
|
Check RDMA and NFS Setup
|
||||||
|
========================
|
||||||
|
|
||||||
|
Before configuring the NFS/RDMA software, it is a good idea to test
|
||||||
|
your new kernel to ensure that the kernel is working correctly.
|
||||||
|
In particular, it is a good idea to verify that the RDMA stack
|
||||||
|
is functioning as expected and standard NFS over TCP/IP and/or UDP/IP
|
||||||
|
is working properly.
|
||||||
|
|
||||||
|
- Check RDMA Setup
|
||||||
|
|
||||||
|
If you built the RDMA components as modules, load them at
|
||||||
|
this time. For example, if you are using a Mellanox Tavor/Sinai/Arbel
|
||||||
|
card:
|
||||||
|
|
||||||
|
.. code-block:: sh
|
||||||
|
|
||||||
|
$ modprobe ib_mthca
|
||||||
|
$ modprobe ib_ipoib
|
||||||
|
|
||||||
|
If you are using InfiniBand, make sure there is a Subnet Manager (SM)
|
||||||
|
running on the network. If your IB switch has an embedded SM, you can
|
||||||
|
use it. Otherwise, you will need to run an SM, such as OpenSM, on one
|
||||||
|
of your end nodes.
|
||||||
|
|
||||||
|
If an SM is running on your network, you should see the following:
|
||||||
|
|
||||||
|
.. code-block:: sh
|
||||||
|
|
||||||
|
$ cat /sys/class/infiniband/driverX/ports/1/state
|
||||||
|
4: ACTIVE
|
||||||
|
|
||||||
|
where driverX is mthca0, ipath5, ehca3, etc.
|
||||||
|
|
||||||
|
To further test the InfiniBand software stack, use IPoIB (this
|
||||||
|
assumes you have two IB hosts named host1 and host2):
|
||||||
|
|
||||||
|
.. code-block:: sh
|
||||||
|
|
||||||
|
host1$ ip link set dev ib0 up
|
||||||
|
host1$ ip address add dev ib0 a.b.c.x
|
||||||
|
host2$ ip link set dev ib0 up
|
||||||
|
host2$ ip address add dev ib0 a.b.c.y
|
||||||
|
host1$ ping a.b.c.y
|
||||||
|
host2$ ping a.b.c.x
|
||||||
|
|
||||||
|
For other device types, follow the appropriate procedures.
|
||||||
|
|
||||||
|
- Check NFS Setup
|
||||||
|
|
||||||
|
For the NFS components enabled above (client and/or server),
|
||||||
|
test their functionality over standard Ethernet using TCP/IP or UDP/IP.
|
||||||
|
|
||||||
|
NFS/RDMA Setup
|
||||||
|
==============
|
||||||
|
|
||||||
|
We recommend that you use two machines, one to act as the client and
|
||||||
|
one to act as the server.
|
||||||
|
|
||||||
|
One time configuration:
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
- On the server system, configure the /etc/exports file and start the NFS/RDMA server.
|
||||||
|
|
||||||
|
Exports entries with the following formats have been tested::
|
||||||
|
|
||||||
|
/vol0 192.168.0.47(fsid=0,rw,async,insecure,no_root_squash)
|
||||||
|
/vol0 192.168.0.0/255.255.255.0(fsid=0,rw,async,insecure,no_root_squash)
|
||||||
|
|
||||||
|
The IP address(es) is(are) the client's IPoIB address for an InfiniBand
|
||||||
|
HCA or the client's iWARP address(es) for an RNIC.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
The "insecure" option must be used because the NFS/RDMA client does
|
||||||
|
not use a reserved port.
|
||||||
|
|
||||||
|
Each time a machine boots:
|
||||||
|
--------------------------
|
||||||
|
|
||||||
|
- Load and configure the RDMA drivers
|
||||||
|
|
||||||
|
For InfiniBand using a Mellanox adapter:
|
||||||
|
|
||||||
|
.. code-block:: sh
|
||||||
|
|
||||||
|
$ modprobe ib_mthca
|
||||||
|
$ modprobe ib_ipoib
|
||||||
|
$ ip li set dev ib0 up
|
||||||
|
$ ip addr add dev ib0 a.b.c.d
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
Please use unique addresses for the client and server!
|
||||||
|
|
||||||
|
- Start the NFS server
|
||||||
|
|
||||||
|
If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
|
||||||
|
kernel config), load the RDMA transport module:
|
||||||
|
|
||||||
|
.. code-block:: sh
|
||||||
|
|
||||||
|
$ modprobe svcrdma
|
||||||
|
|
||||||
|
Regardless of how the server was built (module or built-in), start the
|
||||||
|
server:
|
||||||
|
|
||||||
|
.. code-block:: sh
|
||||||
|
|
||||||
|
$ /etc/init.d/nfs start
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
|
.. code-block:: sh
|
||||||
|
|
||||||
|
$ service nfs start
|
||||||
|
|
||||||
|
Instruct the server to listen on the RDMA transport:
|
||||||
|
|
||||||
|
.. code-block:: sh
|
||||||
|
|
||||||
|
$ echo rdma 20049 > /proc/fs/nfsd/portlist
|
||||||
|
|
||||||
|
- On the client system
|
||||||
|
|
||||||
|
If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
|
||||||
|
kernel config), load the RDMA client module:
|
||||||
|
|
||||||
|
.. code-block:: sh
|
||||||
|
|
||||||
|
$ modprobe xprtrdma.ko
|
||||||
|
|
||||||
|
Regardless of how the client was built (module or built-in), use this
|
||||||
|
command to mount the NFS/RDMA server:
|
||||||
|
|
||||||
|
.. code-block:: sh
|
||||||
|
|
||||||
|
$ mount -o rdma,port=20049 <IPoIB-server-name-or-address>:/<export> /mnt
|
||||||
|
|
||||||
|
To verify that the mount is using RDMA, run "cat /proc/mounts" and check
|
||||||
|
the "proto" field for the given mount.
|
||||||
|
|
||||||
|
Congratulations! You're using NFS/RDMA!
|
@@ -1,5 +1,6 @@
|
|||||||
|
==================================
|
||||||
Administrative interfaces for nfsd
|
Administrative interfaces for nfsd
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
==================================
|
||||||
|
|
||||||
Note that normally these interfaces are used only by the utilities in
|
Note that normally these interfaces are used only by the utilities in
|
||||||
nfs-utils.
|
nfs-utils.
|
||||||
@@ -13,18 +14,16 @@ nfsd/threads.
|
|||||||
Before doing that, NFSD can be told which sockets to listen on by
|
Before doing that, NFSD can be told which sockets to listen on by
|
||||||
writing to nfsd/portlist; that write may be:
|
writing to nfsd/portlist; that write may be:
|
||||||
|
|
||||||
- an ascii-encoded file descriptor, which should refer to a
|
- an ascii-encoded file descriptor, which should refer to a
|
||||||
bound (and listening, for tcp) socket, or
|
bound (and listening, for tcp) socket, or
|
||||||
- "transportname port", where transportname is currently either
|
- "transportname port", where transportname is currently either
|
||||||
"udp", "tcp", or "rdma".
|
"udp", "tcp", or "rdma".
|
||||||
|
|
||||||
If nfsd is started without doing any of these, then it will create one
|
If nfsd is started without doing any of these, then it will create one
|
||||||
udp and one tcp listener at port 2049 (see nfsd_init_socks).
|
udp and one tcp listener at port 2049 (see nfsd_init_socks).
|
||||||
|
|
||||||
On startup, nfsd and lockd grace periods start.
|
On startup, nfsd and lockd grace periods start. nfsd is shut down by a write of
|
||||||
|
0 to nfsd/threads. All locks and state are thrown away at that point.
|
||||||
nfsd is shut down by a write of 0 to nfsd/threads. All locks and state
|
|
||||||
are thrown away at that point.
|
|
||||||
|
|
||||||
Between startup and shutdown, the number of threads may be adjusted up
|
Between startup and shutdown, the number of threads may be adjusted up
|
||||||
or down by additional writes to nfsd/threads or by writes to
|
or down by additional writes to nfsd/threads or by writes to
|
||||||
@@ -34,7 +33,7 @@ For more detail about files under nfsd/ and what they control, see
|
|||||||
fs/nfsd/nfsctl.c; most of them have detailed comments.
|
fs/nfsd/nfsctl.c; most of them have detailed comments.
|
||||||
|
|
||||||
Implementation notes
|
Implementation notes
|
||||||
^^^^^^^^^^^^^^^^^^^^
|
====================
|
||||||
|
|
||||||
Note that the rpc server requires the caller to serialize addition and
|
Note that the rpc server requires the caller to serialize addition and
|
||||||
removal of listening sockets, and startup and shutdown of the server.
|
removal of listening sockets, and startup and shutdown of the server.
|
@@ -1,27 +1,34 @@
|
|||||||
|
===============================================
|
||||||
Mounting the root filesystem via NFS (nfsroot)
|
Mounting the root filesystem via NFS (nfsroot)
|
||||||
===============================================
|
===============================================
|
||||||
|
|
||||||
Written 1996 by Gero Kuhlmann <gero@gkminix.han.de>
|
:Authors:
|
||||||
Updated 1997 by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
|
Written 1996 by Gero Kuhlmann <gero@gkminix.han.de>
|
||||||
Updated 2006 by Nico Schottelius <nico-kernel-nfsroot@schottelius.org>
|
|
||||||
Updated 2006 by Horms <horms@verge.net.au>
|
Updated 1997 by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
|
||||||
Updated 2018 by Chris Novakovic <chris@chrisn.me.uk>
|
|
||||||
|
Updated 2006 by Nico Schottelius <nico-kernel-nfsroot@schottelius.org>
|
||||||
|
|
||||||
|
Updated 2006 by Horms <horms@verge.net.au>
|
||||||
|
|
||||||
|
Updated 2018 by Chris Novakovic <chris@chrisn.me.uk>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
In order to use a diskless system, such as an X-terminal or printer server
|
In order to use a diskless system, such as an X-terminal or printer server for
|
||||||
for example, it is necessary for the root filesystem to be present on a
|
example, it is necessary for the root filesystem to be present on a non-disk
|
||||||
non-disk device. This may be an initramfs (see Documentation/filesystems/
|
device. This may be an initramfs (see
|
||||||
ramfs-rootfs-initramfs.txt), a ramdisk (see Documentation/admin-guide/initrd.rst) or a
|
Documentation/filesystems/ramfs-rootfs-initramfs.txt), a ramdisk (see
|
||||||
filesystem mounted via NFS. The following text describes on how to use NFS
|
Documentation/admin-guide/initrd.rst) or a filesystem mounted via NFS. The
|
||||||
for the root filesystem. For the rest of this text 'client' means the
|
following text describes how to use NFS for the root filesystem. For the rest
|
||||||
diskless system, and 'server' means the NFS server.
|
of this text 'client' means the diskless system, and 'server' means the NFS
|
||||||
|
server.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
1.) Enabling nfsroot capabilities
|
Enabling nfsroot capabilities
|
||||||
-----------------------------
|
=============================
|
||||||
|
|
||||||
In order to use nfsroot, NFS client support needs to be selected as
|
In order to use nfsroot, NFS client support needs to be selected as
|
||||||
built-in during configuration. Once this has been selected, the nfsroot
|
built-in during configuration. Once this has been selected, the nfsroot
|
||||||
@@ -34,8 +41,8 @@ DHCP, BOOTP and RARP is safe.
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
2.) Kernel command line
|
Kernel command line
|
||||||
-------------------
|
===================
|
||||||
|
|
||||||
When the kernel has been loaded by a boot loader (see below) it needs to be
|
When the kernel has been loaded by a boot loader (see below) it needs to be
|
||||||
told what root fs device to use. And in the case of nfsroot, where to find
|
told what root fs device to use. And in the case of nfsroot, where to find
|
||||||
@@ -44,19 +51,17 @@ This can be established using the following kernel command line parameters:
|
|||||||
|
|
||||||
|
|
||||||
root=/dev/nfs
|
root=/dev/nfs
|
||||||
|
|
||||||
This is necessary to enable the pseudo-NFS-device. Note that it's not a
|
This is necessary to enable the pseudo-NFS-device. Note that it's not a
|
||||||
real device but just a synonym to tell the kernel to use NFS instead of
|
real device but just a synonym to tell the kernel to use NFS instead of
|
||||||
a real device.
|
a real device.
|
||||||
|
|
||||||
|
|
||||||
nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
|
nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
|
||||||
|
|
||||||
If the `nfsroot' parameter is NOT given on the command line,
|
If the `nfsroot' parameter is NOT given on the command line,
|
||||||
the default "/tftpboot/%s" will be used.
|
the default ``"/tftpboot/%s"`` will be used.
|
||||||
|
|
||||||
<server-ip> Specifies the IP address of the NFS server.
|
<server-ip> Specifies the IP address of the NFS server.
|
||||||
The default address is determined by the `ip' parameter
|
The default address is determined by the ip parameter
|
||||||
(see below). This parameter allows the use of different
|
(see below). This parameter allows the use of different
|
||||||
servers for IP autoconfiguration and NFS.
|
servers for IP autoconfiguration and NFS.
|
||||||
|
|
||||||
@@ -66,7 +71,8 @@ nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
|
|||||||
IP address.
|
IP address.
|
||||||
|
|
||||||
<nfs-options> Standard NFS options. All options are separated by commas.
|
<nfs-options> Standard NFS options. All options are separated by commas.
|
||||||
The following defaults are used:
|
The following defaults are used::
|
||||||
|
|
||||||
port = as given by server portmap daemon
|
port = as given by server portmap daemon
|
||||||
rsize = 4096
|
rsize = 4096
|
||||||
wsize = 4096
|
wsize = 4096
|
||||||
@@ -79,13 +85,11 @@ nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
|
|||||||
flags = hard, nointr, noposix, cto, ac
|
flags = hard, nointr, noposix, cto, ac
|
||||||
|
|
||||||
|
|
||||||
ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
|
ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:<dns0-ip>:<dns1-ip>:<ntp0-ip>
|
||||||
<dns0-ip>:<dns1-ip>:<ntp0-ip>
|
|
||||||
|
|
||||||
This parameter tells the kernel how to configure IP addresses of devices
|
This parameter tells the kernel how to configure IP addresses of devices
|
||||||
and also how to set up the IP routing table. It was originally called
|
and also how to set up the IP routing table. It was originally called
|
||||||
`nfsaddrs', but now the boot-time IP configuration works independently of
|
nfsaddrs, but now the boot-time IP configuration works independently of
|
||||||
NFS, so it was renamed to `ip' and the old name remained as an alias for
|
NFS, so it was renamed to ip and the old name remained as an alias for
|
||||||
compatibility reasons.
|
compatibility reasons.
|
||||||
|
|
||||||
If this parameter is missing from the kernel command line, all fields are
|
If this parameter is missing from the kernel command line, all fields are
|
||||||
@@ -93,17 +97,17 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
|
|||||||
this means that the kernel tries to configure everything using
|
this means that the kernel tries to configure everything using
|
||||||
autoconfiguration.
|
autoconfiguration.
|
||||||
|
|
||||||
The <autoconf> parameter can appear alone as the value to the `ip'
|
The <autoconf> parameter can appear alone as the value to the ip
|
||||||
parameter (without all the ':' characters before). If the value is
|
parameter (without all the ':' characters before). If the value is
|
||||||
"ip=off" or "ip=none", no autoconfiguration will take place, otherwise
|
"ip=off" or "ip=none", no autoconfiguration will take place, otherwise
|
||||||
autoconfiguration will take place. The most common way to use this
|
autoconfiguration will take place. The most common way to use this
|
||||||
is "ip=dhcp".
|
is "ip=dhcp".
|
||||||
|
|
||||||
<client-ip> IP address of the client.
|
<client-ip> IP address of the client.
|
||||||
|
|
||||||
Default: Determined using autoconfiguration.
|
Default: Determined using autoconfiguration.
|
||||||
|
|
||||||
<server-ip> IP address of the NFS server. If RARP is used to determine
|
<server-ip> IP address of the NFS server.
|
||||||
|
If RARP is used to determine
|
||||||
the client address and this parameter is NOT empty only
|
the client address and this parameter is NOT empty only
|
||||||
replies from the specified server are accepted.
|
replies from the specified server are accepted.
|
||||||
|
|
||||||
@@ -115,19 +119,19 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
|
|||||||
(see below).
|
(see below).
|
||||||
|
|
||||||
Default: Determined using autoconfiguration.
|
Default: Determined using autoconfiguration.
|
||||||
The address of the autoconfiguration server is used.
|
The address of the autoconfiguration server is used.
|
||||||
|
|
||||||
<gw-ip> IP address of a gateway if the server is on a different subnet.
|
<gw-ip> IP address of a gateway if the server is on a different subnet.
|
||||||
|
|
||||||
Default: Determined using autoconfiguration.
|
Default: Determined using autoconfiguration.
|
||||||
|
|
||||||
<netmask> Netmask for local network interface. If unspecified
|
<netmask> Netmask for local network interface.
|
||||||
the netmask is derived from the client IP address assuming
|
If unspecified the netmask is derived from the client IP address
|
||||||
classful addressing.
|
assuming classful addressing.
|
||||||
|
|
||||||
Default: Determined using autoconfiguration.
|
Default: Determined using autoconfiguration.
|
||||||
|
|
||||||
<hostname> Name of the client. If a '.' character is present, anything
|
<hostname> Name of the client.
|
||||||
|
If a '.' character is present, anything
|
||||||
before the first '.' is used as the client's hostname, and anything
|
before the first '.' is used as the client's hostname, and anything
|
||||||
after it is used as its NIS domain name. May be supplied by
|
after it is used as its NIS domain name. May be supplied by
|
||||||
autoconfiguration, but its absence will not trigger autoconfiguration.
|
autoconfiguration, but its absence will not trigger autoconfiguration.
|
||||||
@@ -138,21 +142,21 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
|
|||||||
Default: Client IP address is used in ASCII notation.
|
Default: Client IP address is used in ASCII notation.
|
||||||
|
|
||||||
<device> Name of network device to use.
|
<device> Name of network device to use.
|
||||||
|
|
||||||
Default: If the host only has one device, it is used.
|
Default: If the host only has one device, it is used.
|
||||||
Otherwise the device is determined using
|
Otherwise the device is determined using
|
||||||
autoconfiguration. This is done by sending
|
autoconfiguration. This is done by sending
|
||||||
autoconfiguration requests out of all devices,
|
autoconfiguration requests out of all devices,
|
||||||
and using the device that received the first reply.
|
and using the device that received the first reply.
|
||||||
|
|
||||||
<autoconf> Method to use for autoconfiguration. In the case of options
|
<autoconf> Method to use for autoconfiguration.
|
||||||
which specify multiple autoconfiguration protocols,
|
In the case of options
|
||||||
|
which specify multiple autoconfiguration protocols,
|
||||||
requests are sent using all protocols, and the first one
|
requests are sent using all protocols, and the first one
|
||||||
to reply is used.
|
to reply is used.
|
||||||
|
|
||||||
Only autoconfiguration protocols that have been compiled
|
Only autoconfiguration protocols that have been compiled
|
||||||
into the kernel will be used, regardless of the value of
|
into the kernel will be used, regardless of the value of
|
||||||
this option.
|
this option::
|
||||||
|
|
||||||
off or none: don't use autoconfiguration
|
off or none: don't use autoconfiguration
|
||||||
(do static IP assignment instead)
|
(do static IP assignment instead)
|
||||||
@@ -221,7 +225,6 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
|
|||||||
|
|
||||||
|
|
||||||
nfsrootdebug
|
nfsrootdebug
|
||||||
|
|
||||||
This parameter enables debugging messages to appear in the kernel
|
This parameter enables debugging messages to appear in the kernel
|
||||||
log at boot time so that administrators can verify that the correct
|
log at boot time so that administrators can verify that the correct
|
||||||
NFS mount options, server address, and root path are passed to the
|
NFS mount options, server address, and root path are passed to the
|
||||||
@@ -229,36 +232,32 @@ nfsrootdebug
|
|||||||
|
|
||||||
|
|
||||||
rdinit=<executable file>
|
rdinit=<executable file>
|
||||||
|
|
||||||
To specify which file contains the program that starts system
|
To specify which file contains the program that starts system
|
||||||
initialization, administrators can use this command line parameter.
|
initialization, administrators can use this command line parameter.
|
||||||
The default value of this parameter is "/init". If the specified
|
The default value of this parameter is "/init". If the specified
|
||||||
file exists and the kernel can execute it, root filesystem related
|
file exists and the kernel can execute it, root filesystem related
|
||||||
kernel command line parameters, including `nfsroot=', are ignored.
|
kernel command line parameters, including 'nfsroot=', are ignored.
|
||||||
|
|
||||||
A description of the process of mounting the root file system can be
|
A description of the process of mounting the root file system can be
|
||||||
found in:
|
found in Documentation/driver-api/early-userspace/early_userspace_support.rst
|
||||||
|
|
||||||
Documentation/driver-api/early-userspace/early_userspace_support.rst
|
|
||||||
|
|
||||||
|
|
||||||
|
Boot Loader
|
||||||
|
===========
|
||||||
3.) Boot Loader
|
|
||||||
----------
|
|
||||||
|
|
||||||
To get the kernel into memory different approaches can be used.
|
To get the kernel into memory different approaches can be used.
|
||||||
They depend on various facilities being available:
|
They depend on various facilities being available:
|
||||||
|
|
||||||
|
|
||||||
3.1) Booting from a floppy using syslinux
|
- Booting from a floppy using syslinux
|
||||||
|
|
||||||
When building kernels, an easy way to create a boot floppy that uses
|
When building kernels, an easy way to create a boot floppy that uses
|
||||||
syslinux is to use the zdisk or bzdisk make targets which use zimage
|
syslinux is to use the zdisk or bzdisk make targets which use zimage
|
||||||
and bzimage images respectively. Both targets accept the
|
and bzimage images respectively. Both targets accept the
|
||||||
FDARGS parameter which can be used to set the kernel command line.
|
FDARGS parameter which can be used to set the kernel command line.
|
||||||
|
|
||||||
e.g.
|
e.g::
|
||||||
|
|
||||||
make bzdisk FDARGS="root=/dev/nfs"
|
make bzdisk FDARGS="root=/dev/nfs"
|
||||||
|
|
||||||
Note that the user running this command will need to have
|
Note that the user running this command will need to have
|
||||||
@@ -267,32 +266,36 @@ They depend on various facilities being available:
|
|||||||
For more information on syslinux, including how to create bootdisks
|
For more information on syslinux, including how to create bootdisks
|
||||||
for prebuilt kernels, see http://syslinux.zytor.com/
|
for prebuilt kernels, see http://syslinux.zytor.com/
|
||||||
|
|
||||||
N.B: Previously it was possible to write a kernel directly to
|
.. note::
|
||||||
a floppy using dd, configure the boot device using rdev, and
|
Previously it was possible to write a kernel directly to
|
||||||
boot using the resulting floppy. Linux no longer supports this
|
a floppy using dd, configure the boot device using rdev, and
|
||||||
method of booting.
|
boot using the resulting floppy. Linux no longer supports this
|
||||||
|
method of booting.
|
||||||
|
|
||||||
3.2) Booting from a cdrom using isolinux
|
- Booting from a cdrom using isolinux
|
||||||
|
|
||||||
When building kernels, an easy way to create a bootable cdrom that
|
When building kernels, an easy way to create a bootable cdrom that
|
||||||
uses isolinux is to use the isoimage target which uses a bzimage
|
uses isolinux is to use the isoimage target which uses a bzimage
|
||||||
image. Like zdisk and bzdisk, this target accepts the FDARGS
|
image. Like zdisk and bzdisk, this target accepts the FDARGS
|
||||||
parameter which can be used to set the kernel command line.
|
parameter which can be used to set the kernel command line.
|
||||||
|
|
||||||
e.g.
|
e.g::
|
||||||
|
|
||||||
make isoimage FDARGS="root=/dev/nfs"
|
make isoimage FDARGS="root=/dev/nfs"
|
||||||
|
|
||||||
The resulting iso image will be arch/<ARCH>/boot/image.iso
|
The resulting iso image will be arch/<ARCH>/boot/image.iso
|
||||||
This can be written to a cdrom using a variety of tools including
|
This can be written to a cdrom using a variety of tools including
|
||||||
cdrecord.
|
cdrecord.
|
||||||
|
|
||||||
e.g.
|
e.g::
|
||||||
|
|
||||||
cdrecord dev=ATAPI:1,0,0 arch/x86/boot/image.iso
|
cdrecord dev=ATAPI:1,0,0 arch/x86/boot/image.iso
|
||||||
|
|
||||||
For more information on isolinux, including how to create bootdisks
|
For more information on isolinux, including how to create bootdisks
|
||||||
for prebuilt kernels, see http://syslinux.zytor.com/
|
for prebuilt kernels, see http://syslinux.zytor.com/
|
||||||
|
|
||||||
3.2) Using LILO
|
- Using LILO
|
||||||
|
|
||||||
When using LILO all the necessary command line parameters may be
|
When using LILO all the necessary command line parameters may be
|
||||||
specified using the 'append=' directive in the LILO configuration
|
specified using the 'append=' directive in the LILO configuration
|
||||||
file.
|
file.
|
||||||
@@ -300,15 +303,19 @@ They depend on various facilities being available:
|
|||||||
However, to use the 'root=' directive you also need to create
|
However, to use the 'root=' directive you also need to create
|
||||||
a dummy root device, which may be removed after LILO is run.
|
a dummy root device, which may be removed after LILO is run.
|
||||||
|
|
||||||
mknod /dev/boot255 c 0 255
|
e.g::
|
||||||
|
|
||||||
|
mknod /dev/boot255 c 0 255
|
||||||
|
|
||||||
For information on configuring LILO, please refer to its documentation.
|
For information on configuring LILO, please refer to its documentation.
|
||||||
|
|
||||||
3.3) Using GRUB
|
- Using GRUB
|
||||||
|
|
||||||
When using GRUB, kernel parameters are simply appended after the kernel
|
When using GRUB, kernel parameters are simply appended after the kernel
|
||||||
specification: kernel <kernel> <parameters>
|
specification: kernel <kernel> <parameters>
|
||||||
|
|
||||||
3.4) Using loadlin
|
- Using loadlin
|
||||||
|
|
||||||
loadlin may be used to boot Linux from a DOS command prompt without
|
loadlin may be used to boot Linux from a DOS command prompt without
|
||||||
requiring a local hard disk to mount as root. This has not been
|
requiring a local hard disk to mount as root. This has not been
|
||||||
thoroughly tested by the authors of this document, but in general
|
thoroughly tested by the authors of this document, but in general
|
||||||
@@ -317,7 +324,8 @@ They depend on various facilities being available:
|
|||||||
|
|
||||||
Please refer to the loadlin documentation for further information.
|
Please refer to the loadlin documentation for further information.
|
||||||
|
|
||||||
3.5) Using a boot ROM
|
- Using a boot ROM
|
||||||
|
|
||||||
This is probably the most elegant way of booting a diskless client.
|
This is probably the most elegant way of booting a diskless client.
|
||||||
With a boot ROM the kernel is loaded using the TFTP protocol. The
|
With a boot ROM the kernel is loaded using the TFTP protocol. The
|
||||||
authors of this document are not aware of any commercial boot
|
authors of this document are not aware of any commercial boot
|
||||||
@@ -326,7 +334,8 @@ They depend on various facilities being available:
|
|||||||
etherboot, both of which are available on sunsite.unc.edu, and both
|
etherboot, both of which are available on sunsite.unc.edu, and both
|
||||||
of which contain everything you need to boot a diskless Linux client.
|
of which contain everything you need to boot a diskless Linux client.
|
||||||
|
|
||||||
3.6) Using pxelinux
|
- Using pxelinux
|
||||||
|
|
||||||
Pxelinux may be used to boot linux using the PXE boot loader
|
Pxelinux may be used to boot linux using the PXE boot loader
|
||||||
which is present on many modern network cards.
|
which is present on many modern network cards.
|
||||||
|
|
||||||
@@ -342,8 +351,8 @@ They depend on various facilities being available:
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
4.) Credits
|
Credits
|
||||||
-------
|
=======
|
||||||
|
|
||||||
The nfsroot code in the kernel and the RARP support have been written
|
The nfsroot code in the kernel and the RARP support have been written
|
||||||
by Gero Kuhlmann <gero@gkminix.han.de>.
|
by Gero Kuhlmann <gero@gkminix.han.de>.
|
@@ -1,4 +1,6 @@
|
|||||||
|
===================================
|
||||||
pNFS block layout server user guide
|
pNFS block layout server user guide
|
||||||
|
===================================
|
||||||
|
|
||||||
The Linux NFS server now supports the pNFS block layout extension. In this
|
The Linux NFS server now supports the pNFS block layout extension. In this
|
||||||
case the NFS server acts as Metadata Server (MDS) for pNFS, which in addition
|
case the NFS server acts as Metadata Server (MDS) for pNFS, which in addition
|
||||||
@@ -22,16 +24,19 @@ If the nfsd server needs to fence a non-responding client it calls
|
|||||||
/sbin/nfsd-recall-failed with the first argument set to the IP address of
|
/sbin/nfsd-recall-failed with the first argument set to the IP address of
|
||||||
the client, and the second argument set to the device node without the /dev
|
the client, and the second argument set to the device node without the /dev
|
||||||
prefix for the file system to be fenced. Below is an example file that shows
|
prefix for the file system to be fenced. Below is an example file that shows
|
||||||
how to translate the device into a serial number from SCSI EVPD 0x80:
|
how to translate the device into a serial number from SCSI EVPD 0x80::
|
||||||
|
|
||||||
cat > /sbin/nfsd-recall-failed << EOF
|
cat > /sbin/nfsd-recall-failed << EOF
|
||||||
#!/bin/sh
|
|
||||||
|
|
||||||
CLIENT="$1"
|
.. code-block:: sh
|
||||||
DEV="/dev/$2"
|
|
||||||
EVPD=`sg_inq --page=0x80 ${DEV} | \
|
|
||||||
grep "Unit serial number:" | \
|
|
||||||
awk -F ': ' '{print $2}'`
|
|
||||||
|
|
||||||
echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log
|
#!/bin/sh
|
||||||
EOF
|
|
||||||
|
CLIENT="$1"
|
||||||
|
DEV="/dev/$2"
|
||||||
|
EVPD=`sg_inq --page=0x80 ${DEV} | \
|
||||||
|
grep "Unit serial number:" | \
|
||||||
|
awk -F ': ' '{print $2}'`
|
||||||
|
|
||||||
|
echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log
|
||||||
|
EOF
|
@@ -1,4 +1,5 @@
|
|||||||
|
|
||||||
|
==================================
|
||||||
pNFS SCSI layout server user guide
|
pNFS SCSI layout server user guide
|
||||||
==================================
|
==================================
|
||||||
|
|
@@ -506,6 +506,9 @@ object corresponding to it, as follows:
|
|||||||
``disable``
|
``disable``
|
||||||
Whether or not this idle state is disabled.
|
Whether or not this idle state is disabled.
|
||||||
|
|
||||||
|
``default_status``
|
||||||
|
The default status of this state, "enabled" or "disabled".
|
||||||
|
|
||||||
``latency``
|
``latency``
|
||||||
Exit latency of the idle state in microseconds.
|
Exit latency of the idle state in microseconds.
|
||||||
|
|
||||||
@@ -629,16 +632,16 @@ class priority list and destroyed. If that happens, the priority list mechanism
|
|||||||
will be used, again, to determine the new effective value for the whole list
|
will be used, again, to determine the new effective value for the whole list
|
||||||
and that value will become the new real constraint.
|
and that value will become the new real constraint.
|
||||||
|
|
||||||
In turn, for each CPU there is only one resume latency PM QoS request
|
In turn, for each CPU there is one resume latency PM QoS request associated with
|
||||||
associated with the :file:`power/pm_qos_resume_latency_us` file under
|
the :file:`power/pm_qos_resume_latency_us` file under
|
||||||
:file:`/sys/devices/system/cpu/cpu<N>/` in ``sysfs`` and writing to it causes
|
:file:`/sys/devices/system/cpu/cpu<N>/` in ``sysfs`` and writing to it causes
|
||||||
this single PM QoS request to be updated regardless of which user space
|
this single PM QoS request to be updated regardless of which user space
|
||||||
process does that. In other words, this PM QoS request is shared by the entire
|
process does that. In other words, this PM QoS request is shared by the entire
|
||||||
user space, so access to the file associated with it needs to be arbitrated
|
user space, so access to the file associated with it needs to be arbitrated
|
||||||
to avoid confusion. [Arguably, the only legitimate use of this mechanism in
|
to avoid confusion. [Arguably, the only legitimate use of this mechanism in
|
||||||
practice is to pin a process to the CPU in question and let it use the
|
practice is to pin a process to the CPU in question and let it use the
|
||||||
``sysfs`` interface to control the resume latency constraint for it.] It
|
``sysfs`` interface to control the resume latency constraint for it.] It is
|
||||||
still only is a request, however. It is a member of a priority list used to
|
still only a request, however. It is an entry in a priority list used to
|
||||||
determine the effective value to be set as the resume latency constraint for the
|
determine the effective value to be set as the resume latency constraint for the
|
||||||
CPU in question every time the list of requests is updated this way or another
|
CPU in question every time the list of requests is updated this way or another
|
||||||
(there may be other requests coming from kernel code in that list).
|
(there may be other requests coming from kernel code in that list).
|
||||||
|
268
Documentation/admin-guide/pm/intel_idle.rst
Normal file
268
Documentation/admin-guide/pm/intel_idle.rst
Normal file
@@ -0,0 +1,268 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
.. include:: <isonum.txt>
|
||||||
|
|
||||||
|
==============================================
|
||||||
|
``intel_idle`` CPU Idle Time Management Driver
|
||||||
|
==============================================
|
||||||
|
|
||||||
|
:Copyright: |copy| 2020 Intel Corporation
|
||||||
|
|
||||||
|
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||||
|
|
||||||
|
|
||||||
|
General Information
|
||||||
|
===================
|
||||||
|
|
||||||
|
``intel_idle`` is a part of the
|
||||||
|
:doc:`CPU idle time management subsystem <cpuidle>` in the Linux kernel
|
||||||
|
(``CPUIdle``). It is the default CPU idle time management driver for the
|
||||||
|
Nehalem and later generations of Intel processors, but the level of support for
|
||||||
|
a particular processor model in it depends on whether or not it recognizes that
|
||||||
|
processor model and may also depend on information coming from the platform
|
||||||
|
firmware. [To understand ``intel_idle`` it is necessary to know how ``CPUIdle``
|
||||||
|
works in general, so this is the time to get familiar with :doc:`cpuidle` if you
|
||||||
|
have not done that yet.]
|
||||||
|
|
||||||
|
``intel_idle`` uses the ``MWAIT`` instruction to inform the processor that the
|
||||||
|
logical CPU executing it is idle and so it may be possible to put some of the
|
||||||
|
processor's functional blocks into low-power states. That instruction takes two
|
||||||
|
arguments (passed in the ``EAX`` and ``ECX`` registers of the target CPU), the
|
||||||
|
first of which, referred to as a *hint*, can be used by the processor to
|
||||||
|
determine what can be done (for details refer to Intel Software Developer’s
|
||||||
|
Manual [1]_). Accordingly, ``intel_idle`` refuses to work with processors in
|
||||||
|
which the support for the ``MWAIT`` instruction has been disabled (for example,
|
||||||
|
via the platform firmware configuration menu) or which do not support that
|
||||||
|
instruction at all.
|
||||||
|
|
||||||
|
``intel_idle`` is not modular, so it cannot be unloaded, which means that the
|
||||||
|
only way to pass early-configuration-time parameters to it is via the kernel
|
||||||
|
command line.
|
||||||
|
|
||||||
|
|
||||||
|
.. _intel-idle-enumeration-of-states:
|
||||||
|
|
||||||
|
Enumeration of Idle States
|
||||||
|
==========================
|
||||||
|
|
||||||
|
Each ``MWAIT`` hint value is interpreted by the processor as a license to
|
||||||
|
reconfigure itself in a certain way in order to save energy. The processor
|
||||||
|
configurations (with reduced power draw) resulting from that are referred to
|
||||||
|
as C-states (in the ACPI terminology) or idle states. The list of meaningful
|
||||||
|
``MWAIT`` hint values and idle states (i.e. low-power configurations of the
|
||||||
|
processor) corresponding to them depends on the processor model and it may also
|
||||||
|
depend on the configuration of the platform.
|
||||||
|
|
||||||
|
In order to create a list of available idle states required by the ``CPUIdle``
|
||||||
|
subsystem (see :ref:`idle-states-representation` in :doc:`cpuidle`),
|
||||||
|
``intel_idle`` can use two sources of information: static tables of idle states
|
||||||
|
for different processor models included in the driver itself and the ACPI tables
|
||||||
|
of the system. The former are always used if the processor model at hand is
|
||||||
|
recognized by ``intel_idle`` and the latter are used if that is required for
|
||||||
|
the given processor model (which is the case for all server processor models
|
||||||
|
recognized by ``intel_idle``) or if the processor model is not recognized.
|
||||||
|
[There is a module parameter that can be used to make the driver use the ACPI
|
||||||
|
tables with any processor model recognized by it; see
|
||||||
|
`below <intel-idle-parameters_>`_.]
|
||||||
|
|
||||||
|
If the ACPI tables are going to be used for building the list of available idle
|
||||||
|
states, ``intel_idle`` first looks for a ``_CST`` object under one of the ACPI
|
||||||
|
objects corresponding to the CPUs in the system (refer to the ACPI specification
|
||||||
|
[2]_ for the description of ``_CST`` and its output package). Because the
|
||||||
|
``CPUIdle`` subsystem expects that the list of idle states supplied by the
|
||||||
|
driver will be suitable for all of the CPUs handled by it and ``intel_idle`` is
|
||||||
|
registered as the ``CPUIdle`` driver for all of the CPUs in the system, the
|
||||||
|
driver looks for the first ``_CST`` object returning at least one valid idle
|
||||||
|
state description and such that all of the idle states included in its return
|
||||||
|
package are of the FFH (Functional Fixed Hardware) type, which means that the
|
||||||
|
``MWAIT`` instruction is expected to be used to tell the processor that it can
|
||||||
|
enter one of them. The return package of that ``_CST`` is then assumed to be
|
||||||
|
applicable to all of the other CPUs in the system and the idle state
|
||||||
|
descriptions extracted from it are stored in a preliminary list of idle states
|
||||||
|
coming from the ACPI tables. [This step is skipped if ``intel_idle`` is
|
||||||
|
configured to ignore the ACPI tables; see `below <intel-idle-parameters_>`_.]
|
||||||
|
|
||||||
|
Next, the first (index 0) entry in the list of available idle states is
|
||||||
|
initialized to represent a "polling idle state" (a pseudo-idle state in which
|
||||||
|
the target CPU continuously fetches and executes instructions), and the
|
||||||
|
subsequent (real) idle state entries are populated as follows.
|
||||||
|
|
||||||
|
If the processor model at hand is recognized by ``intel_idle``, there is a
|
||||||
|
(static) table of idle state descriptions for it in the driver. In that case,
|
||||||
|
the "internal" table is the primary source of information on idle states and the
|
||||||
|
information from it is copied to the final list of available idle states. If
|
||||||
|
using the ACPI tables for the enumeration of idle states is not required
|
||||||
|
(depending on the processor model), all of the listed idle states are enabled by
|
||||||
|
default (so all of them will be taken into consideration by ``CPUIdle``
|
||||||
|
governors during CPU idle state selection). Otherwise, some of the listed idle
|
||||||
|
states may not be enabled by default if there are no matching entries in the
|
||||||
|
preliminary list of idle states coming from the ACPI tables. In that case user
|
||||||
|
space still can enable them later (on a per-CPU basis) with the help of
|
||||||
|
the ``disable`` idle state attribute in ``sysfs`` (see
|
||||||
|
:ref:`idle-states-representation` in :doc:`cpuidle`). This basically means that
|
||||||
|
the idle states "known" to the driver may not be enabled by default if they have
|
||||||
|
not been exposed by the platform firmware (through the ACPI tables).
|
||||||
|
|
||||||
|
If the given processor model is not recognized by ``intel_idle``, but it
|
||||||
|
supports ``MWAIT``, the preliminary list of idle states coming from the ACPI
|
||||||
|
tables is used for building the final list that will be supplied to the
|
||||||
|
``CPUIdle`` core during driver registration. For each idle state in that list,
|
||||||
|
the description, ``MWAIT`` hint and exit latency are copied to the corresponding
|
||||||
|
entry in the final list of idle states. The name of the idle state represented
|
||||||
|
by it (to be returned by the ``name`` idle state attribute in ``sysfs``) is
|
||||||
|
"CX_ACPI", where X is the index of that idle state in the final list (note that
|
||||||
|
the minimum value of X is 1, because 0 is reserved for the "polling" state), and
|
||||||
|
its target residency is based on the exit latency value. Specifically, for
|
||||||
|
C1-type idle states the exit latency value is also used as the target residency
|
||||||
|
(for compatibility with the majority of the "internal" tables of idle states for
|
||||||
|
various processor models recognized by ``intel_idle``) and for the other idle
|
||||||
|
state types (C2 and C3) the target residency value is 3 times the exit latency
|
||||||
|
(again, that is because it reflects the target residency to exit latency ratio
|
||||||
|
in the majority of cases for the processor models recognized by ``intel_idle``).
|
||||||
|
All of the idle states in the final list are enabled by default in this case.
|
||||||
|
|
||||||
|
|
||||||
|
.. _intel-idle-initialization:
|
||||||
|
|
||||||
|
Initialization
|
||||||
|
==============
|
||||||
|
|
||||||
|
The initialization of ``intel_idle`` starts with checking if the kernel command
|
||||||
|
line options forbid the use of the ``MWAIT`` instruction. If that is the case,
|
||||||
|
an error code is returned right away.
|
||||||
|
|
||||||
|
The next step is to check whether or not the processor model is known to the
|
||||||
|
driver, which determines the idle states enumeration method (see
|
||||||
|
`above <intel-idle-enumeration-of-states_>`_), and whether or not the processor
|
||||||
|
supports ``MWAIT`` (the initialization fails if that is not the case). Then,
|
||||||
|
the ``MWAIT`` support in the processor is enumerated through ``CPUID`` and the
|
||||||
|
driver initialization fails if the level of support is not as expected (for
|
||||||
|
example, if the total number of ``MWAIT`` substates returned is 0).
|
||||||
|
|
||||||
|
Next, if the driver is not configured to ignore the ACPI tables (see
|
||||||
|
`below <intel-idle-parameters_>`_), the idle states information provided by the
|
||||||
|
platform firmware is extracted from them.
|
||||||
|
|
||||||
|
Then, ``CPUIdle`` device objects are allocated for all CPUs and the list of
|
||||||
|
available idle states is created as explained
|
||||||
|
`above <intel-idle-enumeration-of-states_>`_.
|
||||||
|
|
||||||
|
Finally, ``intel_idle`` is registered with the help of cpuidle_register_driver()
|
||||||
|
as the ``CPUIdle`` driver for all CPUs in the system and a CPU online callback
|
||||||
|
for configuring individual CPUs is registered via cpuhp_setup_state(), which
|
||||||
|
(among other things) causes the callback routine to be invoked for all of the
|
||||||
|
CPUs present in the system at that time (each CPU executes its own instance of
|
||||||
|
the callback routine). That routine registers a ``CPUIdle`` device for the CPU
|
||||||
|
running it (which enables the ``CPUIdle`` subsystem to operate that CPU) and
|
||||||
|
optionally performs some CPU-specific initialization actions that may be
|
||||||
|
required for the given processor model.
|
||||||
|
|
||||||
|
|
||||||
|
.. _intel-idle-parameters:
|
||||||
|
|
||||||
|
Kernel Command Line Options and Module Parameters
|
||||||
|
=================================================
|
||||||
|
|
||||||
|
The *x86* architecture support code recognizes three kernel command line
|
||||||
|
options related to CPU idle time management: ``idle=poll``, ``idle=halt``,
|
||||||
|
and ``idle=nomwait``. If any of them is present in the kernel command line, the
|
||||||
|
``MWAIT`` instruction is not allowed to be used, so the initialization of
|
||||||
|
``intel_idle`` will fail.
|
||||||
|
|
||||||
|
Apart from that there are four module parameters recognized by ``intel_idle``
|
||||||
|
itself that can be set via the kernel command line (they cannot be updated via
|
||||||
|
sysfs, so that is the only way to change their values).
|
||||||
|
|
||||||
|
The ``max_cstate`` parameter value is the maximum idle state index in the list
|
||||||
|
of idle states supplied to the ``CPUIdle`` core during the registration of the
|
||||||
|
driver. It is also the maximum number of regular (non-polling) idle states that
|
||||||
|
can be used by ``intel_idle``, so the enumeration of idle states is terminated
|
||||||
|
after finding that number of usable idle states (the other idle states that
|
||||||
|
potentially might have been used if ``max_cstate`` had been greater are not
|
||||||
|
taken into consideration at all). Setting ``max_cstate`` can prevent
|
||||||
|
``intel_idle`` from exposing idle states that are regarded as "too deep" for
|
||||||
|
some reason to the ``CPUIdle`` core, but it does so by making them effectively
|
||||||
|
invisible until the system is shut down and started again which may not always
|
||||||
|
be desirable. In practice, it is only really necessary to do that if the idle
|
||||||
|
states in question cannot be enabled during system startup, because in the
|
||||||
|
working state of the system the CPU power management quality of service (PM
|
||||||
|
QoS) feature can be used to prevent ``CPUIdle`` from touching those idle states
|
||||||
|
even if they have been enumerated (see :ref:`cpu-pm-qos` in :doc:`cpuidle`).
|
||||||
|
Setting ``max_cstate`` to 0 causes the ``intel_idle`` initialization to fail.
|
||||||
|
|
||||||
|
The ``no_acpi`` and ``use_acpi`` module parameters (recognized by ``intel_idle``
|
||||||
|
if the kernel has been configured with ACPI support) can be set to make the
|
||||||
|
driver ignore the system's ACPI tables entirely or use them for all of the
|
||||||
|
recognized processor models, respectively (they both are unset by default and
|
||||||
|
``use_acpi`` has no effect if ``no_acpi`` is set).
|
||||||
|
|
||||||
|
The value of the ``states_off`` module parameter (0 by default) represents a
|
||||||
|
list of idle states to be disabled by default in the form of a bitmask.
|
||||||
|
|
||||||
|
Namely, the positions of the bits that are set in the ``states_off`` value are
|
||||||
|
the indices of idle states to be disabled by default (as reflected by the names
|
||||||
|
of the corresponding idle state directories in ``sysfs``, :file:`state0`,
|
||||||
|
:file:`state1` ... :file:`state<i>` ..., where ``<i>`` is the index of the given
|
||||||
|
idle state; see :ref:`idle-states-representation` in :doc:`cpuidle`).
|
||||||
|
|
||||||
|
For example, if ``states_off`` is equal to 3, the driver will disable idle
|
||||||
|
states 0 and 1 by default, and if it is equal to 8, idle state 3 will be
|
||||||
|
disabled by default and so on (bit positions beyond the maximum idle state index
|
||||||
|
are ignored).
|
||||||
|
|
||||||
|
The idle states disabled this way can be enabled (on a per-CPU basis) from user
|
||||||
|
space via ``sysfs``.
|
||||||
|
|
||||||
|
|
||||||
|
.. _intel-idle-core-and-package-idle-states:
|
||||||
|
|
||||||
|
Core and Package Levels of Idle States
|
||||||
|
======================================
|
||||||
|
|
||||||
|
Typically, in a processor supporting the ``MWAIT`` instruction there are (at
|
||||||
|
least) two levels of idle states (or C-states). One level, referred to as
|
||||||
|
"core C-states", covers individual cores in the processor, whereas the other
|
||||||
|
level, referred to as "package C-states", covers the entire processor package
|
||||||
|
and it may also involve other components of the system (GPUs, memory
|
||||||
|
controllers, I/O hubs etc.).
|
||||||
|
|
||||||
|
Some of the ``MWAIT`` hint values allow the processor to use core C-states only
|
||||||
|
(most importantly, that is the case for the ``MWAIT`` hint value corresponding
|
||||||
|
to the ``C1`` idle state), but the majority of them give it a license to put
|
||||||
|
the target core (i.e. the core containing the logical CPU executing ``MWAIT``
|
||||||
|
with the given hint value) into a specific core C-state and then (if possible)
|
||||||
|
to enter a specific package C-state at the deeper level. For example, the
|
||||||
|
``MWAIT`` hint value representing the ``C3`` idle state allows the processor to
|
||||||
|
put the target core into the low-power state referred to as "core ``C3``" (or
|
||||||
|
``CC3``), which happens if all of the logical CPUs (SMT siblings) in that core
|
||||||
|
have executed ``MWAIT`` with the ``C3`` hint value (or with a hint value
|
||||||
|
representing a deeper idle state), and in addition to that (in the majority of
|
||||||
|
cases) it gives the processor a license to put the entire package (possibly
|
||||||
|
including some non-CPU components such as a GPU or a memory controller) into the
|
||||||
|
low-power state referred to as "package ``C3``" (or ``PC3``), which happens if
|
||||||
|
all of the cores have gone into the ``CC3`` state and (possibly) some additional
|
||||||
|
conditions are satisfied (for instance, if the GPU is covered by ``PC3``, it may
|
||||||
|
be required to be in a certain GPU-specific low-power state for ``PC3`` to be
|
||||||
|
reachable).
|
||||||
|
|
||||||
|
As a rule, there is no simple way to make the processor use core C-states only
|
||||||
|
if the conditions for entering the corresponding package C-states are met, so
|
||||||
|
the logical CPU executing ``MWAIT`` with a hint value that is not core-level
|
||||||
|
only (like for ``C1``) must always assume that this may cause the processor to
|
||||||
|
enter a package C-state. [That is why the exit latency and target residency
|
||||||
|
values corresponding to the majority of ``MWAIT`` hint values in the "internal"
|
||||||
|
tables of idle states in ``intel_idle`` reflect the properties of package
|
||||||
|
C-states.] If using package C-states is not desirable at all, either
|
||||||
|
:ref:`PM QoS <cpu-pm-qos>` or the ``max_cstate`` module parameter of
|
||||||
|
``intel_idle`` described `above <intel-idle-parameters_>`_ must be used to
|
||||||
|
restrict the range of permissible idle states to the ones with core-level only
|
||||||
|
``MWAIT`` hint values (like ``C1``).
|
||||||
|
|
||||||
|
|
||||||
|
References
|
||||||
|
==========
|
||||||
|
|
||||||
|
.. [1] *Intel® 64 and IA-32 Architectures Software Developer’s Manual Volume 2B*,
|
||||||
|
https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-2b-manual.html
|
||||||
|
|
||||||
|
.. [2] *Advanced Configuration and Power Interface (ACPI) Specification*,
|
||||||
|
https://uefi.org/specifications
|
@@ -153,8 +153,11 @@ for the given CPU architecture includes the low-level code for system resume.
|
|||||||
Basic ``sysfs`` Interfaces for System Suspend and Hibernation
|
Basic ``sysfs`` Interfaces for System Suspend and Hibernation
|
||||||
=============================================================
|
=============================================================
|
||||||
|
|
||||||
The following files located in the :file:`/sys/power/` directory can be used by
|
The power management subsystem provides userspace with a unified ``sysfs``
|
||||||
user space for sleep states control.
|
interface for system sleep regardless of the underlying system architecture or
|
||||||
|
platform. That interface is located in the :file:`/sys/power/` directory
|
||||||
|
(assuming that ``sysfs`` is mounted at :file:`/sys`) and it consists of the
|
||||||
|
following attributes (files):
|
||||||
|
|
||||||
``state``
|
``state``
|
||||||
This file contains a list of strings representing sleep states supported
|
This file contains a list of strings representing sleep states supported
|
||||||
@@ -162,9 +165,9 @@ user space for sleep states control.
|
|||||||
to start a transition of the system into the sleep state represented by
|
to start a transition of the system into the sleep state represented by
|
||||||
that string.
|
that string.
|
||||||
|
|
||||||
In particular, the strings "disk", "freeze" and "standby" represent the
|
In particular, the "disk", "freeze" and "standby" strings represent the
|
||||||
:ref:`hibernation <hibernation>`, :ref:`suspend-to-idle <s2idle>` and
|
:ref:`hibernation <hibernation>`, :ref:`suspend-to-idle <s2idle>` and
|
||||||
:ref:`standby <standby>` sleep states, respectively. The string "mem"
|
:ref:`standby <standby>` sleep states, respectively. The "mem" string
|
||||||
is interpreted in accordance with the contents of the ``mem_sleep`` file
|
is interpreted in accordance with the contents of the ``mem_sleep`` file
|
||||||
described below.
|
described below.
|
||||||
|
|
||||||
@@ -177,7 +180,7 @@ user space for sleep states control.
|
|||||||
associated with the "mem" string in the ``state`` file described above.
|
associated with the "mem" string in the ``state`` file described above.
|
||||||
|
|
||||||
The strings that may be present in this file are "s2idle", "shallow"
|
The strings that may be present in this file are "s2idle", "shallow"
|
||||||
and "deep". The string "s2idle" always represents :ref:`suspend-to-idle
|
and "deep". The "s2idle" string always represents :ref:`suspend-to-idle
|
||||||
<s2idle>` and, by convention, "shallow" and "deep" represent
|
<s2idle>` and, by convention, "shallow" and "deep" represent
|
||||||
:ref:`standby <standby>` and :ref:`suspend-to-RAM <s2ram>`,
|
:ref:`standby <standby>` and :ref:`suspend-to-RAM <s2ram>`,
|
||||||
respectively.
|
respectively.
|
||||||
@@ -185,15 +188,17 @@ user space for sleep states control.
|
|||||||
Writing one of the listed strings into this file causes the system
|
Writing one of the listed strings into this file causes the system
|
||||||
suspend variant represented by it to be associated with the "mem" string
|
suspend variant represented by it to be associated with the "mem" string
|
||||||
in the ``state`` file. The string representing the suspend variant
|
in the ``state`` file. The string representing the suspend variant
|
||||||
currently associated with the "mem" string in the ``state`` file
|
currently associated with the "mem" string in the ``state`` file is
|
||||||
is listed in square brackets.
|
shown in square brackets.
|
||||||
|
|
||||||
If the kernel does not support system suspend, this file is not present.
|
If the kernel does not support system suspend, this file is not present.
|
||||||
|
|
||||||
``disk``
|
``disk``
|
||||||
This file contains a list of strings representing different operations
|
This file controls the operating mode of hibernation (Suspend-to-Disk).
|
||||||
that can be carried out after the hibernation image has been saved. The
|
Specifically, it tells the kernel what to do after creating a
|
||||||
possible options are as follows:
|
hibernation image.
|
||||||
|
|
||||||
|
Reading from it returns a list of supported options encoded as:
|
||||||
|
|
||||||
``platform``
|
``platform``
|
||||||
Put the system into a special low-power state (e.g. ACPI S4) to
|
Put the system into a special low-power state (e.g. ACPI S4) to
|
||||||
@@ -201,6 +206,11 @@ user space for sleep states control.
|
|||||||
platform firmware to take a simplified initialization path after
|
platform firmware to take a simplified initialization path after
|
||||||
wakeup.
|
wakeup.
|
||||||
|
|
||||||
|
It is only available if the platform provides a special
|
||||||
|
mechanism to put the system to sleep after creating a
|
||||||
|
hibernation image (platforms with ACPI do that as a rule, for
|
||||||
|
example).
|
||||||
|
|
||||||
``shutdown``
|
``shutdown``
|
||||||
Power off the system.
|
Power off the system.
|
||||||
|
|
||||||
@@ -214,22 +224,53 @@ user space for sleep states control.
|
|||||||
the hibernation image and continue. Otherwise, use the image
|
the hibernation image and continue. Otherwise, use the image
|
||||||
to restore the previous state of the system.
|
to restore the previous state of the system.
|
||||||
|
|
||||||
|
It is available if system suspend is supported.
|
||||||
|
|
||||||
``test_resume``
|
``test_resume``
|
||||||
Diagnostic operation. Load the image as though the system had
|
Diagnostic operation. Load the image as though the system had
|
||||||
just woken up from hibernation and the currently running kernel
|
just woken up from hibernation and the currently running kernel
|
||||||
instance was a restore kernel and follow up with full system
|
instance was a restore kernel and follow up with full system
|
||||||
resume.
|
resume.
|
||||||
|
|
||||||
Writing one of the listed strings into this file causes the option
|
Writing one of the strings listed above into this file causes the option
|
||||||
represented by it to be selected.
|
represented by it to be selected.
|
||||||
|
|
||||||
The currently selected option is shown in square brackets which means
|
The currently selected option is shown in square brackets, which means
|
||||||
that the operation represented by it will be carried out after creating
|
that the operation represented by it will be carried out after creating
|
||||||
and saving the image next time hibernation is triggered by writing
|
and saving the image when hibernation is triggered by writing ``disk``
|
||||||
``disk`` to :file:`/sys/power/state`.
|
to :file:`/sys/power/state`.
|
||||||
|
|
||||||
If the kernel does not support hibernation, this file is not present.
|
If the kernel does not support hibernation, this file is not present.
|
||||||
|
|
||||||
|
``image_size``
|
||||||
|
This file controls the size of hibernation images.
|
||||||
|
|
||||||
|
It accepts a string representing a non-negative integer that will
|
||||||
|
be used as a best-effort upper limit of the image size, in bytes. The
|
||||||
|
hibernation core will do its best to ensure that the image size will not
|
||||||
|
exceed that number, but if that turns out to be impossible to achieve, a
|
||||||
|
hibernation image will still be created and its size will be as small as
|
||||||
|
possible. In particular, writing '0' to this file causes the size of
|
||||||
|
hibernation images to be minimal.
|
||||||
|
|
||||||
|
Reading from it returns the current image size limit, which is set to
|
||||||
|
around 2/5 of the available RAM size by default.
|
||||||
|
|
||||||
|
``pm_trace``
|
||||||
|
This file controls the "PM trace" mechanism saving the last suspend
|
||||||
|
or resume event point in the RTC memory across reboots. It helps to
|
||||||
|
debug hard lockups or reboots due to device driver failures that occur
|
||||||
|
during system suspend or resume (which is more common) more effectively.
|
||||||
|
|
||||||
|
If it contains "1", the fingerprint of each suspend/resume event point
|
||||||
|
in turn will be stored in the RTC memory (overwriting the actual RTC
|
||||||
|
information), so it will survive a system crash if one occurs right
|
||||||
|
after storing it and it can be used later to identify the driver that
|
||||||
|
caused the crash to happen.
|
||||||
|
|
||||||
|
It contains "0" by default, which may be changed to "1" by writing a
|
||||||
|
string representing a nonzero integer into it.
|
||||||
|
|
||||||
According to the above, there are two ways to make the system go into the
|
According to the above, there are two ways to make the system go into the
|
||||||
:ref:`suspend-to-idle <s2idle>` state. The first one is to write "freeze"
|
:ref:`suspend-to-idle <s2idle>` state. The first one is to write "freeze"
|
||||||
directly to :file:`/sys/power/state`. The second one is to write "s2idle" to
|
directly to :file:`/sys/power/state`. The second one is to write "s2idle" to
|
||||||
@@ -244,6 +285,7 @@ system go into the :ref:`suspend-to-RAM <s2ram>` state (write "deep" into
|
|||||||
The default suspend variant (i.e. the one to be used without writing anything
|
The default suspend variant (i.e. the one to be used without writing anything
|
||||||
into :file:`/sys/power/mem_sleep`) is either "deep" (on the majority of systems
|
into :file:`/sys/power/mem_sleep`) is either "deep" (on the majority of systems
|
||||||
supporting :ref:`suspend-to-RAM <s2ram>`) or "s2idle", but it can be overridden
|
supporting :ref:`suspend-to-RAM <s2ram>`) or "s2idle", but it can be overridden
|
||||||
by the value of the "mem_sleep_default" parameter in the kernel command line.
|
by the value of the ``mem_sleep_default`` parameter in the kernel command line.
|
||||||
On some ACPI-based systems, depending on the information in the ACPI tables, the
|
On some systems with ACPI, depending on the information in the ACPI tables, the
|
||||||
default may be "s2idle" even if :ref:`suspend-to-RAM <s2ram>` is supported.
|
default may be "s2idle" even if :ref:`suspend-to-RAM <s2ram>` is supported in
|
||||||
|
principle.
|
||||||
|
@@ -8,6 +8,7 @@ Working-State Power Management
|
|||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
|
|
||||||
cpuidle
|
cpuidle
|
||||||
|
intel_idle
|
||||||
cpufreq
|
cpufreq
|
||||||
intel_pstate
|
intel_pstate
|
||||||
intel_epb
|
intel_epb
|
||||||
|
@@ -1,6 +1,28 @@
|
|||||||
=============
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
Thunderbolt
|
|
||||||
=============
|
======================
|
||||||
|
USB4 and Thunderbolt
|
||||||
|
======================
|
||||||
|
USB4 is the public specification based on the Thunderbolt 3 protocol with
|
||||||
|
some differences at the register level among other things. Connection
|
||||||
|
manager is an entity running on the host router (host controller)
|
||||||
|
responsible for enumerating routers and establishing tunnels. A
|
||||||
|
connection manager can be implemented either in firmware or software.
|
||||||
|
Typically PCs come with a firmware connection manager for Thunderbolt 3
|
||||||
|
and early USB4 capable systems. Apple systems on the other hand use
|
||||||
|
a software connection manager and the later USB4 compliant devices follow
|
||||||
|
suit.
|
||||||
|
|
||||||
|
The Linux Thunderbolt driver supports both and can detect at runtime which
|
||||||
|
connection manager implementation is to be used. To be on the safe side the
|
||||||
|
software connection manager in Linux also advertises security level
|
||||||
|
``user`` which means PCIe tunneling is disabled by default. The
|
||||||
|
documentation below applies to both implementations with the exception that
|
||||||
|
the software connection manager only supports ``user`` security level and
|
||||||
|
is expected to be accompanied by IOMMU-based DMA protection.
|
||||||
|
|
||||||
|
Security levels and how to use them
|
||||||
|
-----------------------------------
|
||||||
The interface presented here is not meant for end users. Instead there
|
The interface presented here is not meant for end users. Instead there
|
||||||
should be a userspace tool that handles all the low-level details, keeps
|
should be a userspace tool that handles all the low-level details, keeps
|
||||||
a database of the authorized devices and prompts users for new connections.
|
a database of the authorized devices and prompts users for new connections.
|
||||||
@@ -18,8 +40,6 @@ This will authorize all devices automatically when they appear. However,
|
|||||||
keep in mind that this bypasses the security levels and makes the system
|
keep in mind that this bypasses the security levels and makes the system
|
||||||
vulnerable to DMA attacks.
|
vulnerable to DMA attacks.
|
||||||
|
|
||||||
Security levels and how to use them
|
|
||||||
-----------------------------------
|
|
||||||
Starting with Intel Falcon Ridge Thunderbolt controller there are 4
|
Starting with Intel Falcon Ridge Thunderbolt controller there are 4
|
||||||
security levels available. Intel Titan Ridge added one more security level
|
security levels available. Intel Titan Ridge added one more security level
|
||||||
(usbonly). The reason for these is the fact that the connected devices can
|
(usbonly). The reason for these is the fact that the connected devices can
|
||||||
|
@@ -92,6 +92,12 @@ the Microchip website: http://www.microchip.com.
|
|||||||
|
|
||||||
http://ww1.microchip.com/downloads/en/DeviceDoc/DS60001517A.pdf
|
http://ww1.microchip.com/downloads/en/DeviceDoc/DS60001517A.pdf
|
||||||
|
|
||||||
|
- sam9x60
|
||||||
|
|
||||||
|
* Datasheet
|
||||||
|
|
||||||
|
http://ww1.microchip.com/downloads/en/DeviceDoc/SAM9X60-Data-Sheet-DS60001579A.pdf
|
||||||
|
|
||||||
* ARM Cortex-A5 based SoCs
|
* ARM Cortex-A5 based SoCs
|
||||||
- sama5d3 family
|
- sama5d3 family
|
||||||
|
|
||||||
|
@@ -129,7 +129,7 @@ this logic.
|
|||||||
|
|
||||||
As a single binary will need to support both 48-bit and 52-bit VA
|
As a single binary will need to support both 48-bit and 52-bit VA
|
||||||
spaces, the VMEMMAP must be sized large enough for 52-bit VAs and
|
spaces, the VMEMMAP must be sized large enough for 52-bit VAs and
|
||||||
also must be sized large enought to accommodate a fixed PAGE_OFFSET.
|
also must be sized large enough to accommodate a fixed PAGE_OFFSET.
|
||||||
|
|
||||||
Most code in the kernel should not need to consider the VA_BITS, for
|
Most code in the kernel should not need to consider the VA_BITS, for
|
||||||
code that does need to know the VA size the variables are
|
code that does need to know the VA size the variables are
|
||||||
|
@@ -44,8 +44,15 @@ The AArch64 Tagged Address ABI has two stages of relaxation depending
|
|||||||
how the user addresses are used by the kernel:
|
how the user addresses are used by the kernel:
|
||||||
|
|
||||||
1. User addresses not accessed by the kernel but used for address space
|
1. User addresses not accessed by the kernel but used for address space
|
||||||
management (e.g. ``mmap()``, ``mprotect()``, ``madvise()``). The use
|
management (e.g. ``mprotect()``, ``madvise()``). The use of valid
|
||||||
of valid tagged pointers in this context is always allowed.
|
tagged pointers in this context is allowed with the exception of
|
||||||
|
``brk()``, ``mmap()`` and the ``new_address`` argument to
|
||||||
|
``mremap()`` as these have the potential to alias with existing
|
||||||
|
user addresses.
|
||||||
|
|
||||||
|
NOTE: This behaviour changed in v5.6 and so some earlier kernels may
|
||||||
|
incorrectly accept valid tagged pointers for the ``brk()``,
|
||||||
|
``mmap()`` and ``mremap()`` system calls.
|
||||||
|
|
||||||
2. User addresses accessed by the kernel (e.g. ``write()``). This ABI
|
2. User addresses accessed by the kernel (e.g. ``write()``). This ABI
|
||||||
relaxation is disabled by default and the application thread needs to
|
relaxation is disabled by default and the application thread needs to
|
||||||
|
@@ -73,10 +73,11 @@ The new macros are prefixed with the ``SYM_`` prefix and can be divided into
|
|||||||
three main groups:
|
three main groups:
|
||||||
|
|
||||||
1. ``SYM_FUNC_*`` -- to annotate C-like functions. This means functions with
|
1. ``SYM_FUNC_*`` -- to annotate C-like functions. This means functions with
|
||||||
standard C calling conventions, i.e. the stack contains a return address at
|
standard C calling conventions. For example, on x86, this means that the
|
||||||
the predefined place and a return from the function can happen in a
|
stack contains a return address at the predefined place and a return from
|
||||||
standard way. When frame pointers are enabled, save/restore of frame
|
the function can happen in a standard way. When frame pointers are enabled,
|
||||||
pointer shall happen at the start/end of a function, respectively, too.
|
save/restore of frame pointer shall happen at the start/end of a function,
|
||||||
|
respectively, too.
|
||||||
|
|
||||||
Checking tools like ``objtool`` should ensure such marked functions conform
|
Checking tools like ``objtool`` should ensure such marked functions conform
|
||||||
to these rules. The tools can also easily annotate these functions with
|
to these rules. The tools can also easily annotate these functions with
|
||||||
|
@@ -47,7 +47,7 @@ Having a real iterator, and making biovecs immutable, has a number of
|
|||||||
advantages:
|
advantages:
|
||||||
|
|
||||||
* Before, iterating over bios was very awkward when you weren't processing
|
* Before, iterating over bios was very awkward when you weren't processing
|
||||||
exactly one bvec at a time - for example, bio_copy_data() in fs/bio.c,
|
exactly one bvec at a time - for example, bio_copy_data() in block/bio.c,
|
||||||
which copies the contents of one bio into another. Because the biovecs
|
which copies the contents of one bio into another. Because the biovecs
|
||||||
wouldn't necessarily be the same size, the old code was tricky and convoluted -
|
wouldn't necessarily be the same size, the old code was tricky and convoluted -
|
||||||
it had to walk two different bios at the same time, keeping both bi_idx and
|
it had to walk two different bios at the same time, keeping both bi_idx and
|
||||||
|
@@ -31,6 +31,7 @@ Core utilities
|
|||||||
generic-radix-tree
|
generic-radix-tree
|
||||||
memory-allocation
|
memory-allocation
|
||||||
mm-api
|
mm-api
|
||||||
|
pin_user_pages
|
||||||
gfp_mask-from-fs-io
|
gfp_mask-from-fs-io
|
||||||
timekeeping
|
timekeeping
|
||||||
boot-time-mm
|
boot-time-mm
|
||||||
@@ -39,6 +40,8 @@ Core utilities
|
|||||||
../RCU/index
|
../RCU/index
|
||||||
gcc-plugins
|
gcc-plugins
|
||||||
symbol-namespaces
|
symbol-namespaces
|
||||||
|
padata
|
||||||
|
ioctl
|
||||||
|
|
||||||
|
|
||||||
Interfaces for kernel debugging
|
Interfaces for kernel debugging
|
||||||
|
253
Documentation/core-api/ioctl.rst
Normal file
253
Documentation/core-api/ioctl.rst
Normal file
@@ -0,0 +1,253 @@
|
|||||||
|
======================
|
||||||
|
ioctl based interfaces
|
||||||
|
======================
|
||||||
|
|
||||||
|
ioctl() is the most common way for applications to interface
|
||||||
|
with device drivers. It is flexible and easily extended by adding new
|
||||||
|
commands and can be passed through character devices, block devices as
|
||||||
|
well as sockets and other special file descriptors.
|
||||||
|
|
||||||
|
However, it is also very easy to get ioctl command definitions wrong,
|
||||||
|
and hard to fix them later without breaking existing applications,
|
||||||
|
so this documentation tries to help developers get it right.
|
||||||
|
|
||||||
|
Command number definitions
|
||||||
|
==========================
|
||||||
|
|
||||||
|
The command number, or request number, is the second argument passed to
|
||||||
|
the ioctl system call. While this can be any 32-bit number that uniquely
|
||||||
|
identifies an action for a particular driver, there are a number of
|
||||||
|
conventions around defining them.
|
||||||
|
|
||||||
|
``include/uapi/asm-generic/ioctl.h`` provides four macros for defining
|
||||||
|
ioctl commands that follow modern conventions: ``_IO``, ``_IOR``,
|
||||||
|
``_IOW``, and ``_IOWR``. These should be used for all new commands,
|
||||||
|
with the correct parameters:
|
||||||
|
|
||||||
|
_IO/_IOR/_IOW/_IOWR
|
||||||
|
The macro name specifies how the argument will be used. It may be a
|
||||||
|
pointer to data to be passed into the kernel (_IOW), out of the kernel
|
||||||
|
(_IOR), or both (_IOWR). _IO can indicate either commands with no
|
||||||
|
argument or those passing an integer value instead of a pointer.
|
||||||
|
It is recommended to only use _IO for commands without arguments,
|
||||||
|
and use pointers for passing data.
|
||||||
|
|
||||||
|
type
|
||||||
|
An 8-bit number, often a character literal, specific to a subsystem
|
||||||
|
or driver, and listed in :doc:`../userspace-api/ioctl/ioctl-number`
|
||||||
|
|
||||||
|
nr
|
||||||
|
An 8-bit number identifying the specific command, unique for a given
|
||||||
|
value of 'type'
|
||||||
|
|
||||||
|
data_type
|
||||||
|
The name of the data type pointed to by the argument; the command number
|
||||||
|
encodes the ``sizeof(data_type)`` value in a 13-bit or 14-bit integer,
|
||||||
|
leading to a limit of 8191 bytes for the maximum size of the argument.
|
||||||
|
Note: do not pass sizeof(data_type) type into _IOR/_IOW/_IOWR, as that
|
||||||
|
will lead to encoding sizeof(sizeof(data_type)), i.e. sizeof(size_t).
|
||||||
|
_IO does not have a data_type parameter.
|
||||||
|
|
||||||
|
|
||||||
|
Interface versions
|
||||||
|
==================
|
||||||
|
|
||||||
|
Some subsystems use version numbers in data structures to overload
|
||||||
|
commands with different interpretations of the argument.
|
||||||
|
|
||||||
|
This is generally a bad idea, since changes to existing commands tend
|
||||||
|
to break existing applications.
|
||||||
|
|
||||||
|
A better approach is to add a new ioctl command with a new number. The
|
||||||
|
old command still needs to be implemented in the kernel for compatibility,
|
||||||
|
but this can be a wrapper around the new implementation.
|
||||||
|
|
||||||
|
Return code
|
||||||
|
===========
|
||||||
|
|
||||||
|
ioctl commands can return negative error codes as documented in errno(3);
|
||||||
|
these get turned into errno values in user space. On success, the return
|
||||||
|
code should be zero. It is also possible but not recommended to return
|
||||||
|
a positive 'long' value.
|
||||||
|
|
||||||
|
When the ioctl callback is called with an unknown command number, the
|
||||||
|
handler returns either -ENOTTY or -ENOIOCTLCMD, which also results in
|
||||||
|
-ENOTTY being returned from the system call. Some subsystems return
|
||||||
|
-ENOSYS or -EINVAL here for historic reasons, but this is wrong.
|
||||||
|
|
||||||
|
Prior to Linux 5.5, compat_ioctl handlers were required to return
|
||||||
|
-ENOIOCTLCMD in order to use the fallback conversion into native
|
||||||
|
commands. As all subsystems are now responsible for handling compat
|
||||||
|
mode themselves, this is no longer needed, but it may be important to
|
||||||
|
consider when backporting bug fixes to older kernels.
|
||||||
|
|
||||||
|
Timestamps
|
||||||
|
==========
|
||||||
|
|
||||||
|
Traditionally, timestamps and timeout values are passed as ``struct
|
||||||
|
timespec`` or ``struct timeval``, but these are problematic because of
|
||||||
|
incompatible definitions of these structures in user space after the
|
||||||
|
move to 64-bit time_t.
|
||||||
|
|
||||||
|
The ``struct __kernel_timespec`` type can be used instead to be embedded
|
||||||
|
in other data structures when separate second/nanosecond values are
|
||||||
|
desired, or passed to user space directly. This is still not ideal though,
|
||||||
|
as the structure matches neither the kernel's timespec64 nor the user
|
||||||
|
space timespec exactly. The get_timespec64() and put_timespec64() helper
|
||||||
|
functions can be used to ensure that the layout remains compatible with
|
||||||
|
user space and the padding is treated correctly.
|
||||||
|
|
||||||
|
As it is cheap to convert seconds to nanoseconds, but the opposite
|
||||||
|
requires an expensive 64-bit division, a simple __u64 nanosecond value
|
||||||
|
can be simpler and more efficient.
|
||||||
|
|
||||||
|
Timeout values and timestamps should ideally use CLOCK_MONOTONIC time,
|
||||||
|
as returned by ktime_get_ns() or ktime_get_ts64(). Unlike
|
||||||
|
CLOCK_REALTIME, this makes the timestamps immune from jumping backwards
|
||||||
|
or forwards due to leap second adjustments and clock_settime() calls.
|
||||||
|
|
||||||
|
ktime_get_real_ns() can be used for CLOCK_REALTIME timestamps that
|
||||||
|
need to be persistent across a reboot or between multiple machines.
|
||||||
|
|
||||||
|
32-bit compat mode
|
||||||
|
==================
|
||||||
|
|
||||||
|
In order to support 32-bit user space running on a 64-bit machine, each
|
||||||
|
subsystem or driver that implements an ioctl callback handler must also
|
||||||
|
implement the corresponding compat_ioctl handler.
|
||||||
|
|
||||||
|
As long as all the rules for data structures are followed, this is as
|
||||||
|
easy as setting the .compat_ioctl pointer to a helper function such as
|
||||||
|
compat_ptr_ioctl() or blkdev_compat_ptr_ioctl().
|
||||||
|
|
||||||
|
compat_ptr()
|
||||||
|
------------
|
||||||
|
|
||||||
|
On the s390 architecture, 31-bit user space has ambiguous representations
|
||||||
|
for data pointers, with the upper bit being ignored. When running such
|
||||||
|
a process in compat mode, the compat_ptr() helper must be used to
|
||||||
|
clear the upper bit of a compat_uptr_t and turn it into a valid 64-bit
|
||||||
|
pointer. On other architectures, this macro only performs a cast to a
|
||||||
|
``void __user *`` pointer.
|
||||||
|
|
||||||
|
In a compat_ioctl() callback, the last argument is an unsigned long,
|
||||||
|
which can be interpreted as either a pointer or a scalar depending on
|
||||||
|
the command. If it is a scalar, then compat_ptr() must not be used, to
|
||||||
|
ensure that the 64-bit kernel behaves the same way as a 32-bit kernel
|
||||||
|
for arguments with the upper bit set.
|
||||||
|
|
||||||
|
The compat_ptr_ioctl() helper can be used in place of a custom
|
||||||
|
compat_ioctl file operation for drivers that only take arguments that
|
||||||
|
are pointers to compatible data structures.
|
||||||
|
|
||||||
|
Structure layout
|
||||||
|
----------------
|
||||||
|
|
||||||
|
Compatible data structures have the same layout on all architectures,
|
||||||
|
avoiding all problematic members:
|
||||||
|
|
||||||
|
* ``long`` and ``unsigned long`` are the size of a register, so
|
||||||
|
they can be either 32-bit or 64-bit wide and cannot be used in portable
|
||||||
|
data structures. Fixed-length replacements are ``__s32``, ``__u32``,
|
||||||
|
``__s64`` and ``__u64``.
|
||||||
|
|
||||||
|
* Pointers have the same problem, in addition to requiring the
|
||||||
|
use of compat_ptr(). The best workaround is to use ``__u64``
|
||||||
|
in place of pointers, which requires a cast to ``uintptr_t`` in user
|
||||||
|
space, and the use of u64_to_user_ptr() in the kernel to convert
|
||||||
|
it back into a user pointer.
|
||||||
|
|
||||||
|
* On the x86-32 (i386) architecture, the alignment of 64-bit variables
|
||||||
|
is only 32-bit, but they are naturally aligned on most other
|
||||||
|
architectures including x86-64. This means a structure like::
|
||||||
|
|
||||||
|
struct foo {
|
||||||
|
__u32 a;
|
||||||
|
__u64 b;
|
||||||
|
__u32 c;
|
||||||
|
};
|
||||||
|
|
||||||
|
has four bytes of padding between a and b on x86-64, plus another four
|
||||||
|
bytes of padding at the end, but no padding on i386, and it needs a
|
||||||
|
compat_ioctl conversion handler to translate between the two formats.
|
||||||
|
|
||||||
|
To avoid this problem, all structures should have their members
|
||||||
|
naturally aligned, or explicit reserved fields added in place of the
|
||||||
|
implicit padding. The ``pahole`` tool can be used for checking the
|
||||||
|
alignment.
|
||||||
|
|
||||||
|
* On ARM OABI user space, structures are padded to multiples of 32-bit,
|
||||||
|
making some structs incompatible with modern EABI kernels if they
|
||||||
|
do not end on a 32-bit boundary.
|
||||||
|
|
||||||
|
* On the m68k architecture, struct members are not guaranteed to have an
|
||||||
|
alignment greater than 16-bit, which is a problem when relying on
|
||||||
|
implicit padding.
|
||||||
|
|
||||||
|
* Bitfields and enums generally work as one would expect them to,
|
||||||
|
but some properties of them are implementation-defined, so it is better
|
||||||
|
to avoid them completely in ioctl interfaces.
|
||||||
|
|
||||||
|
* ``char`` members can be either signed or unsigned, depending on
|
||||||
|
the architecture, so the __u8 and __s8 types should be used for 8-bit
|
||||||
|
integer values, though char arrays are clearer for fixed-length strings.
|
||||||
|
|
||||||
|
Information leaks
|
||||||
|
=================
|
||||||
|
|
||||||
|
Uninitialized data must not be copied back to user space, as this can
|
||||||
|
cause an information leak, which can be used to defeat kernel address
|
||||||
|
space layout randomization (KASLR), helping in an attack.
|
||||||
|
|
||||||
|
For this reason (and for compat support) it is best to avoid any
|
||||||
|
implicit padding in data structures. Where there is implicit padding
|
||||||
|
in an existing structure, kernel drivers must be careful to fully
|
||||||
|
initialize an instance of the structure before copying it to user
|
||||||
|
space. This is usually done by calling memset() before assigning to
|
||||||
|
individual members.
|
||||||
|
|
||||||
|
Subsystem abstractions
|
||||||
|
======================
|
||||||
|
|
||||||
|
While some device drivers implement their own ioctl function, most
|
||||||
|
subsystems implement the same command for multiple drivers. Ideally the
|
||||||
|
subsystem has an .ioctl() handler that copies the arguments from and
|
||||||
|
to user space, passing them into subsystem specific callback functions
|
||||||
|
through normal kernel pointers.
|
||||||
|
|
||||||
|
This helps in various ways:
|
||||||
|
|
||||||
|
* Applications written for one driver are more likely to work for
|
||||||
|
another one in the same subsystem if there are no subtle differences
|
||||||
|
in the user space ABI.
|
||||||
|
|
||||||
|
* The complexity of user space access and data structure layout is handled
|
||||||
|
in one place, reducing the potential for implementation bugs.
|
||||||
|
|
||||||
|
* It is more likely to be reviewed by experienced developers
|
||||||
|
that can spot problems in the interface when the ioctl is shared
|
||||||
|
between multiple drivers than when it is only used in a single driver.
|
||||||
|
|
||||||
|
Alternatives to ioctl
|
||||||
|
=====================
|
||||||
|
|
||||||
|
There are many cases in which ioctl is not the best solution for a
|
||||||
|
problem. Alternatives include:
|
||||||
|
|
||||||
|
* System calls are a better choice for a system-wide feature that
|
||||||
|
is not tied to a physical device or constrained by the file system
|
||||||
|
permissions of a character device node
|
||||||
|
|
||||||
|
* netlink is the preferred way of configuring any network related
|
||||||
|
objects through sockets.
|
||||||
|
|
||||||
|
* debugfs is used for ad-hoc interfaces for debugging functionality
|
||||||
|
that does not need to be exposed as a stable interface to applications.
|
||||||
|
|
||||||
|
* sysfs is a good way to expose the state of an in-kernel object
|
||||||
|
that is not tied to a file descriptor.
|
||||||
|
|
||||||
|
* configfs can be used for more complex configuration than sysfs
|
||||||
|
|
||||||
|
* A custom file system can provide extra flexibility with a simple
|
||||||
|
user interface but adds a lot of complexity to the implementation.
|
169
Documentation/core-api/padata.rst
Normal file
169
Documentation/core-api/padata.rst
Normal file
@@ -0,0 +1,169 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
=======================================
|
||||||
|
The padata parallel execution mechanism
|
||||||
|
=======================================
|
||||||
|
|
||||||
|
:Date: December 2019
|
||||||
|
|
||||||
|
Padata is a mechanism by which the kernel can farm jobs out to be done in
|
||||||
|
parallel on multiple CPUs while retaining their ordering. It was developed for
|
||||||
|
use with the IPsec code, which needs to be able to perform encryption and
|
||||||
|
decryption on large numbers of packets without reordering those packets. The
|
||||||
|
crypto developers made a point of writing padata in a sufficiently general
|
||||||
|
fashion that it could be put to other uses as well.
|
||||||
|
|
||||||
|
Usage
|
||||||
|
=====
|
||||||
|
|
||||||
|
Initializing
|
||||||
|
------------
|
||||||
|
|
||||||
|
The first step in using padata is to set up a padata_instance structure for
|
||||||
|
overall control of how jobs are to be run::
|
||||||
|
|
||||||
|
#include <linux/padata.h>
|
||||||
|
|
||||||
|
struct padata_instance *padata_alloc_possible(const char *name);
|
||||||
|
|
||||||
|
'name' simply identifies the instance.
|
||||||
|
|
||||||
|
There are functions for enabling and disabling the instance::
|
||||||
|
|
||||||
|
int padata_start(struct padata_instance *pinst);
|
||||||
|
void padata_stop(struct padata_instance *pinst);
|
||||||
|
|
||||||
|
These functions set or clear the "PADATA_INIT" flag; if that flag is
|
||||||
|
not set, other functions will refuse to work. padata_start() returns zero on
|
||||||
|
success (flag set) or -EINVAL if the padata cpumask contains no active CPU
|
||||||
|
(flag not set). padata_stop() clears the flag and blocks until the padata
|
||||||
|
instance is unused.
|
||||||
|
|
||||||
|
Finally, complete padata initialization by allocating a padata_shell::
|
||||||
|
|
||||||
|
struct padata_shell *padata_alloc_shell(struct padata_instance *pinst);
|
||||||
|
|
||||||
|
A padata_shell is used to submit a job to padata and allows a series of such
|
||||||
|
jobs to be serialized independently. A padata_instance may have one or more
|
||||||
|
padata_shells associated with it, each allowing a separate series of jobs.
|
||||||
|
|
||||||
|
Modifying cpumasks
|
||||||
|
------------------
|
||||||
|
|
||||||
|
The CPUs used to run jobs can be changed in two ways, programmatically with
|
||||||
|
padata_set_cpumask() or via sysfs. The former is defined::
|
||||||
|
|
||||||
|
int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
|
||||||
|
cpumask_var_t cpumask);
|
||||||
|
|
||||||
|
Here cpumask_type is one of PADATA_CPU_PARALLEL or PADATA_CPU_SERIAL, where a
|
||||||
|
parallel cpumask describes which processors will be used to execute jobs
|
||||||
|
submitted to this instance in parallel and a serial cpumask defines which
|
||||||
|
processors are allowed to be used as the serialization callback processor.
|
||||||
|
cpumask specifies the new cpumask to use.
|
||||||
|
|
||||||
|
There may be sysfs files for an instance's cpumasks. For example, pcrypt's
|
||||||
|
live in /sys/kernel/pcrypt/<instance-name>. Within an instance's directory
|
||||||
|
there are two files, parallel_cpumask and serial_cpumask, and either cpumask
|
||||||
|
may be changed by echoing a bitmask into the file, for example::
|
||||||
|
|
||||||
|
echo f > /sys/kernel/pcrypt/pencrypt/parallel_cpumask
|
||||||
|
|
||||||
|
Reading one of these files shows the user-supplied cpumask, which may be
|
||||||
|
different from the 'usable' cpumask.
|
||||||
|
|
||||||
|
Padata maintains two pairs of cpumasks internally, the user-supplied cpumasks
|
||||||
|
and the 'usable' cpumasks. (Each pair consists of a parallel and a serial
|
||||||
|
cpumask.) The user-supplied cpumasks default to all possible CPUs on instance
|
||||||
|
allocation and may be changed as above. The usable cpumasks are always a
|
||||||
|
subset of the user-supplied cpumasks and contain only the online CPUs in the
|
||||||
|
user-supplied masks; these are the cpumasks padata actually uses. So it is
|
||||||
|
legal to supply a cpumask to padata that contains offline CPUs. Once an
|
||||||
|
offline CPU in the user-supplied cpumask comes online, padata is going to use
|
||||||
|
it.
|
||||||
|
|
||||||
|
Changing the CPU masks is an expensive operation, so it should not be done with
|
||||||
|
great frequency.
|
||||||
|
|
||||||
|
Running A Job
|
||||||
|
-------------
|
||||||
|
|
||||||
|
Actually submitting work to the padata instance requires the creation of a
|
||||||
|
padata_priv structure, which represents one job::
|
||||||
|
|
||||||
|
struct padata_priv {
|
||||||
|
/* Other stuff here... */
|
||||||
|
void (*parallel)(struct padata_priv *padata);
|
||||||
|
void (*serial)(struct padata_priv *padata);
|
||||||
|
};
|
||||||
|
|
||||||
|
This structure will almost certainly be embedded within some larger
|
||||||
|
structure specific to the work to be done. Most of its fields are private to
|
||||||
|
padata, but the structure should be zeroed at initialisation time, and the
|
||||||
|
parallel() and serial() functions should be provided. Those functions will
|
||||||
|
be called in the process of getting the work done as we will see
|
||||||
|
momentarily.
|
||||||
|
|
||||||
|
The submission of the job is done with::
|
||||||
|
|
||||||
|
int padata_do_parallel(struct padata_shell *ps,
|
||||||
|
struct padata_priv *padata, int *cb_cpu);
|
||||||
|
|
||||||
|
The ps and padata structures must be set up as described above; cb_cpu
|
||||||
|
points to the preferred CPU to be used for the final callback when the job is
|
||||||
|
done; it must be in the current instance's CPU mask (if not, the cb_cpu pointer
|
||||||
|
is updated to point to the CPU actually chosen). The return value from
|
||||||
|
padata_do_parallel() is zero on success, indicating that the job is in
|
||||||
|
progress. -EBUSY means that somebody, somewhere else is messing with the
|
||||||
|
instance's CPU mask, while -EINVAL is a complaint about cb_cpu not being in the
|
||||||
|
serial cpumask, no online CPUs in the parallel or serial cpumasks, or a stopped
|
||||||
|
instance.
|
||||||
|
|
||||||
|
Each job submitted to padata_do_parallel() will, in turn, be passed to
|
||||||
|
exactly one call to the above-mentioned parallel() function, on one CPU, so
|
||||||
|
true parallelism is achieved by submitting multiple jobs. parallel() runs with
|
||||||
|
software interrupts disabled and thus cannot sleep. The parallel()
|
||||||
|
function gets the padata_priv structure pointer as its lone parameter;
|
||||||
|
information about the actual work to be done is probably obtained by using
|
||||||
|
container_of() to find the enclosing structure.
|
||||||
|
|
||||||
|
Note that parallel() has no return value; the padata subsystem assumes that
|
||||||
|
parallel() will take responsibility for the job from this point. The job
|
||||||
|
need not be completed during this call, but, if parallel() leaves work
|
||||||
|
outstanding, it should be prepared to be called again with a new job before
|
||||||
|
the previous one completes.
|
||||||
|
|
||||||
|
Serializing Jobs
|
||||||
|
----------------
|
||||||
|
|
||||||
|
When a job does complete, parallel() (or whatever function actually finishes
|
||||||
|
the work) should inform padata of the fact with a call to::
|
||||||
|
|
||||||
|
void padata_do_serial(struct padata_priv *padata);
|
||||||
|
|
||||||
|
At some point in the future, padata_do_serial() will trigger a call to the
|
||||||
|
serial() function in the padata_priv structure. That call will happen on
|
||||||
|
the CPU requested in the initial call to padata_do_parallel(); it, too, is
|
||||||
|
run with local software interrupts disabled.
|
||||||
|
Note that this call may be deferred for a while since the padata code takes
|
||||||
|
pains to ensure that jobs are completed in the order in which they were
|
||||||
|
submitted.
|
||||||
|
|
||||||
|
Destroying
|
||||||
|
----------
|
||||||
|
|
||||||
|
Cleaning up a padata instance predictably involves calling the three free
|
||||||
|
functions that correspond to the allocation in reverse::
|
||||||
|
|
||||||
|
void padata_free_shell(struct padata_shell *ps);
|
||||||
|
void padata_stop(struct padata_instance *pinst);
|
||||||
|
void padata_free(struct padata_instance *pinst);
|
||||||
|
|
||||||
|
It is the user's responsibility to ensure all outstanding jobs are complete
|
||||||
|
before any of the above are called.
|
||||||
|
|
||||||
|
Interface
|
||||||
|
=========
|
||||||
|
|
||||||
|
.. kernel-doc:: include/linux/padata.h
|
||||||
|
.. kernel-doc:: kernel/padata.c
|
232
Documentation/core-api/pin_user_pages.rst
Normal file
232
Documentation/core-api/pin_user_pages.rst
Normal file
@@ -0,0 +1,232 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
====================================================
|
||||||
|
pin_user_pages() and related calls
|
||||||
|
====================================================
|
||||||
|
|
||||||
|
.. contents:: :local:
|
||||||
|
|
||||||
|
Overview
|
||||||
|
========
|
||||||
|
|
||||||
|
This document describes the following functions::
|
||||||
|
|
||||||
|
pin_user_pages()
|
||||||
|
pin_user_pages_fast()
|
||||||
|
pin_user_pages_remote()
|
||||||
|
|
||||||
|
Basic description of FOLL_PIN
|
||||||
|
=============================
|
||||||
|
|
||||||
|
FOLL_PIN and FOLL_LONGTERM are flags that can be passed to the get_user_pages*()
|
||||||
|
("gup") family of functions. FOLL_PIN has significant interactions and
|
||||||
|
interdependencies with FOLL_LONGTERM, so both are covered here.
|
||||||
|
|
||||||
|
FOLL_PIN is internal to gup, meaning that it should not appear at the gup call
|
||||||
|
sites. This allows the associated wrapper functions (pin_user_pages*() and
|
||||||
|
others) to set the correct combination of these flags, and to check for problems
|
||||||
|
as well.
|
||||||
|
|
||||||
|
FOLL_LONGTERM, on the other hand, *is* allowed to be set at the gup call sites.
|
||||||
|
This is in order to avoid creating a large number of wrapper functions to cover
|
||||||
|
all combinations of get*(), pin*(), FOLL_LONGTERM, and more. Also, the
|
||||||
|
pin_user_pages*() APIs are clearly distinct from the get_user_pages*() APIs, so
|
||||||
|
that's a natural dividing line, and a good point to make separate wrapper calls.
|
||||||
|
In other words, use pin_user_pages*() for DMA-pinned pages, and
|
||||||
|
get_user_pages*() for other cases. There are four cases described later on in
|
||||||
|
this document, to further clarify that concept.
|
||||||
|
|
||||||
|
FOLL_PIN and FOLL_GET are mutually exclusive for a given gup call. However,
|
||||||
|
multiple threads and call sites are free to pin the same struct pages, via both
|
||||||
|
FOLL_PIN and FOLL_GET. It's just the call site that needs to choose one or the
|
||||||
|
other, not the struct page(s).
|
||||||
|
|
||||||
|
The FOLL_PIN implementation is nearly the same as FOLL_GET, except that FOLL_PIN
|
||||||
|
uses a different reference counting technique.
|
||||||
|
|
||||||
|
FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying that is,
|
||||||
|
FOLL_LONGTERM is a specific, more restrictive case of FOLL_PIN.
|
||||||
|
|
||||||
|
Which flags are set by each wrapper
|
||||||
|
===================================
|
||||||
|
|
||||||
|
For these pin_user_pages*() functions, FOLL_PIN is OR'd in with whatever gup
|
||||||
|
flags the caller provides. The caller is required to pass in a non-null struct
|
||||||
|
pages* array, and the function then pins pages by incrementing each by a special
|
||||||
|
value. For now, that value is +1, just like get_user_pages*().::
|
||||||
|
|
||||||
|
Function
|
||||||
|
--------
|
||||||
|
pin_user_pages FOLL_PIN is always set internally by this function.
|
||||||
|
pin_user_pages_fast FOLL_PIN is always set internally by this function.
|
||||||
|
pin_user_pages_remote FOLL_PIN is always set internally by this function.
|
||||||
|
|
||||||
|
For these get_user_pages*() functions, FOLL_GET might not even be specified.
|
||||||
|
Behavior is a little more complex than above. If FOLL_GET was *not* specified,
|
||||||
|
but the caller passed in a non-null struct pages* array, then the function
|
||||||
|
sets FOLL_GET for you, and proceeds to pin pages by incrementing the refcount
|
||||||
|
of each page by +1.::
|
||||||
|
|
||||||
|
Function
|
||||||
|
--------
|
||||||
|
get_user_pages FOLL_GET is sometimes set internally by this function.
|
||||||
|
get_user_pages_fast FOLL_GET is sometimes set internally by this function.
|
||||||
|
get_user_pages_remote FOLL_GET is sometimes set internally by this function.
|
||||||
|
|
||||||
|
Tracking dma-pinned pages
|
||||||
|
=========================
|
||||||
|
|
||||||
|
Some of the key design constraints, and solutions, for tracking dma-pinned
|
||||||
|
pages:
|
||||||
|
|
||||||
|
* An actual reference count, per struct page, is required. This is because
|
||||||
|
multiple processes may pin and unpin a page.
|
||||||
|
|
||||||
|
* False positives (reporting that a page is dma-pinned, when in fact it is not)
|
||||||
|
are acceptable, but false negatives are not.
|
||||||
|
|
||||||
|
* struct page may not be increased in size for this, and all fields are already
|
||||||
|
used.
|
||||||
|
|
||||||
|
* Given the above, we can overload the page->_refcount field by using, sort of,
|
||||||
|
the upper bits in that field for a dma-pinned count. "Sort of", means that,
|
||||||
|
rather than dividing page->_refcount into bit fields, we simply add a medium-
|
||||||
|
large value (GUP_PIN_COUNTING_BIAS, initially chosen to be 1024: 10 bits) to
|
||||||
|
page->_refcount. This provides fuzzy behavior: if a page has get_page() called
|
||||||
|
on it 1024 times, then it will appear to have a single dma-pinned count.
|
||||||
|
And again, that's acceptable.
|
||||||
|
|
||||||
|
This also leads to limitations: there are only 31-10==21 bits available for a
|
||||||
|
counter that increments 10 bits at a time.
|
||||||
|
|
||||||
|
TODO: for 1GB and larger huge pages, this is cutting it close. That's because
|
||||||
|
when pin_user_pages() follows such pages, it increments the head page by "1"
|
||||||
|
(where "1" used to mean "+1" for get_user_pages(), but now means "+1024" for
|
||||||
|
pin_user_pages()) for each tail page. So if you have a 1GB huge page:
|
||||||
|
|
||||||
|
* There are 256K (18 bits) worth of 4 KB tail pages.
|
||||||
|
* There are 21 bits available to count up via GUP_PIN_COUNTING_BIAS (that is,
|
||||||
|
10 bits at a time)
|
||||||
|
* There are 21 - 18 == 3 bits available to count. Except that there aren't,
|
||||||
|
because you need to allow for a few normal get_page() calls on the head page,
|
||||||
|
as well. Fortunately, the approach of using addition, rather than "hard"
|
||||||
|
bitfields, within page->_refcount, allows for sharing these bits gracefully.
|
||||||
|
But we're still looking at about 8 references.
|
||||||
|
|
||||||
|
This, however, is a missing feature more than anything else, because it's easily
|
||||||
|
solved by addressing an obvious inefficiency in the original get_user_pages()
|
||||||
|
approach of retrieving pages: stop treating all the pages as if they were
|
||||||
|
PAGE_SIZE. Retrieve huge pages as huge pages. The callers need to be aware of
|
||||||
|
this, so some work is required. Once that's in place, this limitation mostly
|
||||||
|
disappears from view, because there will be ample refcounting range available.
|
||||||
|
|
||||||
|
* Callers must specifically request "dma-pinned tracking of pages". In other
|
||||||
|
words, just calling get_user_pages() will not suffice; a new set of functions,
|
||||||
|
pin_user_pages() and related, must be used.
|
||||||
|
|
||||||
|
FOLL_PIN, FOLL_GET, FOLL_LONGTERM: when to use which flags
|
||||||
|
==========================================================
|
||||||
|
|
||||||
|
Thanks to Jan Kara, Vlastimil Babka and several other -mm people, for describing
|
||||||
|
these categories:
|
||||||
|
|
||||||
|
CASE 1: Direct IO (DIO)
|
||||||
|
-----------------------
|
||||||
|
There are GUP references to pages that are serving
|
||||||
|
as DIO buffers. These buffers are needed for a relatively short time (so they
|
||||||
|
are not "long term"). No special synchronization with page_mkclean() or
|
||||||
|
munmap() is provided. Therefore, flags to set at the call site are: ::
|
||||||
|
|
||||||
|
FOLL_PIN
|
||||||
|
|
||||||
|
...but rather than setting FOLL_PIN directly, call sites should use one of
|
||||||
|
the pin_user_pages*() routines that set FOLL_PIN.
|
||||||
|
|
||||||
|
CASE 2: RDMA
|
||||||
|
------------
|
||||||
|
There are GUP references to pages that are serving as DMA
|
||||||
|
buffers. These buffers are needed for a long time ("long term"). No special
|
||||||
|
synchronization with page_mkclean() or munmap() is provided. Therefore, flags
|
||||||
|
to set at the call site are: ::
|
||||||
|
|
||||||
|
FOLL_PIN | FOLL_LONGTERM
|
||||||
|
|
||||||
|
NOTE: Some pages, such as DAX pages, cannot be pinned with longterm pins. That's
|
||||||
|
because DAX pages do not have a separate page cache, and so "pinning" implies
|
||||||
|
locking down file system blocks, which is not (yet) supported in that way.
|
||||||
|
|
||||||
|
CASE 3: Hardware with page faulting support
|
||||||
|
-------------------------------------------
|
||||||
|
Here, a well-written driver doesn't normally need to pin pages at all. However,
|
||||||
|
if the driver does choose to do so, it can register MMU notifiers for the range,
|
||||||
|
and will be called back upon invalidation. Either way (avoiding page pinning, or
|
||||||
|
using MMU notifiers to unpin upon request), there is proper synchronization with
|
||||||
|
both filesystem and mm (page_mkclean(), munmap(), etc).
|
||||||
|
|
||||||
|
Therefore, neither flag needs to be set.
|
||||||
|
|
||||||
|
In this case, ideally, neither get_user_pages() nor pin_user_pages() should be
|
||||||
|
called. Instead, the software should be written so that it does not pin pages.
|
||||||
|
This allows mm and filesystems to operate more efficiently and reliably.
|
||||||
|
|
||||||
|
CASE 4: Pinning for struct page manipulation only
|
||||||
|
-------------------------------------------------
|
||||||
|
Here, normal GUP calls are sufficient, so neither flag needs to be set.
|
||||||
|
|
||||||
|
page_dma_pinned(): the whole point of pinning
|
||||||
|
=============================================
|
||||||
|
|
||||||
|
The whole point of marking pages as "DMA-pinned" or "gup-pinned" is to be able
|
||||||
|
to query, "is this page DMA-pinned?" That allows code such as page_mkclean()
|
||||||
|
(and file system writeback code in general) to make informed decisions about
|
||||||
|
what to do when a page cannot be unmapped due to such pins.
|
||||||
|
|
||||||
|
What to do in those cases is the subject of a years-long series of discussions
|
||||||
|
and debates (see the References at the end of this document). It's a TODO item
|
||||||
|
here: fill in the details once that's worked out. Meanwhile, it's safe to say
|
||||||
|
that having this available: ::
|
||||||
|
|
||||||
|
static inline bool page_dma_pinned(struct page *page)
|
||||||
|
|
||||||
|
...is a prerequisite to solving the long-running gup+DMA problem.
|
||||||
|
|
||||||
|
Another way of thinking about FOLL_GET, FOLL_PIN, and FOLL_LONGTERM
|
||||||
|
===================================================================
|
||||||
|
|
||||||
|
Another way of thinking about these flags is as a progression of restrictions:
|
||||||
|
FOLL_GET is for struct page manipulation, without affecting the data that the
|
||||||
|
struct page refers to. FOLL_PIN is a *replacement* for FOLL_GET, and is for
|
||||||
|
short term pins on pages whose data *will* get accessed. As such, FOLL_PIN is
|
||||||
|
a "more severe" form of pinning. And finally, FOLL_LONGTERM is an even more
|
||||||
|
restrictive case that has FOLL_PIN as a prerequisite: this is for pages that
|
||||||
|
will be pinned longterm, and whose data will be accessed.
|
||||||
|
|
||||||
|
Unit testing
|
||||||
|
============
|
||||||
|
This file::
|
||||||
|
|
||||||
|
tools/testing/selftests/vm/gup_benchmark.c
|
||||||
|
|
||||||
|
has the following new calls to exercise the new pin*() wrapper functions:
|
||||||
|
|
||||||
|
* PIN_FAST_BENCHMARK (./gup_benchmark -a)
|
||||||
|
* PIN_BENCHMARK (./gup_benchmark -b)
|
||||||
|
|
||||||
|
You can monitor how many total dma-pinned pages have been acquired and released
|
||||||
|
since the system was booted, via two new /proc/vmstat entries: ::
|
||||||
|
|
||||||
|
/proc/vmstat/nr_foll_pin_acquired
|
||||||
|
/proc/vmstat/nr_foll_pin_released
|
||||||
|
|
||||||
|
Those are both going to show zero, unless CONFIG_DEBUG_VM is set. This is
|
||||||
|
because there is a noticeable performance drop in unpin_user_page(), when they
|
||||||
|
are activated.
|
||||||
|
|
||||||
|
References
|
||||||
|
==========
|
||||||
|
|
||||||
|
* `Some slow progress on get_user_pages() (Apr 2, 2019) <https://lwn.net/Articles/784574/>`_
|
||||||
|
* `DMA and get_user_pages() (LPC: Dec 12, 2018) <https://lwn.net/Articles/774411/>`_
|
||||||
|
* `The trouble with get_user_pages() (Apr 30, 2018) <https://lwn.net/Articles/753027/>`_
|
||||||
|
|
||||||
|
John Hubbard, October, 2019
|
@@ -31,33 +31,23 @@ The counterparts to those functions are listed below.
|
|||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
int crypto_unregister_alg(struct crypto_alg *alg);
|
void crypto_unregister_alg(struct crypto_alg *alg);
|
||||||
int crypto_unregister_algs(struct crypto_alg *algs, int count);
|
void crypto_unregister_algs(struct crypto_alg *algs, int count);
|
||||||
|
|
||||||
|
|
||||||
Notice that both registration and unregistration functions do return a
|
The registration functions return 0 on success, or a negative errno
|
||||||
value, so make sure to handle errors. A return code of zero implies
|
value on failure. crypto_register_algs() succeeds only if it
|
||||||
success. Any return code < 0 implies an error.
|
successfully registered all the given algorithms; if it fails partway
|
||||||
|
through, then any changes are rolled back.
|
||||||
|
|
||||||
The bulk registration/unregistration functions register/unregister each
|
The unregistration functions always succeed, so they don't have a
|
||||||
transformation in the given array of length count. They handle errors as
|
return value. Don't try to unregister algorithms that aren't
|
||||||
follows:
|
currently registered.
|
||||||
|
|
||||||
- crypto_register_algs() succeeds if and only if it successfully
|
|
||||||
registers all the given transformations. If an error occurs partway
|
|
||||||
through, then it rolls back successful registrations before returning
|
|
||||||
the error code. Note that if a driver needs to handle registration
|
|
||||||
errors for individual transformations, then it will need to use the
|
|
||||||
non-bulk function crypto_register_alg() instead.
|
|
||||||
|
|
||||||
- crypto_unregister_algs() tries to unregister all the given
|
|
||||||
transformations, continuing on error. It logs errors and always
|
|
||||||
returns zero.
|
|
||||||
|
|
||||||
Single-Block Symmetric Ciphers [CIPHER]
|
Single-Block Symmetric Ciphers [CIPHER]
|
||||||
---------------------------------------
|
---------------------------------------
|
||||||
|
|
||||||
Example of transformations: aes, arc4, ...
|
Example of transformations: aes, serpent, ...
|
||||||
|
|
||||||
This section describes the simplest of all transformation
|
This section describes the simplest of all transformation
|
||||||
implementations, that being the CIPHER type used for symmetric ciphers.
|
implementations, that being the CIPHER type used for symmetric ciphers.
|
||||||
@@ -108,7 +98,7 @@ is also valid:
|
|||||||
Multi-Block Ciphers
|
Multi-Block Ciphers
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
Example of transformations: cbc(aes), ecb(arc4), ...
|
Example of transformations: cbc(aes), chacha20, ...
|
||||||
|
|
||||||
This section describes the multi-block cipher transformation
|
This section describes the multi-block cipher transformation
|
||||||
implementations. The multi-block ciphers are used for transformations
|
implementations. The multi-block ciphers are used for transformations
|
||||||
@@ -169,10 +159,10 @@ are as follows:
|
|||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
int crypto_unregister_ahash(struct ahash_alg *alg);
|
void crypto_unregister_ahash(struct ahash_alg *alg);
|
||||||
|
|
||||||
int crypto_unregister_shash(struct shash_alg *alg);
|
void crypto_unregister_shash(struct shash_alg *alg);
|
||||||
int crypto_unregister_shashes(struct shash_alg *algs, int count);
|
void crypto_unregister_shashes(struct shash_alg *algs, int count);
|
||||||
|
|
||||||
|
|
||||||
Cipher Definition With struct shash_alg and ahash_alg
|
Cipher Definition With struct shash_alg and ahash_alg
|
||||||
|
@@ -21,8 +21,8 @@ global variables yet.
|
|||||||
|
|
||||||
Tag-based KASAN is only supported in Clang and requires version 7.0.0 or later.
|
Tag-based KASAN is only supported in Clang and requires version 7.0.0 or later.
|
||||||
|
|
||||||
Currently generic KASAN is supported for the x86_64, arm64, xtensa and s390
|
Currently generic KASAN is supported for the x86_64, arm64, xtensa, s390 and
|
||||||
architectures, and tag-based KASAN is supported only for arm64.
|
riscv architectures, and tag-based KASAN is supported only for arm64.
|
||||||
|
|
||||||
Usage
|
Usage
|
||||||
-----
|
-----
|
||||||
|
@@ -29,7 +29,8 @@ Yes, well, mostly.
|
|||||||
|
|
||||||
For the most part, the KUnit core framework (what you use to write the tests)
|
For the most part, the KUnit core framework (what you use to write the tests)
|
||||||
can compile to any architecture; it compiles like just another part of the
|
can compile to any architecture; it compiles like just another part of the
|
||||||
kernel and runs when the kernel boots. However, there is some infrastructure,
|
kernel and runs when the kernel boots, or when built as a module, when the
|
||||||
|
module is loaded. However, there is some infrastructure,
|
||||||
like the KUnit Wrapper (``tools/testing/kunit/kunit.py``) that does not support
|
like the KUnit Wrapper (``tools/testing/kunit/kunit.py``) that does not support
|
||||||
other architectures.
|
other architectures.
|
||||||
|
|
||||||
|
@@ -49,6 +49,9 @@ to a standalone program that can be run like any other program directly inside
|
|||||||
of a host operating system; to be clear, it does not require any virtualization
|
of a host operating system; to be clear, it does not require any virtualization
|
||||||
support; it is just a regular program.
|
support; it is just a regular program.
|
||||||
|
|
||||||
|
Alternatively, kunit and kunit tests can be built as modules and tests will
|
||||||
|
run when the test module is loaded.
|
||||||
|
|
||||||
KUnit is fast. Excluding build time, from invocation to completion KUnit can run
|
KUnit is fast. Excluding build time, from invocation to completion KUnit can run
|
||||||
several dozen tests in only 10 to 20 seconds; this might not sound like a big
|
several dozen tests in only 10 to 20 seconds; this might not sound like a big
|
||||||
deal to some people, but having such fast and easy to run tests fundamentally
|
deal to some people, but having such fast and easy to run tests fundamentally
|
||||||
|
@@ -539,6 +539,23 @@ Interspersed in the kernel logs you might see the following:
|
|||||||
|
|
||||||
Congratulations, you just ran a KUnit test on the x86 architecture!
|
Congratulations, you just ran a KUnit test on the x86 architecture!
|
||||||
|
|
||||||
|
In a similar manner, kunit and kunit tests can also be built as modules,
|
||||||
|
so if you wanted to run tests in this way you might add the following config
|
||||||
|
options to your ``.config``:
|
||||||
|
|
||||||
|
.. code-block:: none
|
||||||
|
|
||||||
|
CONFIG_KUNIT=m
|
||||||
|
CONFIG_KUNIT_EXAMPLE_TEST=m
|
||||||
|
|
||||||
|
Once the kernel is built and installed, a simple
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
modprobe example-test
|
||||||
|
|
||||||
|
...will run the tests.
|
||||||
|
|
||||||
Writing new tests for other architectures
|
Writing new tests for other architectures
|
||||||
-----------------------------------------
|
-----------------------------------------
|
||||||
|
|
||||||
|
@@ -59,6 +59,7 @@ properties:
|
|||||||
- friendlyarm,nanopi-k2
|
- friendlyarm,nanopi-k2
|
||||||
- hardkernel,odroid-c2
|
- hardkernel,odroid-c2
|
||||||
- nexbox,a95x
|
- nexbox,a95x
|
||||||
|
- videostrong,kii-pro
|
||||||
- wetek,hub
|
- wetek,hub
|
||||||
- wetek,play2
|
- wetek,play2
|
||||||
- const: amlogic,meson-gxbb
|
- const: amlogic,meson-gxbb
|
||||||
@@ -104,6 +105,7 @@ properties:
|
|||||||
- enum:
|
- enum:
|
||||||
- amlogic,p230
|
- amlogic,p230
|
||||||
- amlogic,p231
|
- amlogic,p231
|
||||||
|
- libretech,aml-s905d-pc
|
||||||
- phicomm,n1
|
- phicomm,n1
|
||||||
- const: amlogic,s905d
|
- const: amlogic,s905d
|
||||||
- const: amlogic,meson-gxl
|
- const: amlogic,meson-gxl
|
||||||
@@ -115,6 +117,7 @@ properties:
|
|||||||
- amlogic,q201
|
- amlogic,q201
|
||||||
- khadas,vim2
|
- khadas,vim2
|
||||||
- kingnovel,r-box-pro
|
- kingnovel,r-box-pro
|
||||||
|
- libretech,aml-s912-pc
|
||||||
- nexbox,a1
|
- nexbox,a1
|
||||||
- tronsmart,vega-s96
|
- tronsmart,vega-s96
|
||||||
- const: amlogic,s912
|
- const: amlogic,s912
|
||||||
|
@@ -121,7 +121,7 @@ Required properties (in root node):
|
|||||||
Required nodes:
|
Required nodes:
|
||||||
|
|
||||||
- soc: some node of the RealView platforms must be the SoC
|
- soc: some node of the RealView platforms must be the SoC
|
||||||
node that contain the SoC-specific devices, withe the compatible
|
node that contain the SoC-specific devices, with the compatible
|
||||||
string set to one of these tuples:
|
string set to one of these tuples:
|
||||||
"arm,realview-eb-soc", "simple-bus"
|
"arm,realview-eb-soc", "simple-bus"
|
||||||
"arm,realview-pb1176-soc", "simple-bus"
|
"arm,realview-pb1176-soc", "simple-bus"
|
||||||
|
@@ -35,6 +35,16 @@ properties:
|
|||||||
- atmel,at91sam9x60
|
- atmel,at91sam9x60
|
||||||
- const: atmel,at91sam9
|
- const: atmel,at91sam9
|
||||||
|
|
||||||
|
- items:
|
||||||
|
- enum:
|
||||||
|
- overkiz,kizboxmini-base # Overkiz kizbox Mini Base Board
|
||||||
|
- overkiz,kizboxmini-mb # Overkiz kizbox Mini Mother Board
|
||||||
|
- overkiz,kizboxmini-rd # Overkiz kizbox Mini RailDIN
|
||||||
|
- overkiz,smartkiz # Overkiz SmartKiz Board
|
||||||
|
- const: atmel,at91sam9g25
|
||||||
|
- const: atmel,at91sam9x5
|
||||||
|
- const: atmel,at91sam9
|
||||||
|
|
||||||
- items:
|
- items:
|
||||||
- enum:
|
- enum:
|
||||||
- atmel,at91sam9g15
|
- atmel,at91sam9g15
|
||||||
@@ -52,11 +62,32 @@ properties:
|
|||||||
- const: atmel,sama5d2
|
- const: atmel,sama5d2
|
||||||
- const: atmel,sama5
|
- const: atmel,sama5
|
||||||
|
|
||||||
|
- description: Microchip SAMA5D27 WLSOM1
|
||||||
|
items:
|
||||||
|
- const: microchip,sama5d27-wlsom1
|
||||||
|
- const: atmel,sama5d27
|
||||||
|
- const: atmel,sama5d2
|
||||||
|
- const: atmel,sama5
|
||||||
|
|
||||||
|
- description: Microchip SAMA5D27 WLSOM1 Evaluation Kit
|
||||||
|
items:
|
||||||
|
- const: microchip,sama5d27-wlsom1-ek
|
||||||
|
- const: microchip,sama5d27-wlsom1
|
||||||
|
- const: atmel,sama5d27
|
||||||
|
- const: atmel,sama5d2
|
||||||
|
- const: atmel,sama5
|
||||||
|
|
||||||
- items:
|
- items:
|
||||||
- const: atmel,sama5d27
|
- const: atmel,sama5d27
|
||||||
- const: atmel,sama5d2
|
- const: atmel,sama5d2
|
||||||
- const: atmel,sama5
|
- const: atmel,sama5
|
||||||
|
|
||||||
|
- description: SAM9X60-EK board
|
||||||
|
items:
|
||||||
|
- const: microchip,sam9x60ek
|
||||||
|
- const: microchip,sam9x60
|
||||||
|
- const: atmel,at91sam9
|
||||||
|
|
||||||
- description: Nattis v2 board with Natte v2 power board
|
- description: Nattis v2 board with Natte v2 power board
|
||||||
items:
|
items:
|
||||||
- const: axentia,nattis-2
|
- const: axentia,nattis-2
|
||||||
|
@@ -10,6 +10,12 @@ PIT Timer required properties:
|
|||||||
- interrupts: Should contain interrupt for the PIT which is the IRQ line
|
- interrupts: Should contain interrupt for the PIT which is the IRQ line
|
||||||
shared across all System Controller members.
|
shared across all System Controller members.
|
||||||
|
|
||||||
|
PIT64B Timer required properties:
|
||||||
|
- compatible: Should be "microchip,sam9x60-pit64b"
|
||||||
|
- reg: Should contain registers location and length
|
||||||
|
- interrupts: Should contain interrupt for PIT64B timer
|
||||||
|
- clocks: Should contain the available clock sources for PIT64B timer.
|
||||||
|
|
||||||
System Timer (ST) required properties:
|
System Timer (ST) required properties:
|
||||||
- compatible: Should be "atmel,at91rm9200-st", "syscon", "simple-mfd"
|
- compatible: Should be "atmel,at91rm9200-st", "syscon", "simple-mfd"
|
||||||
- reg: Should contain registers location and length
|
- reg: Should contain registers location and length
|
||||||
@@ -39,6 +45,7 @@ RAMC SDRAM/DDR Controller required properties:
|
|||||||
"atmel,at91sam9260-sdramc",
|
"atmel,at91sam9260-sdramc",
|
||||||
"atmel,at91sam9g45-ddramc",
|
"atmel,at91sam9g45-ddramc",
|
||||||
"atmel,sama5d3-ddramc",
|
"atmel,sama5d3-ddramc",
|
||||||
|
"microchip,sam9x60-ddramc"
|
||||||
- reg: Should contain registers location and length
|
- reg: Should contain registers location and length
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
@@ -242,6 +242,21 @@ properties:
|
|||||||
|
|
||||||
where voltage is in V, frequency is in MHz.
|
where voltage is in V, frequency is in MHz.
|
||||||
|
|
||||||
|
power-domains:
|
||||||
|
$ref: '/schemas/types.yaml#/definitions/phandle-array'
|
||||||
|
description:
|
||||||
|
List of phandles and PM domain specifiers, as defined by bindings of the
|
||||||
|
PM domain provider (see also ../power_domain.txt).
|
||||||
|
|
||||||
|
power-domain-names:
|
||||||
|
$ref: '/schemas/types.yaml#/definitions/string-array'
|
||||||
|
description:
|
||||||
|
A list of power domain name strings sorted in the same order as the
|
||||||
|
power-domains property.
|
||||||
|
|
||||||
|
For PSCI based platforms, the name corresponding to the index of the PSCI
|
||||||
|
PM domain provider, must be "psci".
|
||||||
|
|
||||||
qcom,saw:
|
qcom,saw:
|
||||||
$ref: '/schemas/types.yaml#/definitions/phandle'
|
$ref: '/schemas/types.yaml#/definitions/phandle'
|
||||||
description: |
|
description: |
|
||||||
|
@@ -1,7 +1,7 @@
|
|||||||
# SPDX-License-Identifier: GPL-2.0
|
# SPDX-License-Identifier: GPL-2.0
|
||||||
%YAML 1.2
|
%YAML 1.2
|
||||||
---
|
---
|
||||||
$id: http://devicetree.org/schemas/bindings/arm/fsl.yaml#
|
$id: http://devicetree.org/schemas/arm/fsl.yaml#
|
||||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||||
|
|
||||||
title: Freescale i.MX Platforms Device Tree Bindings
|
title: Freescale i.MX Platforms Device Tree Bindings
|
||||||
@@ -128,6 +128,27 @@ properties:
|
|||||||
- variscite,dt6customboard
|
- variscite,dt6customboard
|
||||||
- const: fsl,imx6q
|
- const: fsl,imx6q
|
||||||
|
|
||||||
|
- description: i.MX6Q Gateworks Ventana Boards
|
||||||
|
items:
|
||||||
|
- enum:
|
||||||
|
- gw,imx6q-gw51xx
|
||||||
|
- gw,imx6q-gw52xx
|
||||||
|
- gw,imx6q-gw53xx
|
||||||
|
- gw,imx6q-gw5400-a
|
||||||
|
- gw,imx6q-gw54xx
|
||||||
|
- gw,imx6q-gw551x
|
||||||
|
- gw,imx6q-gw552x
|
||||||
|
- gw,imx6q-gw553x
|
||||||
|
- gw,imx6q-gw560x
|
||||||
|
- gw,imx6q-gw5903
|
||||||
|
- gw,imx6q-gw5904
|
||||||
|
- gw,imx6q-gw5907
|
||||||
|
- gw,imx6q-gw5910
|
||||||
|
- gw,imx6q-gw5912
|
||||||
|
- gw,imx6q-gw5913
|
||||||
|
- const: gw,ventana
|
||||||
|
- const: fsl,imx6q
|
||||||
|
|
||||||
- description: i.MX6QP based Boards
|
- description: i.MX6QP based Boards
|
||||||
items:
|
items:
|
||||||
- enum:
|
- enum:
|
||||||
@@ -154,10 +175,31 @@ properties:
|
|||||||
- ysoft,imx6dl-yapp4-ursa # i.MX6 Solo Y Soft IOTA Ursa board
|
- ysoft,imx6dl-yapp4-ursa # i.MX6 Solo Y Soft IOTA Ursa board
|
||||||
- const: fsl,imx6dl
|
- const: fsl,imx6dl
|
||||||
|
|
||||||
|
- description: i.MX6DL Gateworks Ventana Boards
|
||||||
|
items:
|
||||||
|
- enum:
|
||||||
|
- gw,imx6dl-gw51xx
|
||||||
|
- gw,imx6dl-gw52xx
|
||||||
|
- gw,imx6dl-gw53xx
|
||||||
|
- gw,imx6dl-gw54xx
|
||||||
|
- gw,imx6dl-gw551x
|
||||||
|
- gw,imx6dl-gw552x
|
||||||
|
- gw,imx6dl-gw553x
|
||||||
|
- gw,imx6dl-gw560x
|
||||||
|
- gw,imx6dl-gw5903
|
||||||
|
- gw,imx6dl-gw5904
|
||||||
|
- gw,imx6dl-gw5907
|
||||||
|
- gw,imx6dl-gw5910
|
||||||
|
- gw,imx6dl-gw5912
|
||||||
|
- gw,imx6dl-gw5913
|
||||||
|
- const: gw,ventana
|
||||||
|
- const: fsl,imx6dl
|
||||||
|
|
||||||
- description: i.MX6SL based Boards
|
- description: i.MX6SL based Boards
|
||||||
items:
|
items:
|
||||||
- enum:
|
- enum:
|
||||||
- fsl,imx6sl-evk # i.MX6 SoloLite EVK Board
|
- fsl,imx6sl-evk # i.MX6 SoloLite EVK Board
|
||||||
|
- kobo,tolino-shine3
|
||||||
- const: fsl,imx6sl
|
- const: fsl,imx6sl
|
||||||
|
|
||||||
- description: i.MX6SLL based Boards
|
- description: i.MX6SLL based Boards
|
||||||
@@ -172,6 +214,7 @@ properties:
|
|||||||
- enum:
|
- enum:
|
||||||
- fsl,imx6sx-sabreauto # i.MX6 SoloX Sabre Auto Board
|
- fsl,imx6sx-sabreauto # i.MX6 SoloX Sabre Auto Board
|
||||||
- fsl,imx6sx-sdb # i.MX6 SoloX SDB Board
|
- fsl,imx6sx-sdb # i.MX6 SoloX SDB Board
|
||||||
|
- fsl,imx6sx-sdb-reva # i.MX6 SoloX SDB Rev-A Board
|
||||||
- const: fsl,imx6sx
|
- const: fsl,imx6sx
|
||||||
|
|
||||||
- description: i.MX6UL based Boards
|
- description: i.MX6UL based Boards
|
||||||
@@ -239,6 +282,7 @@ properties:
|
|||||||
items:
|
items:
|
||||||
- enum:
|
- enum:
|
||||||
- fsl,imx7d-sdb # i.MX7 SabreSD Board
|
- fsl,imx7d-sdb # i.MX7 SabreSD Board
|
||||||
|
- fsl,imx7d-sdb-reva # i.MX7 SabreSD Rev-A Board
|
||||||
- novtech,imx7d-meerkat96 # i.MX7 Meerkat96 Board
|
- novtech,imx7d-meerkat96 # i.MX7 Meerkat96 Board
|
||||||
- toradex,colibri-imx7d # Colibri iMX7 Dual Module
|
- toradex,colibri-imx7d # Colibri iMX7 Dual Module
|
||||||
- toradex,colibri-imx7d-emmc # Colibri iMX7 Dual 1GB (eMMC) Module
|
- toradex,colibri-imx7d-emmc # Colibri iMX7 Dual 1GB (eMMC) Module
|
||||||
@@ -263,6 +307,7 @@ properties:
|
|||||||
- description: i.MX7ULP based Boards
|
- description: i.MX7ULP based Boards
|
||||||
items:
|
items:
|
||||||
- enum:
|
- enum:
|
||||||
|
- ea,imx7ulp-com # i.MX7ULP Embedded Artists COM Board
|
||||||
- fsl,imx7ulp-evk # i.MX7ULP Evaluation Kit
|
- fsl,imx7ulp-evk # i.MX7ULP Evaluation Kit
|
||||||
- const: fsl,imx7ulp
|
- const: fsl,imx7ulp
|
||||||
|
|
||||||
@@ -283,7 +328,9 @@ properties:
|
|||||||
items:
|
items:
|
||||||
- enum:
|
- enum:
|
||||||
- boundary,imx8mq-nitrogen8m # i.MX8MQ NITROGEN Board
|
- boundary,imx8mq-nitrogen8m # i.MX8MQ NITROGEN Board
|
||||||
|
- einfochips,imx8mq-thor96 # i.MX8MQ Thor96 Board
|
||||||
- fsl,imx8mq-evk # i.MX8MQ EVK Board
|
- fsl,imx8mq-evk # i.MX8MQ EVK Board
|
||||||
|
- google,imx8mq-phanbell # Google Coral Edge TPU
|
||||||
- purism,librem5-devkit # Purism Librem5 devkit
|
- purism,librem5-devkit # Purism Librem5 devkit
|
||||||
- solidrun,hummingboard-pulse # SolidRun Hummingboard Pulse
|
- solidrun,hummingboard-pulse # SolidRun Hummingboard Pulse
|
||||||
- technexion,pico-pi-imx8m # TechNexion PICO-PI-8M evk
|
- technexion,pico-pi-imx8m # TechNexion PICO-PI-8M evk
|
||||||
@@ -385,6 +432,13 @@ properties:
|
|||||||
- fsl,ls2088a-rdb
|
- fsl,ls2088a-rdb
|
||||||
- const: fsl,ls2088a
|
- const: fsl,ls2088a
|
||||||
|
|
||||||
|
- description: LX2160A based Boards
|
||||||
|
items:
|
||||||
|
- enum:
|
||||||
|
- fsl,lx2160a-qds
|
||||||
|
- fsl,lx2160a-rdb
|
||||||
|
- const: fsl,lx2160a
|
||||||
|
|
||||||
- description: S32V234 based Boards
|
- description: S32V234 based Boards
|
||||||
items:
|
items:
|
||||||
- enum:
|
- enum:
|
||||||
|
@@ -1,706 +0,0 @@
|
|||||||
==========================================
|
|
||||||
ARM idle states binding description
|
|
||||||
==========================================
|
|
||||||
|
|
||||||
==========================================
|
|
||||||
1 - Introduction
|
|
||||||
==========================================
|
|
||||||
|
|
||||||
ARM systems contain HW capable of managing power consumption dynamically,
|
|
||||||
where cores can be put in different low-power states (ranging from simple
|
|
||||||
wfi to power gating) according to OS PM policies. The CPU states representing
|
|
||||||
the range of dynamic idle states that a processor can enter at run-time, can be
|
|
||||||
specified through device tree bindings representing the parameters required
|
|
||||||
to enter/exit specific idle states on a given processor.
|
|
||||||
|
|
||||||
According to the Server Base System Architecture document (SBSA, [3]), the
|
|
||||||
power states an ARM CPU can be put into are identified by the following list:
|
|
||||||
|
|
||||||
- Running
|
|
||||||
- Idle_standby
|
|
||||||
- Idle_retention
|
|
||||||
- Sleep
|
|
||||||
- Off
|
|
||||||
|
|
||||||
The power states described in the SBSA document define the basic CPU states on
|
|
||||||
top of which ARM platforms implement power management schemes that allow an OS
|
|
||||||
PM implementation to put the processor in different idle states (which include
|
|
||||||
states listed above; "off" state is not an idle state since it does not have
|
|
||||||
wake-up capabilities, hence it is not considered in this document).
|
|
||||||
|
|
||||||
Idle state parameters (e.g. entry latency) are platform specific and need to be
|
|
||||||
characterized with bindings that provide the required information to OS PM
|
|
||||||
code so that it can build the required tables and use them at runtime.
|
|
||||||
|
|
||||||
The device tree binding definition for ARM idle states is the subject of this
|
|
||||||
document.
|
|
||||||
|
|
||||||
===========================================
|
|
||||||
2 - idle-states definitions
|
|
||||||
===========================================
|
|
||||||
|
|
||||||
Idle states are characterized for a specific system through a set of
|
|
||||||
timing and energy related properties, that underline the HW behaviour
|
|
||||||
triggered upon idle states entry and exit.
|
|
||||||
|
|
||||||
The following diagram depicts the CPU execution phases and related timing
|
|
||||||
properties required to enter and exit an idle state:
|
|
||||||
|
|
||||||
..__[EXEC]__|__[PREP]__|__[ENTRY]__|__[IDLE]__|__[EXIT]__|__[EXEC]__..
|
|
||||||
| | | | |
|
|
||||||
|
|
||||||
|<------ entry ------->|
|
|
||||||
| latency |
|
|
||||||
|<- exit ->|
|
|
||||||
| latency |
|
|
||||||
|<-------- min-residency -------->|
|
|
||||||
|<------- wakeup-latency ------->|
|
|
||||||
|
|
||||||
Diagram 1: CPU idle state execution phases
|
|
||||||
|
|
||||||
EXEC: Normal CPU execution.
|
|
||||||
|
|
||||||
PREP: Preparation phase before committing the hardware to idle mode
|
|
||||||
like cache flushing. This is abortable on pending wake-up
|
|
||||||
event conditions. The abort latency is assumed to be negligible
|
|
||||||
(i.e. less than the ENTRY + EXIT duration). If aborted, CPU
|
|
||||||
goes back to EXEC. This phase is optional. If not abortable,
|
|
||||||
this should be included in the ENTRY phase instead.
|
|
||||||
|
|
||||||
ENTRY: The hardware is committed to idle mode. This period must run
|
|
||||||
to completion up to IDLE before anything else can happen.
|
|
||||||
|
|
||||||
IDLE: This is the actual energy-saving idle period. This may last
|
|
||||||
between 0 and infinite time, until a wake-up event occurs.
|
|
||||||
|
|
||||||
EXIT: Period during which the CPU is brought back to operational
|
|
||||||
mode (EXEC).
|
|
||||||
|
|
||||||
entry-latency: Worst case latency required to enter the idle state. The
|
|
||||||
exit-latency may be guaranteed only after entry-latency has passed.
|
|
||||||
|
|
||||||
min-residency: Minimum period, including preparation and entry, for a given
|
|
||||||
idle state to be worthwhile energywise.
|
|
||||||
|
|
||||||
wakeup-latency: Maximum delay between the signaling of a wake-up event and the
|
|
||||||
CPU being able to execute normal code again. If not specified, this is assumed
|
|
||||||
to be entry-latency + exit-latency.
|
|
||||||
|
|
||||||
These timing parameters can be used by an OS in different circumstances.
|
|
||||||
|
|
||||||
An idle CPU requires the expected min-residency time to select the most
|
|
||||||
appropriate idle state based on the expected expiry time of the next IRQ
|
|
||||||
(i.e. wake-up) that causes the CPU to return to the EXEC phase.
|
|
||||||
|
|
||||||
An operating system scheduler may need to compute the shortest wake-up delay
|
|
||||||
for CPUs in the system by detecting how long will it take to get a CPU out
|
|
||||||
of an idle state, e.g.:
|
|
||||||
|
|
||||||
wakeup-delay = exit-latency + max(entry-latency - (now - entry-timestamp), 0)
|
|
||||||
|
|
||||||
In other words, the scheduler can make its scheduling decision by selecting
|
|
||||||
(e.g. waking-up) the CPU with the shortest wake-up delay.
|
|
||||||
The wake-up delay must take into account the entry latency if that period
|
|
||||||
has not expired. The abortable nature of the PREP period can be ignored
|
|
||||||
if it cannot be relied upon (e.g. the PREP deadline may occur much sooner than
|
|
||||||
the worst case since it depends on the CPU operating conditions, i.e. caches
|
|
||||||
state).
|
|
||||||
|
|
||||||
An OS has to reliably probe the wakeup-latency since some devices can enforce
|
|
||||||
latency constraint guarantees to work properly, so the OS has to detect the
|
|
||||||
worst case wake-up latency it can incur if a CPU is allowed to enter an
|
|
||||||
idle state, and possibly to prevent that to guarantee reliable device
|
|
||||||
functioning.
|
|
||||||
|
|
||||||
The min-residency time parameter deserves further explanation since it is
|
|
||||||
expressed in time units but must factor in energy consumption coefficients.
|
|
||||||
|
|
||||||
The energy consumption of a cpu when it enters a power state can be roughly
|
|
||||||
characterised by the following graph:
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
|
||||||
|
|
|
||||||
e |
|
|
||||||
n | /---
|
|
||||||
e | /------
|
|
||||||
r | /------
|
|
||||||
g | /-----
|
|
||||||
y | /------
|
|
||||||
| ----
|
|
||||||
| /|
|
|
||||||
| / |
|
|
||||||
| / |
|
|
||||||
| / |
|
|
||||||
| / |
|
|
||||||
| / |
|
|
||||||
|/ |
|
|
||||||
-----|-------+----------------------------------
|
|
||||||
0| 1 time(ms)
|
|
||||||
|
|
||||||
Graph 1: Energy vs time example
|
|
||||||
|
|
||||||
The graph is split in two parts delimited by time 1ms on the X-axis.
|
|
||||||
The graph curve with X-axis values = { x | 0 < x < 1ms } has a steep slope
|
|
||||||
and denotes the energy costs incurred while entering and leaving the idle
|
|
||||||
state.
|
|
||||||
The graph curve in the area delimited by X-axis values = {x | x > 1ms } has
|
|
||||||
shallower slope and essentially represents the energy consumption of the idle
|
|
||||||
state.
|
|
||||||
|
|
||||||
min-residency is defined for a given idle state as the minimum expected
|
|
||||||
residency time for a state (inclusive of preparation and entry) after
|
|
||||||
which choosing that state become the most energy efficient option. A good
|
|
||||||
way to visualise this, is by taking the same graph above and comparing some
|
|
||||||
states energy consumptions plots.
|
|
||||||
|
|
||||||
For sake of simplicity, let's consider a system with two idle states IDLE1,
|
|
||||||
and IDLE2:
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
|
||||||
|
|
|
||||||
| /-- IDLE1
|
|
||||||
e | /---
|
|
||||||
n | /----
|
|
||||||
e | /---
|
|
||||||
r | /-----/--------- IDLE2
|
|
||||||
g | /-------/---------
|
|
||||||
y | ------------ /---|
|
|
||||||
| / /---- |
|
|
||||||
| / /--- |
|
|
||||||
| / /---- |
|
|
||||||
| / /--- |
|
|
||||||
| --- |
|
|
||||||
| / |
|
|
||||||
| / |
|
|
||||||
|/ | time
|
|
||||||
---/----------------------------+------------------------
|
|
||||||
|IDLE1-energy < IDLE2-energy | IDLE2-energy < IDLE1-energy
|
|
||||||
|
|
|
||||||
IDLE2-min-residency
|
|
||||||
|
|
||||||
Graph 2: idle states min-residency example
|
|
||||||
|
|
||||||
In graph 2 above, that takes into account idle states entry/exit energy
|
|
||||||
costs, it is clear that if the idle state residency time (i.e. time till next
|
|
||||||
wake-up IRQ) is less than IDLE2-min-residency, IDLE1 is the better idle state
|
|
||||||
choice energywise.
|
|
||||||
|
|
||||||
This is mainly down to the fact that IDLE1 entry/exit energy costs are lower
|
|
||||||
than IDLE2.
|
|
||||||
|
|
||||||
However, the lower power consumption (i.e. shallower energy curve slope) of
|
|
||||||
idle state IDLE2 implies that after a suitable time, IDLE2 becomes more energy
|
|
||||||
efficient.
|
|
||||||
|
|
||||||
The time at which IDLE2 becomes more energy efficient than IDLE1 (and other
|
|
||||||
shallower states in a system with multiple idle states) is defined
|
|
||||||
IDLE2-min-residency and corresponds to the time when energy consumption of
|
|
||||||
IDLE1 and IDLE2 states breaks even.
|
|
||||||
|
|
||||||
The definitions provided in this section underpin the idle states
|
|
||||||
properties specification that is the subject of the following sections.
|
|
||||||
|
|
||||||
===========================================
|
|
||||||
3 - idle-states node
|
|
||||||
===========================================
|
|
||||||
|
|
||||||
ARM processor idle states are defined within the idle-states node, which is
|
|
||||||
a direct child of the cpus node [1] and provides a container where the
|
|
||||||
processor idle states, defined as device tree nodes, are listed.
|
|
||||||
|
|
||||||
- idle-states node
|
|
||||||
|
|
||||||
Usage: Optional - On ARM systems, it is a container of processor idle
|
|
||||||
states nodes. If the system does not provide CPU
|
|
||||||
power management capabilities, or the processor just
|
|
||||||
supports idle_standby, an idle-states node is not
|
|
||||||
required.
|
|
||||||
|
|
||||||
Description: idle-states node is a container node, where its
|
|
||||||
subnodes describe the CPU idle states.
|
|
||||||
|
|
||||||
Node name must be "idle-states".
|
|
||||||
|
|
||||||
The idle-states node's parent node must be the cpus node.
|
|
||||||
|
|
||||||
The idle-states node's child nodes can be:
|
|
||||||
|
|
||||||
- one or more state nodes
|
|
||||||
|
|
||||||
Any other configuration is considered invalid.
|
|
||||||
|
|
||||||
An idle-states node defines the following properties:
|
|
||||||
|
|
||||||
- entry-method
|
|
||||||
Value type: <stringlist>
|
|
||||||
Usage and definition depend on ARM architecture version.
|
|
||||||
# On ARM v8 64-bit this property is required and must
|
|
||||||
be:
|
|
||||||
- "psci"
|
|
||||||
# On ARM 32-bit systems this property is optional
|
|
||||||
|
|
||||||
This assumes that the "enable-method" property is set to "psci" in the cpu
|
|
||||||
node[6] that is responsible for setting up CPU idle management in the OS
|
|
||||||
implementation.
|
|
||||||
|
|
||||||
The nodes describing the idle states (state) can only be defined
|
|
||||||
within the idle-states node, any other configuration is considered invalid
|
|
||||||
and therefore must be ignored.
|
|
||||||
|
|
||||||
===========================================
|
|
||||||
4 - state node
|
|
||||||
===========================================
|
|
||||||
|
|
||||||
A state node represents an idle state description and must be defined as
|
|
||||||
follows:
|
|
||||||
|
|
||||||
- state node
|
|
||||||
|
|
||||||
Description: must be child of the idle-states node
|
|
||||||
|
|
||||||
The state node name shall follow standard device tree naming
|
|
||||||
rules ([5], 2.2.1 "Node names"), in particular state nodes which
|
|
||||||
are siblings within a single common parent must be given a unique name.
|
|
||||||
|
|
||||||
The idle state entered by executing the wfi instruction (idle_standby
|
|
||||||
SBSA,[3][4]) is considered standard on all ARM platforms and therefore
|
|
||||||
must not be listed.
|
|
||||||
|
|
||||||
With the definitions provided above, the following list represents
|
|
||||||
the valid properties for a state node:
|
|
||||||
|
|
||||||
- compatible
|
|
||||||
Usage: Required
|
|
||||||
Value type: <stringlist>
|
|
||||||
Definition: Must be "arm,idle-state".
|
|
||||||
|
|
||||||
- local-timer-stop
|
|
||||||
Usage: See definition
|
|
||||||
Value type: <none>
|
|
||||||
Definition: if present the CPU local timer control logic is
|
|
||||||
lost on state entry, otherwise it is retained.
|
|
||||||
|
|
||||||
- entry-latency-us
|
|
||||||
Usage: Required
|
|
||||||
Value type: <prop-encoded-array>
|
|
||||||
Definition: u32 value representing worst case latency in
|
|
||||||
microseconds required to enter the idle state.
|
|
||||||
|
|
||||||
- exit-latency-us
|
|
||||||
Usage: Required
|
|
||||||
Value type: <prop-encoded-array>
|
|
||||||
Definition: u32 value representing worst case latency
|
|
||||||
in microseconds required to exit the idle state.
|
|
||||||
The exit-latency-us duration may be guaranteed
|
|
||||||
only after entry-latency-us has passed.
|
|
||||||
|
|
||||||
- min-residency-us
|
|
||||||
Usage: Required
|
|
||||||
Value type: <prop-encoded-array>
|
|
||||||
Definition: u32 value representing minimum residency duration
|
|
||||||
in microseconds, inclusive of preparation and
|
|
||||||
entry, for this idle state to be considered
|
|
||||||
worthwhile energy wise (refer to section 2 of
|
|
||||||
this document for a complete description).
|
|
||||||
|
|
||||||
- wakeup-latency-us:
|
|
||||||
Usage: Optional
|
|
||||||
Value type: <prop-encoded-array>
|
|
||||||
Definition: u32 value representing maximum delay between the
|
|
||||||
signaling of a wake-up event and the CPU being
|
|
||||||
able to execute normal code again. If omitted,
|
|
||||||
this is assumed to be equal to:
|
|
||||||
|
|
||||||
entry-latency-us + exit-latency-us
|
|
||||||
|
|
||||||
It is important to supply this value on systems
|
|
||||||
where the duration of PREP phase (see diagram 1,
|
|
||||||
section 2) is non-negligible.
|
|
||||||
In such systems entry-latency-us + exit-latency-us
|
|
||||||
will exceed wakeup-latency-us by this duration.
|
|
||||||
|
|
||||||
- status:
|
|
||||||
Usage: Optional
|
|
||||||
Value type: <string>
|
|
||||||
Definition: A standard device tree property [5] that indicates
|
|
||||||
the operational status of an idle-state.
|
|
||||||
If present, it shall be:
|
|
||||||
"okay": to indicate that the idle state is
|
|
||||||
operational.
|
|
||||||
"disabled": to indicate that the idle state has
|
|
||||||
been disabled in firmware so it is not
|
|
||||||
operational.
|
|
||||||
If the property is not present the idle-state must
|
|
||||||
be considered operational.
|
|
||||||
|
|
||||||
- idle-state-name:
|
|
||||||
Usage: Optional
|
|
||||||
Value type: <string>
|
|
||||||
Definition: A string used as a descriptive name for the idle
|
|
||||||
state.
|
|
||||||
|
|
||||||
In addition to the properties listed above, a state node may require
|
|
||||||
additional properties specific to the entry-method defined in the
|
|
||||||
idle-states node. Please refer to the entry-method bindings
|
|
||||||
documentation for properties definitions.
|
|
||||||
|
|
||||||
===========================================
|
|
||||||
4 - Examples
|
|
||||||
===========================================
|
|
||||||
|
|
||||||
Example 1 (ARM 64-bit, 16-cpu system, PSCI enable-method):
|
|
||||||
|
|
||||||
cpus {
|
|
||||||
#size-cells = <0>;
|
|
||||||
#address-cells = <2>;
|
|
||||||
|
|
||||||
CPU0: cpu@0 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a57";
|
|
||||||
reg = <0x0 0x0>;
|
|
||||||
enable-method = "psci";
|
|
||||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
|
||||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU1: cpu@1 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a57";
|
|
||||||
reg = <0x0 0x1>;
|
|
||||||
enable-method = "psci";
|
|
||||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
|
||||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU2: cpu@100 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a57";
|
|
||||||
reg = <0x0 0x100>;
|
|
||||||
enable-method = "psci";
|
|
||||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
|
||||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU3: cpu@101 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a57";
|
|
||||||
reg = <0x0 0x101>;
|
|
||||||
enable-method = "psci";
|
|
||||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
|
||||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU4: cpu@10000 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a57";
|
|
||||||
reg = <0x0 0x10000>;
|
|
||||||
enable-method = "psci";
|
|
||||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
|
||||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU5: cpu@10001 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a57";
|
|
||||||
reg = <0x0 0x10001>;
|
|
||||||
enable-method = "psci";
|
|
||||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
|
||||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU6: cpu@10100 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a57";
|
|
||||||
reg = <0x0 0x10100>;
|
|
||||||
enable-method = "psci";
|
|
||||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
|
||||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU7: cpu@10101 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a57";
|
|
||||||
reg = <0x0 0x10101>;
|
|
||||||
enable-method = "psci";
|
|
||||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
|
||||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU8: cpu@100000000 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a53";
|
|
||||||
reg = <0x1 0x0>;
|
|
||||||
enable-method = "psci";
|
|
||||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
|
||||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU9: cpu@100000001 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a53";
|
|
||||||
reg = <0x1 0x1>;
|
|
||||||
enable-method = "psci";
|
|
||||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
|
||||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU10: cpu@100000100 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a53";
|
|
||||||
reg = <0x1 0x100>;
|
|
||||||
enable-method = "psci";
|
|
||||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
|
||||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU11: cpu@100000101 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a53";
|
|
||||||
reg = <0x1 0x101>;
|
|
||||||
enable-method = "psci";
|
|
||||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
|
||||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU12: cpu@100010000 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a53";
|
|
||||||
reg = <0x1 0x10000>;
|
|
||||||
enable-method = "psci";
|
|
||||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
|
||||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU13: cpu@100010001 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a53";
|
|
||||||
reg = <0x1 0x10001>;
|
|
||||||
enable-method = "psci";
|
|
||||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
|
||||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU14: cpu@100010100 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a53";
|
|
||||||
reg = <0x1 0x10100>;
|
|
||||||
enable-method = "psci";
|
|
||||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
|
||||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU15: cpu@100010101 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a53";
|
|
||||||
reg = <0x1 0x10101>;
|
|
||||||
enable-method = "psci";
|
|
||||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
|
||||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
|
||||||
};
|
|
||||||
|
|
||||||
idle-states {
|
|
||||||
entry-method = "psci";
|
|
||||||
|
|
||||||
CPU_RETENTION_0_0: cpu-retention-0-0 {
|
|
||||||
compatible = "arm,idle-state";
|
|
||||||
arm,psci-suspend-param = <0x0010000>;
|
|
||||||
entry-latency-us = <20>;
|
|
||||||
exit-latency-us = <40>;
|
|
||||||
min-residency-us = <80>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CLUSTER_RETENTION_0: cluster-retention-0 {
|
|
||||||
compatible = "arm,idle-state";
|
|
||||||
local-timer-stop;
|
|
||||||
arm,psci-suspend-param = <0x1010000>;
|
|
||||||
entry-latency-us = <50>;
|
|
||||||
exit-latency-us = <100>;
|
|
||||||
min-residency-us = <250>;
|
|
||||||
wakeup-latency-us = <130>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU_SLEEP_0_0: cpu-sleep-0-0 {
|
|
||||||
compatible = "arm,idle-state";
|
|
||||||
local-timer-stop;
|
|
||||||
arm,psci-suspend-param = <0x0010000>;
|
|
||||||
entry-latency-us = <250>;
|
|
||||||
exit-latency-us = <500>;
|
|
||||||
min-residency-us = <950>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CLUSTER_SLEEP_0: cluster-sleep-0 {
|
|
||||||
compatible = "arm,idle-state";
|
|
||||||
local-timer-stop;
|
|
||||||
arm,psci-suspend-param = <0x1010000>;
|
|
||||||
entry-latency-us = <600>;
|
|
||||||
exit-latency-us = <1100>;
|
|
||||||
min-residency-us = <2700>;
|
|
||||||
wakeup-latency-us = <1500>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU_RETENTION_1_0: cpu-retention-1-0 {
|
|
||||||
compatible = "arm,idle-state";
|
|
||||||
arm,psci-suspend-param = <0x0010000>;
|
|
||||||
entry-latency-us = <20>;
|
|
||||||
exit-latency-us = <40>;
|
|
||||||
min-residency-us = <90>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CLUSTER_RETENTION_1: cluster-retention-1 {
|
|
||||||
compatible = "arm,idle-state";
|
|
||||||
local-timer-stop;
|
|
||||||
arm,psci-suspend-param = <0x1010000>;
|
|
||||||
entry-latency-us = <50>;
|
|
||||||
exit-latency-us = <100>;
|
|
||||||
min-residency-us = <270>;
|
|
||||||
wakeup-latency-us = <100>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU_SLEEP_1_0: cpu-sleep-1-0 {
|
|
||||||
compatible = "arm,idle-state";
|
|
||||||
local-timer-stop;
|
|
||||||
arm,psci-suspend-param = <0x0010000>;
|
|
||||||
entry-latency-us = <70>;
|
|
||||||
exit-latency-us = <100>;
|
|
||||||
min-residency-us = <300>;
|
|
||||||
wakeup-latency-us = <150>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CLUSTER_SLEEP_1: cluster-sleep-1 {
|
|
||||||
compatible = "arm,idle-state";
|
|
||||||
local-timer-stop;
|
|
||||||
arm,psci-suspend-param = <0x1010000>;
|
|
||||||
entry-latency-us = <500>;
|
|
||||||
exit-latency-us = <1200>;
|
|
||||||
min-residency-us = <3500>;
|
|
||||||
wakeup-latency-us = <1300>;
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
Example 2 (ARM 32-bit, 8-cpu system, two clusters):
|
|
||||||
|
|
||||||
cpus {
|
|
||||||
#size-cells = <0>;
|
|
||||||
#address-cells = <1>;
|
|
||||||
|
|
||||||
CPU0: cpu@0 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a15";
|
|
||||||
reg = <0x0>;
|
|
||||||
cpu-idle-states = <&CPU_SLEEP_0_0 &CLUSTER_SLEEP_0>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU1: cpu@1 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a15";
|
|
||||||
reg = <0x1>;
|
|
||||||
cpu-idle-states = <&CPU_SLEEP_0_0 &CLUSTER_SLEEP_0>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU2: cpu@2 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a15";
|
|
||||||
reg = <0x2>;
|
|
||||||
cpu-idle-states = <&CPU_SLEEP_0_0 &CLUSTER_SLEEP_0>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU3: cpu@3 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a15";
|
|
||||||
reg = <0x3>;
|
|
||||||
cpu-idle-states = <&CPU_SLEEP_0_0 &CLUSTER_SLEEP_0>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU4: cpu@100 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a7";
|
|
||||||
reg = <0x100>;
|
|
||||||
cpu-idle-states = <&CPU_SLEEP_1_0 &CLUSTER_SLEEP_1>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU5: cpu@101 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a7";
|
|
||||||
reg = <0x101>;
|
|
||||||
cpu-idle-states = <&CPU_SLEEP_1_0 &CLUSTER_SLEEP_1>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU6: cpu@102 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a7";
|
|
||||||
reg = <0x102>;
|
|
||||||
cpu-idle-states = <&CPU_SLEEP_1_0 &CLUSTER_SLEEP_1>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU7: cpu@103 {
|
|
||||||
device_type = "cpu";
|
|
||||||
compatible = "arm,cortex-a7";
|
|
||||||
reg = <0x103>;
|
|
||||||
cpu-idle-states = <&CPU_SLEEP_1_0 &CLUSTER_SLEEP_1>;
|
|
||||||
};
|
|
||||||
|
|
||||||
idle-states {
|
|
||||||
CPU_SLEEP_0_0: cpu-sleep-0-0 {
|
|
||||||
compatible = "arm,idle-state";
|
|
||||||
local-timer-stop;
|
|
||||||
entry-latency-us = <200>;
|
|
||||||
exit-latency-us = <100>;
|
|
||||||
min-residency-us = <400>;
|
|
||||||
wakeup-latency-us = <250>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CLUSTER_SLEEP_0: cluster-sleep-0 {
|
|
||||||
compatible = "arm,idle-state";
|
|
||||||
local-timer-stop;
|
|
||||||
entry-latency-us = <500>;
|
|
||||||
exit-latency-us = <1500>;
|
|
||||||
min-residency-us = <2500>;
|
|
||||||
wakeup-latency-us = <1700>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CPU_SLEEP_1_0: cpu-sleep-1-0 {
|
|
||||||
compatible = "arm,idle-state";
|
|
||||||
local-timer-stop;
|
|
||||||
entry-latency-us = <300>;
|
|
||||||
exit-latency-us = <500>;
|
|
||||||
min-residency-us = <900>;
|
|
||||||
wakeup-latency-us = <600>;
|
|
||||||
};
|
|
||||||
|
|
||||||
CLUSTER_SLEEP_1: cluster-sleep-1 {
|
|
||||||
compatible = "arm,idle-state";
|
|
||||||
local-timer-stop;
|
|
||||||
entry-latency-us = <800>;
|
|
||||||
exit-latency-us = <2000>;
|
|
||||||
min-residency-us = <6500>;
|
|
||||||
wakeup-latency-us = <2300>;
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
===========================================
|
|
||||||
5 - References
|
|
||||||
===========================================
|
|
||||||
|
|
||||||
[1] ARM Linux Kernel documentation - CPUs bindings
|
|
||||||
Documentation/devicetree/bindings/arm/cpus.yaml
|
|
||||||
|
|
||||||
[2] ARM Linux Kernel documentation - PSCI bindings
|
|
||||||
Documentation/devicetree/bindings/arm/psci.yaml
|
|
||||||
|
|
||||||
[3] ARM Server Base System Architecture (SBSA)
|
|
||||||
http://infocenter.arm.com/help/index.jsp
|
|
||||||
|
|
||||||
[4] ARM Architecture Reference Manuals
|
|
||||||
http://infocenter.arm.com/help/index.jsp
|
|
||||||
|
|
||||||
[5] Devicetree Specification
|
|
||||||
https://www.devicetree.org/specifications/
|
|
||||||
|
|
||||||
[6] ARM Linux Kernel documentation - Booting AArch64 Linux
|
|
||||||
Documentation/arm64/booting.rst
|
|
661
Documentation/devicetree/bindings/arm/idle-states.yaml
Normal file
661
Documentation/devicetree/bindings/arm/idle-states.yaml
Normal file
@@ -0,0 +1,661 @@
|
|||||||
|
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
|
||||||
|
%YAML 1.2
|
||||||
|
---
|
||||||
|
$id: http://devicetree.org/schemas/arm/idle-states.yaml#
|
||||||
|
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||||
|
|
||||||
|
title: ARM idle states binding description
|
||||||
|
|
||||||
|
maintainers:
|
||||||
|
- Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
|
||||||
|
|
||||||
|
description: |+
|
||||||
|
==========================================
|
||||||
|
1 - Introduction
|
||||||
|
==========================================
|
||||||
|
|
||||||
|
ARM systems contain HW capable of managing power consumption dynamically,
|
||||||
|
where cores can be put in different low-power states (ranging from simple wfi
|
||||||
|
to power gating) according to OS PM policies. The CPU states representing the
|
||||||
|
range of dynamic idle states that a processor can enter at run-time, can be
|
||||||
|
specified through device tree bindings representing the parameters required to
|
||||||
|
enter/exit specific idle states on a given processor.
|
||||||
|
|
||||||
|
According to the Server Base System Architecture document (SBSA, [3]), the
|
||||||
|
power states an ARM CPU can be put into are identified by the following list:
|
||||||
|
|
||||||
|
- Running
|
||||||
|
- Idle_standby
|
||||||
|
- Idle_retention
|
||||||
|
- Sleep
|
||||||
|
- Off
|
||||||
|
|
||||||
|
The power states described in the SBSA document define the basic CPU states on
|
||||||
|
top of which ARM platforms implement power management schemes that allow an OS
|
||||||
|
PM implementation to put the processor in different idle states (which include
|
||||||
|
states listed above; "off" state is not an idle state since it does not have
|
||||||
|
wake-up capabilities, hence it is not considered in this document).
|
||||||
|
|
||||||
|
Idle state parameters (e.g. entry latency) are platform specific and need to
|
||||||
|
be characterized with bindings that provide the required information to OS PM
|
||||||
|
code so that it can build the required tables and use them at runtime.
|
||||||
|
|
||||||
|
The device tree binding definition for ARM idle states is the subject of this
|
||||||
|
document.
|
||||||
|
|
||||||
|
===========================================
|
||||||
|
2 - idle-states definitions
|
||||||
|
===========================================
|
||||||
|
|
||||||
|
Idle states are characterized for a specific system through a set of
|
||||||
|
timing and energy related properties, that underlie the HW behaviour
|
||||||
|
triggered upon idle states entry and exit.
|
||||||
|
|
||||||
|
The following diagram depicts the CPU execution phases and related timing
|
||||||
|
properties required to enter and exit an idle state:
|
||||||
|
|
||||||
|
..__[EXEC]__|__[PREP]__|__[ENTRY]__|__[IDLE]__|__[EXIT]__|__[EXEC]__..
|
||||||
|
| | | | |
|
||||||
|
|
||||||
|
|<------ entry ------->|
|
||||||
|
| latency |
|
||||||
|
|<- exit ->|
|
||||||
|
| latency |
|
||||||
|
|<-------- min-residency -------->|
|
||||||
|
|<------- wakeup-latency ------->|
|
||||||
|
|
||||||
|
Diagram 1: CPU idle state execution phases
|
||||||
|
|
||||||
|
EXEC: Normal CPU execution.
|
||||||
|
|
||||||
|
PREP: Preparation phase before committing the hardware to idle mode
|
||||||
|
like cache flushing. This is abortable on pending wake-up
|
||||||
|
event conditions. The abort latency is assumed to be negligible
|
||||||
|
(i.e. less than the ENTRY + EXIT duration). If aborted, CPU
|
||||||
|
goes back to EXEC. This phase is optional. If not abortable,
|
||||||
|
this should be included in the ENTRY phase instead.
|
||||||
|
|
||||||
|
ENTRY: The hardware is committed to idle mode. This period must run
|
||||||
|
to completion up to IDLE before anything else can happen.
|
||||||
|
|
||||||
|
IDLE: This is the actual energy-saving idle period. This may last
|
||||||
|
between 0 and infinite time, until a wake-up event occurs.
|
||||||
|
|
||||||
|
EXIT: Period during which the CPU is brought back to operational
|
||||||
|
mode (EXEC).
|
||||||
|
|
||||||
|
entry-latency: Worst case latency required to enter the idle state. The
|
||||||
|
exit-latency may be guaranteed only after entry-latency has passed.
|
||||||
|
|
||||||
|
min-residency: Minimum period, including preparation and entry, for a given
|
||||||
|
idle state to be worthwhile energywise.
|
||||||
|
|
||||||
|
wakeup-latency: Maximum delay between the signaling of a wake-up event and the
|
||||||
|
CPU being able to execute normal code again. If not specified, this is assumed
|
||||||
|
to be entry-latency + exit-latency.
|
||||||
|
|
||||||
|
These timing parameters can be used by an OS in different circumstances.
|
||||||
|
|
||||||
|
An idle CPU requires the expected min-residency time to select the most
|
||||||
|
appropriate idle state based on the expected expiry time of the next IRQ
|
||||||
|
(i.e. wake-up) that causes the CPU to return to the EXEC phase.
|
||||||
|
|
||||||
|
An operating system scheduler may need to compute the shortest wake-up delay
|
||||||
|
for CPUs in the system by detecting how long it will take to get a CPU out
|
||||||
|
of an idle state, e.g.:
|
||||||
|
|
||||||
|
wakeup-delay = exit-latency + max(entry-latency - (now - entry-timestamp), 0)
|
||||||
|
|
||||||
|
In other words, the scheduler can make its scheduling decision by selecting
|
||||||
|
(e.g. waking-up) the CPU with the shortest wake-up delay.
|
||||||
|
The wake-up delay must take into account the entry latency if that period
|
||||||
|
has not expired. The abortable nature of the PREP period can be ignored
|
||||||
|
if it cannot be relied upon (e.g. the PREP deadline may occur much sooner than
|
||||||
|
the worst case since it depends on the CPU operating conditions, i.e. caches
|
||||||
|
state).
|
||||||
|
|
||||||
|
An OS has to reliably probe the wakeup-latency since some devices can enforce
|
||||||
|
latency constraint guarantees to work properly, so the OS has to detect the
|
||||||
|
worst case wake-up latency it can incur if a CPU is allowed to enter an
|
||||||
|
idle state, and possibly to prevent that to guarantee reliable device
|
||||||
|
functioning.
|
||||||
|
|
||||||
|
The min-residency time parameter deserves further explanation since it is
|
||||||
|
expressed in time units but must factor in energy consumption coefficients.
|
||||||
|
|
||||||
|
The energy consumption of a cpu when it enters a power state can be roughly
|
||||||
|
characterised by the following graph:
|
||||||
|
|
||||||
|
|
|
||||||
|
|
|
||||||
|
|
|
||||||
|
e |
|
||||||
|
n | /---
|
||||||
|
e | /------
|
||||||
|
r | /------
|
||||||
|
g | /-----
|
||||||
|
y | /------
|
||||||
|
| ----
|
||||||
|
| /|
|
||||||
|
| / |
|
||||||
|
| / |
|
||||||
|
| / |
|
||||||
|
| / |
|
||||||
|
| / |
|
||||||
|
|/ |
|
||||||
|
-----|-------+----------------------------------
|
||||||
|
0| 1 time(ms)
|
||||||
|
|
||||||
|
Graph 1: Energy vs time example
|
||||||
|
|
||||||
|
The graph is split in two parts delimited by time 1ms on the X-axis.
|
||||||
|
The graph curve with X-axis values = { x | 0 < x < 1ms } has a steep slope
|
||||||
|
and denotes the energy costs incurred while entering and leaving the idle
|
||||||
|
state.
|
||||||
|
The graph curve in the area delimited by X-axis values = {x | x > 1ms } has
|
||||||
|
shallower slope and essentially represents the energy consumption of the idle
|
||||||
|
state.
|
||||||
|
|
||||||
|
min-residency is defined for a given idle state as the minimum expected
|
||||||
|
residency time for a state (inclusive of preparation and entry) after
|
||||||
|
which choosing that state becomes the most energy efficient option. A good
|
||||||
|
way to visualise this, is by taking the same graph above and comparing some
|
||||||
|
states' energy consumption plots.
|
||||||
|
|
||||||
|
For the sake of simplicity, let's consider a system with two idle states IDLE1,
|
||||||
|
and IDLE2:
|
||||||
|
|
||||||
|
|
|
||||||
|
|
|
||||||
|
|
|
||||||
|
| /-- IDLE1
|
||||||
|
e | /---
|
||||||
|
n | /----
|
||||||
|
e | /---
|
||||||
|
r | /-----/--------- IDLE2
|
||||||
|
g | /-------/---------
|
||||||
|
y | ------------ /---|
|
||||||
|
| / /---- |
|
||||||
|
| / /--- |
|
||||||
|
| / /---- |
|
||||||
|
| / /--- |
|
||||||
|
| --- |
|
||||||
|
| / |
|
||||||
|
| / |
|
||||||
|
|/ | time
|
||||||
|
---/----------------------------+------------------------
|
||||||
|
|IDLE1-energy < IDLE2-energy | IDLE2-energy < IDLE1-energy
|
||||||
|
|
|
||||||
|
IDLE2-min-residency
|
||||||
|
|
||||||
|
Graph 2: idle states min-residency example
|
||||||
|
|
||||||
|
In graph 2 above, that takes into account idle states entry/exit energy
|
||||||
|
costs, it is clear that if the idle state residency time (i.e. time till next
|
||||||
|
wake-up IRQ) is less than IDLE2-min-residency, IDLE1 is the better idle state
|
||||||
|
choice energywise.
|
||||||
|
|
||||||
|
This is mainly down to the fact that IDLE1 entry/exit energy costs are lower
|
||||||
|
than IDLE2.
|
||||||
|
|
||||||
|
However, the lower power consumption (i.e. shallower energy curve slope) of
|
||||||
|
idle state IDLE2 implies that after a suitable time, IDLE2 becomes more energy
|
||||||
|
efficient.
|
||||||
|
|
||||||
|
The time at which IDLE2 becomes more energy efficient than IDLE1 (and other
|
||||||
|
shallower states in a system with multiple idle states) is defined as
|
||||||
|
IDLE2-min-residency and corresponds to the time when energy consumption of
|
||||||
|
IDLE1 and IDLE2 states breaks even.
|
||||||
|
|
||||||
|
The definitions provided in this section underpin the idle states
|
||||||
|
properties specification that is the subject of the following sections.
|
||||||
|
|
||||||
|
===========================================
|
||||||
|
3 - idle-states node
|
||||||
|
===========================================
|
||||||
|
|
||||||
|
ARM processor idle states are defined within the idle-states node, which is
|
||||||
|
a direct child of the cpus node [1] and provides a container where the
|
||||||
|
processor idle states, defined as device tree nodes, are listed.
|
||||||
|
|
||||||
|
On ARM systems, it is a container of processor idle states nodes. If the
|
||||||
|
system does not provide CPU power management capabilities, or the processor
|
||||||
|
just supports idle_standby, an idle-states node is not required.
|
||||||
|
|
||||||
|
===========================================
|
||||||
|
4 - References
|
||||||
|
===========================================
|
||||||
|
|
||||||
|
[1] ARM Linux Kernel documentation - CPUs bindings
|
||||||
|
Documentation/devicetree/bindings/arm/cpus.yaml
|
||||||
|
|
||||||
|
[2] ARM Linux Kernel documentation - PSCI bindings
|
||||||
|
Documentation/devicetree/bindings/arm/psci.yaml
|
||||||
|
|
||||||
|
[3] ARM Server Base System Architecture (SBSA)
|
||||||
|
http://infocenter.arm.com/help/index.jsp
|
||||||
|
|
||||||
|
[4] ARM Architecture Reference Manuals
|
||||||
|
http://infocenter.arm.com/help/index.jsp
|
||||||
|
|
||||||
|
[6] ARM Linux Kernel documentation - Booting AArch64 Linux
|
||||||
|
Documentation/arm64/booting.rst
|
||||||
|
|
||||||
|
properties:
|
||||||
|
$nodename:
|
||||||
|
const: idle-states
|
||||||
|
|
||||||
|
entry-method:
|
||||||
|
description: |
|
||||||
|
Usage and definition depend on ARM architecture version.
|
||||||
|
|
||||||
|
On ARM v8 64-bit this property is required.
|
||||||
|
On ARM 32-bit systems this property is optional
|
||||||
|
|
||||||
|
This assumes that the "enable-method" property is set to "psci" in the cpu
|
||||||
|
node[6] that is responsible for setting up CPU idle management in the OS
|
||||||
|
implementation.
|
||||||
|
const: psci
|
||||||
|
|
||||||
|
patternProperties:
|
||||||
|
"^(cpu|cluster)-":
|
||||||
|
type: object
|
||||||
|
description: |
|
||||||
|
Each state node represents an idle state description and must be defined
|
||||||
|
as follows.
|
||||||
|
|
||||||
|
The idle state entered by executing the wfi instruction (idle_standby
|
||||||
|
SBSA,[3][4]) is considered standard on all ARM platforms and therefore
|
||||||
|
must not be listed.
|
||||||
|
|
||||||
|
In addition to the properties listed above, a state node may require
|
||||||
|
additional properties specific to the entry-method defined in the
|
||||||
|
idle-states node. Please refer to the entry-method bindings
|
||||||
|
documentation for properties definitions.
|
||||||
|
|
||||||
|
properties:
|
||||||
|
compatible:
|
||||||
|
const: arm,idle-state
|
||||||
|
|
||||||
|
local-timer-stop:
|
||||||
|
description:
|
||||||
|
If present the CPU local timer control logic is
|
||||||
|
lost on state entry, otherwise it is retained.
|
||||||
|
type: boolean
|
||||||
|
|
||||||
|
entry-latency-us:
|
||||||
|
description:
|
||||||
|
Worst case latency in microseconds required to enter the idle state.
|
||||||
|
|
||||||
|
exit-latency-us:
|
||||||
|
description:
|
||||||
|
Worst case latency in microseconds required to exit the idle state.
|
||||||
|
The exit-latency-us duration may be guaranteed only after
|
||||||
|
entry-latency-us has passed.
|
||||||
|
|
||||||
|
min-residency-us:
|
||||||
|
description:
|
||||||
|
Minimum residency duration in microseconds, inclusive of preparation
|
||||||
|
and entry, for this idle state to be considered worthwhile energy wise
|
||||||
|
(refer to section 2 of this document for a complete description).
|
||||||
|
|
||||||
|
wakeup-latency-us:
|
||||||
|
description: |
|
||||||
|
Maximum delay between the signaling of a wake-up event and the CPU
|
||||||
|
being able to execute normal code again. If omitted, this is assumed
|
||||||
|
to be equal to:
|
||||||
|
|
||||||
|
entry-latency-us + exit-latency-us
|
||||||
|
|
||||||
|
It is important to supply this value on systems where the duration of
|
||||||
|
PREP phase (see diagram 1, section 2) is non-negligible. In such
|
||||||
|
systems entry-latency-us + exit-latency-us will exceed
|
||||||
|
wakeup-latency-us by this duration.
|
||||||
|
|
||||||
|
idle-state-name:
|
||||||
|
$ref: /schemas/types.yaml#definitions/string
|
||||||
|
description:
|
||||||
|
A string used as a descriptive name for the idle state.
|
||||||
|
|
||||||
|
required:
|
||||||
|
- compatible
|
||||||
|
- entry-latency-us
|
||||||
|
- exit-latency-us
|
||||||
|
- min-residency-us
|
||||||
|
|
||||||
|
additionalProperties: false
|
||||||
|
|
||||||
|
examples:
|
||||||
|
- |
|
||||||
|
|
||||||
|
cpus {
|
||||||
|
#size-cells = <0>;
|
||||||
|
#address-cells = <2>;
|
||||||
|
|
||||||
|
cpu@0 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a57";
|
||||||
|
reg = <0x0 0x0>;
|
||||||
|
enable-method = "psci";
|
||||||
|
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||||
|
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@1 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a57";
|
||||||
|
reg = <0x0 0x1>;
|
||||||
|
enable-method = "psci";
|
||||||
|
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||||
|
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@100 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a57";
|
||||||
|
reg = <0x0 0x100>;
|
||||||
|
enable-method = "psci";
|
||||||
|
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||||
|
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@101 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a57";
|
||||||
|
reg = <0x0 0x101>;
|
||||||
|
enable-method = "psci";
|
||||||
|
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||||
|
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@10000 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a57";
|
||||||
|
reg = <0x0 0x10000>;
|
||||||
|
enable-method = "psci";
|
||||||
|
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||||
|
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@10001 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a57";
|
||||||
|
reg = <0x0 0x10001>;
|
||||||
|
enable-method = "psci";
|
||||||
|
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||||
|
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@10100 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a57";
|
||||||
|
reg = <0x0 0x10100>;
|
||||||
|
enable-method = "psci";
|
||||||
|
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||||
|
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@10101 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a57";
|
||||||
|
reg = <0x0 0x10101>;
|
||||||
|
enable-method = "psci";
|
||||||
|
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||||
|
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@100000000 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a53";
|
||||||
|
reg = <0x1 0x0>;
|
||||||
|
enable-method = "psci";
|
||||||
|
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||||
|
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@100000001 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a53";
|
||||||
|
reg = <0x1 0x1>;
|
||||||
|
enable-method = "psci";
|
||||||
|
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||||
|
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@100000100 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a53";
|
||||||
|
reg = <0x1 0x100>;
|
||||||
|
enable-method = "psci";
|
||||||
|
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||||
|
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@100000101 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a53";
|
||||||
|
reg = <0x1 0x101>;
|
||||||
|
enable-method = "psci";
|
||||||
|
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||||
|
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@100010000 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a53";
|
||||||
|
reg = <0x1 0x10000>;
|
||||||
|
enable-method = "psci";
|
||||||
|
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||||
|
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@100010001 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a53";
|
||||||
|
reg = <0x1 0x10001>;
|
||||||
|
enable-method = "psci";
|
||||||
|
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||||
|
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@100010100 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a53";
|
||||||
|
reg = <0x1 0x10100>;
|
||||||
|
enable-method = "psci";
|
||||||
|
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||||
|
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@100010101 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a53";
|
||||||
|
reg = <0x1 0x10101>;
|
||||||
|
enable-method = "psci";
|
||||||
|
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||||
|
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||||
|
};
|
||||||
|
|
||||||
|
idle-states {
|
||||||
|
entry-method = "psci";
|
||||||
|
|
||||||
|
CPU_RETENTION_0_0: cpu-retention-0-0 {
|
||||||
|
compatible = "arm,idle-state";
|
||||||
|
arm,psci-suspend-param = <0x0010000>;
|
||||||
|
entry-latency-us = <20>;
|
||||||
|
exit-latency-us = <40>;
|
||||||
|
min-residency-us = <80>;
|
||||||
|
};
|
||||||
|
|
||||||
|
CLUSTER_RETENTION_0: cluster-retention-0 {
|
||||||
|
compatible = "arm,idle-state";
|
||||||
|
local-timer-stop;
|
||||||
|
arm,psci-suspend-param = <0x1010000>;
|
||||||
|
entry-latency-us = <50>;
|
||||||
|
exit-latency-us = <100>;
|
||||||
|
min-residency-us = <250>;
|
||||||
|
wakeup-latency-us = <130>;
|
||||||
|
};
|
||||||
|
|
||||||
|
CPU_SLEEP_0_0: cpu-sleep-0-0 {
|
||||||
|
compatible = "arm,idle-state";
|
||||||
|
local-timer-stop;
|
||||||
|
arm,psci-suspend-param = <0x0010000>;
|
||||||
|
entry-latency-us = <250>;
|
||||||
|
exit-latency-us = <500>;
|
||||||
|
min-residency-us = <950>;
|
||||||
|
};
|
||||||
|
|
||||||
|
CLUSTER_SLEEP_0: cluster-sleep-0 {
|
||||||
|
compatible = "arm,idle-state";
|
||||||
|
local-timer-stop;
|
||||||
|
arm,psci-suspend-param = <0x1010000>;
|
||||||
|
entry-latency-us = <600>;
|
||||||
|
exit-latency-us = <1100>;
|
||||||
|
min-residency-us = <2700>;
|
||||||
|
wakeup-latency-us = <1500>;
|
||||||
|
};
|
||||||
|
|
||||||
|
CPU_RETENTION_1_0: cpu-retention-1-0 {
|
||||||
|
compatible = "arm,idle-state";
|
||||||
|
arm,psci-suspend-param = <0x0010000>;
|
||||||
|
entry-latency-us = <20>;
|
||||||
|
exit-latency-us = <40>;
|
||||||
|
min-residency-us = <90>;
|
||||||
|
};
|
||||||
|
|
||||||
|
CLUSTER_RETENTION_1: cluster-retention-1 {
|
||||||
|
compatible = "arm,idle-state";
|
||||||
|
local-timer-stop;
|
||||||
|
arm,psci-suspend-param = <0x1010000>;
|
||||||
|
entry-latency-us = <50>;
|
||||||
|
exit-latency-us = <100>;
|
||||||
|
min-residency-us = <270>;
|
||||||
|
wakeup-latency-us = <100>;
|
||||||
|
};
|
||||||
|
|
||||||
|
CPU_SLEEP_1_0: cpu-sleep-1-0 {
|
||||||
|
compatible = "arm,idle-state";
|
||||||
|
local-timer-stop;
|
||||||
|
arm,psci-suspend-param = <0x0010000>;
|
||||||
|
entry-latency-us = <70>;
|
||||||
|
exit-latency-us = <100>;
|
||||||
|
min-residency-us = <300>;
|
||||||
|
wakeup-latency-us = <150>;
|
||||||
|
};
|
||||||
|
|
||||||
|
CLUSTER_SLEEP_1: cluster-sleep-1 {
|
||||||
|
compatible = "arm,idle-state";
|
||||||
|
local-timer-stop;
|
||||||
|
arm,psci-suspend-param = <0x1010000>;
|
||||||
|
entry-latency-us = <500>;
|
||||||
|
exit-latency-us = <1200>;
|
||||||
|
min-residency-us = <3500>;
|
||||||
|
wakeup-latency-us = <1300>;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
- |
|
||||||
|
// Example 2 (ARM 32-bit, 8-cpu system, two clusters):
|
||||||
|
|
||||||
|
cpus {
|
||||||
|
#size-cells = <0>;
|
||||||
|
#address-cells = <1>;
|
||||||
|
|
||||||
|
cpu@0 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a15";
|
||||||
|
reg = <0x0>;
|
||||||
|
cpu-idle-states = <&cpu_sleep_0_0 &cluster_sleep_0>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@1 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a15";
|
||||||
|
reg = <0x1>;
|
||||||
|
cpu-idle-states = <&cpu_sleep_0_0 &cluster_sleep_0>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@2 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a15";
|
||||||
|
reg = <0x2>;
|
||||||
|
cpu-idle-states = <&cpu_sleep_0_0 &cluster_sleep_0>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@3 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a15";
|
||||||
|
reg = <0x3>;
|
||||||
|
cpu-idle-states = <&cpu_sleep_0_0 &cluster_sleep_0>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@100 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a7";
|
||||||
|
reg = <0x100>;
|
||||||
|
cpu-idle-states = <&cpu_sleep_1_0 &cluster_sleep_1>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@101 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a7";
|
||||||
|
reg = <0x101>;
|
||||||
|
cpu-idle-states = <&cpu_sleep_1_0 &cluster_sleep_1>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@102 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a7";
|
||||||
|
reg = <0x102>;
|
||||||
|
cpu-idle-states = <&cpu_sleep_1_0 &cluster_sleep_1>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu@103 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a7";
|
||||||
|
reg = <0x103>;
|
||||||
|
cpu-idle-states = <&cpu_sleep_1_0 &cluster_sleep_1>;
|
||||||
|
};
|
||||||
|
|
||||||
|
idle-states {
|
||||||
|
cpu_sleep_0_0: cpu-sleep-0-0 {
|
||||||
|
compatible = "arm,idle-state";
|
||||||
|
local-timer-stop;
|
||||||
|
entry-latency-us = <200>;
|
||||||
|
exit-latency-us = <100>;
|
||||||
|
min-residency-us = <400>;
|
||||||
|
wakeup-latency-us = <250>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cluster_sleep_0: cluster-sleep-0 {
|
||||||
|
compatible = "arm,idle-state";
|
||||||
|
local-timer-stop;
|
||||||
|
entry-latency-us = <500>;
|
||||||
|
exit-latency-us = <1500>;
|
||||||
|
min-residency-us = <2500>;
|
||||||
|
wakeup-latency-us = <1700>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cpu_sleep_1_0: cpu-sleep-1-0 {
|
||||||
|
compatible = "arm,idle-state";
|
||||||
|
local-timer-stop;
|
||||||
|
entry-latency-us = <300>;
|
||||||
|
exit-latency-us = <500>;
|
||||||
|
min-residency-us = <900>;
|
||||||
|
wakeup-latency-us = <600>;
|
||||||
|
};
|
||||||
|
|
||||||
|
cluster_sleep_1: cluster-sleep-1 {
|
||||||
|
compatible = "arm,idle-state";
|
||||||
|
local-timer-stop;
|
||||||
|
entry-latency-us = <800>;
|
||||||
|
exit-latency-us = <2000>;
|
||||||
|
min-residency-us = <6500>;
|
||||||
|
wakeup-latency-us = <2300>;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
...
|
@@ -47,7 +47,7 @@ examples:
|
|||||||
- |
|
- |
|
||||||
#include <dt-bindings/interrupt-controller/arm-gic.h>
|
#include <dt-bindings/interrupt-controller/arm-gic.h>
|
||||||
|
|
||||||
cache-controller@1100000 {
|
system-cache-controller@1100000 {
|
||||||
compatible = "qcom,sdm845-llcc";
|
compatible = "qcom,sdm845-llcc";
|
||||||
reg = <0x1100000 0x200000>, <0x1300000 0x50000> ;
|
reg = <0x1100000 0x200000>, <0x1300000 0x50000> ;
|
||||||
reg-names = "llcc_base", "llcc_broadcast_base";
|
reg-names = "llcc_base", "llcc_broadcast_base";
|
||||||
|
@@ -102,6 +102,34 @@ properties:
|
|||||||
[1] Kernel documentation - ARM idle states bindings
|
[1] Kernel documentation - ARM idle states bindings
|
||||||
Documentation/devicetree/bindings/arm/idle-states.txt
|
Documentation/devicetree/bindings/arm/idle-states.txt
|
||||||
|
|
||||||
|
"#power-domain-cells":
|
||||||
|
description:
|
||||||
|
The number of cells in a PM domain specifier as per binding in [3].
|
||||||
|
Must be 0 as to represent a single PM domain.
|
||||||
|
|
||||||
|
ARM systems can have multiple cores, sometimes in a hierarchical
|
||||||
|
arrangement. This often, but not always, maps directly to the processor
|
||||||
|
power topology of the system. Individual nodes in a topology have their
|
||||||
|
own specific power states and can be better represented hierarchically.
|
||||||
|
|
||||||
|
For these cases, the definitions of the idle states for the CPUs and the
|
||||||
|
CPU topology, must conform to the binding in [3]. The idle states
|
||||||
|
themselves must conform to the binding in [4] and must specify the
|
||||||
|
arm,psci-suspend-param property.
|
||||||
|
|
||||||
|
It should also be noted that, in PSCI firmware v1.0 the OS-Initiated
|
||||||
|
(OSI) CPU suspend mode is introduced. Using a hierarchical representation
|
||||||
|
helps to implement support for OSI mode and OS implementations may choose
|
||||||
|
to mandate it.
|
||||||
|
|
||||||
|
[3] Documentation/devicetree/bindings/power/power_domain.txt
|
||||||
|
[4] Documentation/devicetree/bindings/power/domain-idle-state.txt
|
||||||
|
|
||||||
|
power-domains:
|
||||||
|
$ref: '/schemas/types.yaml#/definitions/phandle-array'
|
||||||
|
description:
|
||||||
|
List of phandles and PM domain specifiers, as defined by bindings of the
|
||||||
|
PM domain provider.
|
||||||
|
|
||||||
required:
|
required:
|
||||||
- compatible
|
- compatible
|
||||||
@@ -160,4 +188,80 @@ examples:
|
|||||||
cpu_on = <0x95c10002>;
|
cpu_on = <0x95c10002>;
|
||||||
cpu_off = <0x95c10001>;
|
cpu_off = <0x95c10001>;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
- |+
|
||||||
|
|
||||||
|
// Case 4: CPUs and CPU idle states described using the hierarchical model.
|
||||||
|
|
||||||
|
cpus {
|
||||||
|
#size-cells = <0>;
|
||||||
|
#address-cells = <1>;
|
||||||
|
|
||||||
|
CPU0: cpu@0 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a53", "arm,armv8";
|
||||||
|
reg = <0x0>;
|
||||||
|
enable-method = "psci";
|
||||||
|
power-domains = <&CPU_PD0>;
|
||||||
|
power-domain-names = "psci";
|
||||||
|
};
|
||||||
|
|
||||||
|
CPU1: cpu@1 {
|
||||||
|
device_type = "cpu";
|
||||||
|
compatible = "arm,cortex-a57", "arm,armv8";
|
||||||
|
reg = <0x100>;
|
||||||
|
enable-method = "psci";
|
||||||
|
power-domains = <&CPU_PD1>;
|
||||||
|
power-domain-names = "psci";
|
||||||
|
};
|
||||||
|
|
||||||
|
idle-states {
|
||||||
|
|
||||||
|
CPU_PWRDN: cpu-power-down {
|
||||||
|
compatible = "arm,idle-state";
|
||||||
|
arm,psci-suspend-param = <0x0000001>;
|
||||||
|
entry-latency-us = <10>;
|
||||||
|
exit-latency-us = <10>;
|
||||||
|
min-residency-us = <100>;
|
||||||
|
};
|
||||||
|
|
||||||
|
CLUSTER_RET: cluster-retention {
|
||||||
|
compatible = "domain-idle-state";
|
||||||
|
arm,psci-suspend-param = <0x1000011>;
|
||||||
|
entry-latency-us = <500>;
|
||||||
|
exit-latency-us = <500>;
|
||||||
|
min-residency-us = <2000>;
|
||||||
|
};
|
||||||
|
|
||||||
|
CLUSTER_PWRDN: cluster-power-down {
|
||||||
|
compatible = "domain-idle-state";
|
||||||
|
arm,psci-suspend-param = <0x1000031>;
|
||||||
|
entry-latency-us = <2000>;
|
||||||
|
exit-latency-us = <2000>;
|
||||||
|
min-residency-us = <6000>;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
psci {
|
||||||
|
compatible = "arm,psci-1.0";
|
||||||
|
method = "smc";
|
||||||
|
|
||||||
|
CPU_PD0: cpu-pd0 {
|
||||||
|
#power-domain-cells = <0>;
|
||||||
|
domain-idle-states = <&CPU_PWRDN>;
|
||||||
|
power-domains = <&CLUSTER_PD>;
|
||||||
|
};
|
||||||
|
|
||||||
|
CPU_PD1: cpu-pd1 {
|
||||||
|
#power-domain-cells = <0>;
|
||||||
|
domain-idle-states = <&CPU_PWRDN>;
|
||||||
|
power-domains = <&CLUSTER_PD>;
|
||||||
|
};
|
||||||
|
|
||||||
|
CLUSTER_PD: cluster-pd {
|
||||||
|
#power-domain-cells = <0>;
|
||||||
|
domain-idle-states = <&CLUSTER_RET>, <&CLUSTER_PWRDN>;
|
||||||
|
};
|
||||||
|
};
|
||||||
...
|
...
|
||||||
|
@@ -1,7 +1,7 @@
|
|||||||
# SPDX-License-Identifier: GPL-2.0
|
# SPDX-License-Identifier: GPL-2.0
|
||||||
%YAML 1.2
|
%YAML 1.2
|
||||||
---
|
---
|
||||||
$id: http://devicetree.org/schemas/bindings/arm/qcom.yaml#
|
$id: http://devicetree.org/schemas/arm/qcom.yaml#
|
||||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||||
|
|
||||||
title: QCOM device tree bindings
|
title: QCOM device tree bindings
|
||||||
@@ -24,28 +24,30 @@ description: |
|
|||||||
|
|
||||||
The 'SoC' element must be one of the following strings:
|
The 'SoC' element must be one of the following strings:
|
||||||
|
|
||||||
apq8016
|
apq8016
|
||||||
apq8074
|
apq8074
|
||||||
apq8084
|
apq8084
|
||||||
apq8096
|
apq8096
|
||||||
msm8916
|
ipq8074
|
||||||
msm8974
|
mdm9615
|
||||||
msm8992
|
msm8916
|
||||||
msm8994
|
msm8974
|
||||||
msm8996
|
msm8992
|
||||||
mdm9615
|
msm8994
|
||||||
ipq8074
|
msm8996
|
||||||
sdm845
|
sc7180
|
||||||
|
sdm845
|
||||||
|
|
||||||
The 'board' element must be one of the following strings:
|
The 'board' element must be one of the following strings:
|
||||||
|
|
||||||
cdp
|
cdp
|
||||||
liquid
|
dragonboard
|
||||||
dragonboard
|
hk01
|
||||||
mtp
|
idp
|
||||||
sbc
|
liquid
|
||||||
hk01
|
mtp
|
||||||
qrd
|
qrd
|
||||||
|
sbc
|
||||||
|
|
||||||
The 'soc_version' and 'board_version' elements take the form of v<Major>.<Minor>
|
The 'soc_version' and 'board_version' elements take the form of v<Major>.<Minor>
|
||||||
where the minor number may be omitted when it's zero, i.e. v1.0 is the same
|
where the minor number may be omitted when it's zero, i.e. v1.0 is the same
|
||||||
@@ -144,4 +146,8 @@ properties:
|
|||||||
- qcom,ipq8074-hk01
|
- qcom,ipq8074-hk01
|
||||||
- const: qcom,ipq8074
|
- const: qcom,ipq8074
|
||||||
|
|
||||||
|
- items:
|
||||||
|
- enum:
|
||||||
|
- qcom,sc7180-idp
|
||||||
|
- const: qcom,sc7180
|
||||||
...
|
...
|
||||||
|
@@ -409,6 +409,9 @@ properties:
|
|||||||
|
|
||||||
- description: Pine64 RockPro64
|
- description: Pine64 RockPro64
|
||||||
items:
|
items:
|
||||||
|
- enum:
|
||||||
|
- pine64,rockpro64-v2.1
|
||||||
|
- pine64,rockpro64-v2.0
|
||||||
- const: pine64,rockpro64
|
- const: pine64,rockpro64
|
||||||
- const: rockchip,rk3399
|
- const: rockchip,rk3399
|
||||||
|
|
||||||
@@ -422,6 +425,12 @@ properties:
|
|||||||
- const: radxa,rockpi4
|
- const: radxa,rockpi4
|
||||||
- const: rockchip,rk3399
|
- const: rockchip,rk3399
|
||||||
|
|
||||||
|
- description: Radxa ROCK Pi N10
|
||||||
|
items:
|
||||||
|
- const: radxa,rockpi-n10
|
||||||
|
- const: vamrs,rk3399pro-vmarc-som
|
||||||
|
- const: rockchip,rk3399pro
|
||||||
|
|
||||||
- description: Radxa Rock2 Square
|
- description: Radxa Rock2 Square
|
||||||
items:
|
items:
|
||||||
- const: radxa,rock2-square
|
- const: radxa,rock2-square
|
||||||
|
@@ -2,7 +2,7 @@
|
|||||||
# Copyright 2019 Unisoc Inc.
|
# Copyright 2019 Unisoc Inc.
|
||||||
%YAML 1.2
|
%YAML 1.2
|
||||||
---
|
---
|
||||||
$id: http://devicetree.org/schemas/arm/sprd.yaml#
|
$id: http://devicetree.org/schemas/arm/sprd/sprd.yaml#
|
||||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||||
|
|
||||||
title: Unisoc platforms device tree bindings
|
title: Unisoc platforms device tree bindings
|
@@ -1,37 +0,0 @@
|
|||||||
ML-AHB interconnect bindings
|
|
||||||
|
|
||||||
These bindings describe the STM32 SoCs ML-AHB interconnect bus which connects
|
|
||||||
a Cortex-M subsystem with dedicated memories.
|
|
||||||
The MCU SRAM and RETRAM memory parts can be accessed through different addresses
|
|
||||||
(see "RAM aliases" in [1]) using different buses (see [2]) : balancing the
|
|
||||||
Cortex-M firmware accesses among those ports allows to tune the system
|
|
||||||
performance.
|
|
||||||
|
|
||||||
[1]: https://www.st.com/resource/en/reference_manual/dm00327659.pdf
|
|
||||||
[2]: https://wiki.st.com/stm32mpu/wiki/STM32MP15_RAM_mapping
|
|
||||||
|
|
||||||
Required properties:
|
|
||||||
- compatible: should be "simple-bus"
|
|
||||||
- dma-ranges: describes memory addresses translation between the local CPU and
|
|
||||||
the remote Cortex-M processor. Each memory region, is declared with
|
|
||||||
3 parameters:
|
|
||||||
- param 1: device base address (Cortex-M processor address)
|
|
||||||
- param 2: physical base address (local CPU address)
|
|
||||||
- param 3: size of the memory region.
|
|
||||||
|
|
||||||
The Cortex-M remote processor accessed via the mlahb interconnect is described
|
|
||||||
by a child node.
|
|
||||||
|
|
||||||
Example:
|
|
||||||
mlahb {
|
|
||||||
compatible = "simple-bus";
|
|
||||||
#address-cells = <1>;
|
|
||||||
#size-cells = <1>;
|
|
||||||
dma-ranges = <0x00000000 0x38000000 0x10000>,
|
|
||||||
<0x10000000 0x10000000 0x60000>,
|
|
||||||
<0x30000000 0x30000000 0x60000>;
|
|
||||||
|
|
||||||
m4_rproc: m4@10000000 {
|
|
||||||
...
|
|
||||||
};
|
|
||||||
};
|
|
70
Documentation/devicetree/bindings/arm/stm32/st,mlahb.yaml
Normal file
70
Documentation/devicetree/bindings/arm/stm32/st,mlahb.yaml
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
|
||||||
|
%YAML 1.2
|
||||||
|
---
|
||||||
|
$id: "http://devicetree.org/schemas/arm/stm32/st,mlahb.yaml#"
|
||||||
|
$schema: "http://devicetree.org/meta-schemas/core.yaml#"
|
||||||
|
|
||||||
|
title: STMicroelectronics STM32 ML-AHB interconnect bindings
|
||||||
|
|
||||||
|
maintainers:
|
||||||
|
- Fabien Dessenne <fabien.dessenne@st.com>
|
||||||
|
- Arnaud Pouliquen <arnaud.pouliquen@st.com>
|
||||||
|
|
||||||
|
description: |
|
||||||
|
These bindings describe the STM32 SoCs ML-AHB interconnect bus which connects
|
||||||
|
a Cortex-M subsystem with dedicated memories. The MCU SRAM and RETRAM memory
|
||||||
|
parts can be accessed through different addresses (see "RAM aliases" in [1])
|
||||||
|
using different buses (see [2]): balancing the Cortex-M firmware accesses
|
||||||
|
among those ports allows tuning of the system performance.
|
||||||
|
[1]: https://www.st.com/resource/en/reference_manual/dm00327659.pdf
|
||||||
|
[2]: https://wiki.st.com/stm32mpu/wiki/STM32MP15_RAM_mapping
|
||||||
|
|
||||||
|
allOf:
|
||||||
|
- $ref: /schemas/simple-bus.yaml#
|
||||||
|
|
||||||
|
properties:
|
||||||
|
compatible:
|
||||||
|
contains:
|
||||||
|
enum:
|
||||||
|
- st,mlahb
|
||||||
|
|
||||||
|
dma-ranges:
|
||||||
|
description: |
|
||||||
|
Describe memory addresses translation between the local CPU and the
|
||||||
|
remote Cortex-M processor. Each memory region is declared with
|
||||||
|
3 parameters:
|
||||||
|
- param 1: device base address (Cortex-M processor address)
|
||||||
|
- param 2: physical base address (local CPU address)
|
||||||
|
- param 3: size of the memory region.
|
||||||
|
maxItems: 3
|
||||||
|
|
||||||
|
'#address-cells':
|
||||||
|
const: 1
|
||||||
|
|
||||||
|
'#size-cells':
|
||||||
|
const: 1
|
||||||
|
|
||||||
|
required:
|
||||||
|
- compatible
|
||||||
|
- '#address-cells'
|
||||||
|
- '#size-cells'
|
||||||
|
- dma-ranges
|
||||||
|
|
||||||
|
examples:
|
||||||
|
- |
|
||||||
|
mlahb: ahb {
|
||||||
|
compatible = "st,mlahb", "simple-bus";
|
||||||
|
#address-cells = <1>;
|
||||||
|
#size-cells = <1>;
|
||||||
|
reg = <0x10000000 0x40000>;
|
||||||
|
ranges;
|
||||||
|
dma-ranges = <0x00000000 0x38000000 0x10000>,
|
||||||
|
<0x10000000 0x10000000 0x60000>,
|
||||||
|
<0x30000000 0x30000000 0x60000>;
|
||||||
|
|
||||||
|
m4_rproc: m4@10000000 {
|
||||||
|
reg = <0x10000000 0x40000>;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
...
|
@@ -0,0 +1,41 @@
|
|||||||
|
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
|
||||||
|
%YAML 1.2
|
||||||
|
---
|
||||||
|
$id: "http://devicetree.org/schemas/arm/stm32/st,stm32-syscon.yaml#"
|
||||||
|
$schema: "http://devicetree.org/meta-schemas/core.yaml#"
|
||||||
|
|
||||||
|
title: STMicroelectronics STM32 Platforms System Controller bindings
|
||||||
|
|
||||||
|
maintainers:
|
||||||
|
- Alexandre Torgue <alexandre.torgue@st.com>
|
||||||
|
- Christophe Roullier <christophe.roullier@st.com>
|
||||||
|
|
||||||
|
properties:
|
||||||
|
compatible:
|
||||||
|
oneOf:
|
||||||
|
- items:
|
||||||
|
- enum:
|
||||||
|
- st,stm32mp157-syscfg
|
||||||
|
- const: syscon
|
||||||
|
|
||||||
|
reg:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
clocks:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
required:
|
||||||
|
- compatible
|
||||||
|
- reg
|
||||||
|
- clocks
|
||||||
|
|
||||||
|
examples:
|
||||||
|
- |
|
||||||
|
#include <dt-bindings/clock/stm32mp1-clks.h>
|
||||||
|
syscfg: syscon@50020000 {
|
||||||
|
compatible = "st,stm32mp157-syscfg", "syscon";
|
||||||
|
reg = <0x50020000 0x400>;
|
||||||
|
clocks = <&rcc SYSCFG>;
|
||||||
|
};
|
||||||
|
|
||||||
|
...
|
@@ -1,16 +0,0 @@
|
|||||||
STMicroelectronics STM32 Platforms System Controller
|
|
||||||
|
|
||||||
Properties:
|
|
||||||
- compatible : should contain two values. First value must be :
|
|
||||||
- " st,stm32mp157-syscfg " - for stm32mp157 based SoCs,
|
|
||||||
second value must be always "syscon".
|
|
||||||
- reg : offset and length of the register set.
|
|
||||||
- clocks: phandle to the syscfg clock
|
|
||||||
|
|
||||||
Example:
|
|
||||||
syscfg: syscon@50020000 {
|
|
||||||
compatible = "st,stm32mp157-syscfg", "syscon";
|
|
||||||
reg = <0x50020000 0x400>;
|
|
||||||
clocks = <&rcc SYSCFG>;
|
|
||||||
};
|
|
||||||
|
|
@@ -342,6 +342,16 @@ properties:
|
|||||||
- const: libretech,all-h3-cc-h5
|
- const: libretech,all-h3-cc-h5
|
||||||
- const: allwinner,sun50i-h5
|
- const: allwinner,sun50i-h5
|
||||||
|
|
||||||
|
- description: Libre Computer Board ALL-H3-IT H5
|
||||||
|
items:
|
||||||
|
- const: libretech,all-h3-it-h5
|
||||||
|
- const: allwinner,sun50i-h5
|
||||||
|
|
||||||
|
- description: Libre Computer Board ALL-H5-CC H5
|
||||||
|
items:
|
||||||
|
- const: libretech,all-h5-cc-h5
|
||||||
|
- const: allwinner,sun50i-h5
|
||||||
|
|
||||||
- description: Lichee Pi One
|
- description: Lichee Pi One
|
||||||
items:
|
items:
|
||||||
- const: licheepi,licheepi-one
|
- const: licheepi,licheepi-one
|
||||||
@@ -470,6 +480,12 @@ properties:
|
|||||||
- const: emlid,neutis-n5
|
- const: emlid,neutis-n5
|
||||||
- const: allwinner,sun50i-h5
|
- const: allwinner,sun50i-h5
|
||||||
|
|
||||||
|
- description: Emlid Neutis N5H3 Developer Board
|
||||||
|
items:
|
||||||
|
- const: emlid,neutis-n5h3-devboard
|
||||||
|
- const: emlid,neutis-n5h3
|
||||||
|
- const: allwinner,sun8i-h3
|
||||||
|
|
||||||
- description: NextThing Co. CHIP
|
- description: NextThing Co. CHIP
|
||||||
items:
|
items:
|
||||||
- const: nextthing,chip
|
- const: nextthing,chip
|
||||||
@@ -599,11 +615,16 @@ properties:
|
|||||||
- const: pine64,pine64-plus
|
- const: pine64,pine64-plus
|
||||||
- const: allwinner,sun50i-a64
|
- const: allwinner,sun50i-a64
|
||||||
|
|
||||||
- description: Pine64 PineH64
|
- description: Pine64 PineH64 model A
|
||||||
items:
|
items:
|
||||||
- const: pine64,pine-h64
|
- const: pine64,pine-h64
|
||||||
- const: allwinner,sun50i-h6
|
- const: allwinner,sun50i-h6
|
||||||
|
|
||||||
|
- description: Pine64 PineH64 model B
|
||||||
|
items:
|
||||||
|
- const: pine64,pine-h64-model-b
|
||||||
|
- const: allwinner,sun50i-h6
|
||||||
|
|
||||||
- description: Pine64 LTS
|
- description: Pine64 LTS
|
||||||
items:
|
items:
|
||||||
- const: pine64,pine64-lts
|
- const: pine64,pine64-lts
|
||||||
|
@@ -0,0 +1,65 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-2.0
|
||||||
|
%YAML 1.2
|
||||||
|
---
|
||||||
|
$id: http://devicetree.org/schemas/arm/sunxi/allwinner,sun4i-a10-mbus.yaml#
|
||||||
|
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||||
|
|
||||||
|
title: Allwinner Memory Bus (MBUS) controller
|
||||||
|
|
||||||
|
maintainers:
|
||||||
|
- Chen-Yu Tsai <wens@csie.org>
|
||||||
|
- Maxime Ripard <mripard@kernel.org>
|
||||||
|
|
||||||
|
description: |
|
||||||
|
The MBUS controller drives the MBUS that other devices in the SoC
|
||||||
|
will use to perform DMA. It also has a register interface that
|
||||||
|
allows to monitor and control the bandwidth and priorities for
|
||||||
|
masters on that bus.
|
||||||
|
|
||||||
|
Each device having to perform their DMA through the MBUS must have
|
||||||
|
the interconnects and interconnect-names properties set to the MBUS
|
||||||
|
controller and with "dma-mem" as the interconnect name.
|
||||||
|
|
||||||
|
properties:
|
||||||
|
"#interconnect-cells":
|
||||||
|
const: 1
|
||||||
|
description:
|
||||||
|
The content of the cell is the MBUS ID.
|
||||||
|
|
||||||
|
compatible:
|
||||||
|
enum:
|
||||||
|
- allwinner,sun5i-a13-mbus
|
||||||
|
- allwinner,sun8i-h3-mbus
|
||||||
|
|
||||||
|
reg:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
clocks:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
dma-ranges:
|
||||||
|
description:
|
||||||
|
See section 2.3.9 of the DeviceTree Specification.
|
||||||
|
|
||||||
|
required:
|
||||||
|
- "#interconnect-cells"
|
||||||
|
- compatible
|
||||||
|
- reg
|
||||||
|
- clocks
|
||||||
|
- dma-ranges
|
||||||
|
|
||||||
|
additionalProperties: false
|
||||||
|
|
||||||
|
examples:
|
||||||
|
- |
|
||||||
|
#include <dt-bindings/clock/sun5i-ccu.h>
|
||||||
|
|
||||||
|
mbus: dram-controller@1c01000 {
|
||||||
|
compatible = "allwinner,sun5i-a13-mbus";
|
||||||
|
reg = <0x01c01000 0x1000>;
|
||||||
|
clocks = <&ccu CLK_MBUS>;
|
||||||
|
dma-ranges = <0x00000000 0x40000000 0x20000000>;
|
||||||
|
#interconnect-cells = <1>;
|
||||||
|
};
|
||||||
|
|
||||||
|
...
|
@@ -1,37 +0,0 @@
|
|||||||
Allwinner Memory Bus (MBUS) controller
|
|
||||||
|
|
||||||
The MBUS controller drives the MBUS that other devices in the SoC will
|
|
||||||
use to perform DMA. It also has a register interface that allows to
|
|
||||||
monitor and control the bandwidth and priorities for masters on that
|
|
||||||
bus.
|
|
||||||
|
|
||||||
Required properties:
|
|
||||||
- compatible: Must be one of:
|
|
||||||
- allwinner,sun5i-a13-mbus
|
|
||||||
- allwinner,sun8i-h3-mbus
|
|
||||||
- reg: Offset and length of the register set for the controller
|
|
||||||
- clocks: phandle to the clock driving the controller
|
|
||||||
- dma-ranges: See section 2.3.9 of the DeviceTree Specification
|
|
||||||
- #interconnect-cells: Must be one, with the argument being the MBUS
|
|
||||||
port ID
|
|
||||||
|
|
||||||
Each device having to perform their DMA through the MBUS must have the
|
|
||||||
interconnects and interconnect-names properties set to the MBUS
|
|
||||||
controller and with "dma-mem" as the interconnect name.
|
|
||||||
|
|
||||||
Example:
|
|
||||||
|
|
||||||
mbus: dram-controller@1c01000 {
|
|
||||||
compatible = "allwinner,sun5i-a13-mbus";
|
|
||||||
reg = <0x01c01000 0x1000>;
|
|
||||||
clocks = <&ccu CLK_MBUS>;
|
|
||||||
dma-ranges = <0x00000000 0x40000000 0x20000000>;
|
|
||||||
#interconnect-cells = <1>;
|
|
||||||
};
|
|
||||||
|
|
||||||
fe0: display-frontend@1e00000 {
|
|
||||||
compatible = "allwinner,sun5i-a13-display-frontend";
|
|
||||||
...
|
|
||||||
interconnects = <&mbus 19>;
|
|
||||||
interconnect-names = "dma-mem";
|
|
||||||
};
|
|
36
Documentation/devicetree/bindings/arm/ux500.yaml
Normal file
36
Documentation/devicetree/bindings/arm/ux500.yaml
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-2.0-only
|
||||||
|
%YAML 1.2
|
||||||
|
---
|
||||||
|
$id: http://devicetree.org/schemas/arm/ux500.yaml#
|
||||||
|
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||||
|
|
||||||
|
title: Ux500 platforms device tree bindings
|
||||||
|
|
||||||
|
maintainers:
|
||||||
|
- Linus Walleij <linus.walleij@linaro.org>
|
||||||
|
|
||||||
|
properties:
|
||||||
|
$nodename:
|
||||||
|
const: '/'
|
||||||
|
compatible:
|
||||||
|
oneOf:
|
||||||
|
|
||||||
|
- description: ST-Ericsson HREF (pre-v60)
|
||||||
|
items:
|
||||||
|
- const: st-ericsson,mop500
|
||||||
|
- const: st-ericsson,u8500
|
||||||
|
|
||||||
|
- description: ST-Ericsson HREF (v60+)
|
||||||
|
items:
|
||||||
|
- const: st-ericsson,hrefv60+
|
||||||
|
- const: st-ericsson,u8500
|
||||||
|
|
||||||
|
- description: Calao Systems Snowball
|
||||||
|
items:
|
||||||
|
- const: calaosystems,snowball-a9500
|
||||||
|
- const: st-ericsson,u9500
|
||||||
|
|
||||||
|
- description: Samsung Galaxy S III mini (GT-I8190)
|
||||||
|
items:
|
||||||
|
- const: samsung,golden
|
||||||
|
- const: st-ericsson,u8500
|
@@ -9,8 +9,6 @@ PHYs.
|
|||||||
|
|
||||||
Required properties:
|
Required properties:
|
||||||
- compatible : compatible string, one of:
|
- compatible : compatible string, one of:
|
||||||
- "allwinner,sun4i-a10-ahci"
|
|
||||||
- "allwinner,sun8i-r40-ahci"
|
|
||||||
- "brcm,iproc-ahci"
|
- "brcm,iproc-ahci"
|
||||||
- "hisilicon,hisi-ahci"
|
- "hisilicon,hisi-ahci"
|
||||||
- "cavium,octeon-7130-ahci"
|
- "cavium,octeon-7130-ahci"
|
||||||
@@ -45,8 +43,6 @@ Required properties when using sub-nodes:
|
|||||||
- #address-cells : number of cells to encode an address
|
- #address-cells : number of cells to encode an address
|
||||||
- #size-cells : number of cells representing the size of an address
|
- #size-cells : number of cells representing the size of an address
|
||||||
|
|
||||||
For allwinner,sun8i-r40-ahci, the reset property must be present.
|
|
||||||
|
|
||||||
Sub-nodes required properties:
|
Sub-nodes required properties:
|
||||||
- reg : the port number
|
- reg : the port number
|
||||||
And at least one of the following properties:
|
And at least one of the following properties:
|
||||||
@@ -60,14 +56,6 @@ Examples:
|
|||||||
interrupts = <115>;
|
interrupts = <115>;
|
||||||
};
|
};
|
||||||
|
|
||||||
ahci: sata@1c18000 {
|
|
||||||
compatible = "allwinner,sun4i-a10-ahci";
|
|
||||||
reg = <0x01c18000 0x1000>;
|
|
||||||
interrupts = <56>;
|
|
||||||
clocks = <&pll6 0>, <&ahb_gates 25>;
|
|
||||||
target-supply = <&reg_ahci_5v>;
|
|
||||||
};
|
|
||||||
|
|
||||||
With sub-nodes:
|
With sub-nodes:
|
||||||
sata@f7e90000 {
|
sata@f7e90000 {
|
||||||
compatible = "marvell,berlin2q-achi", "generic-ahci";
|
compatible = "marvell,berlin2q-achi", "generic-ahci";
|
||||||
|
@@ -0,0 +1,47 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-2.0
|
||||||
|
%YAML 1.2
|
||||||
|
---
|
||||||
|
$id: http://devicetree.org/schemas/ata/allwinner,sun4i-a10-ahci.yaml#
|
||||||
|
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||||
|
|
||||||
|
title: Allwinner A10 AHCI SATA Controller bindings
|
||||||
|
|
||||||
|
maintainers:
|
||||||
|
- Chen-Yu Tsai <wens@csie.org>
|
||||||
|
- Maxime Ripard <mripard@kernel.org>
|
||||||
|
|
||||||
|
properties:
|
||||||
|
compatible:
|
||||||
|
const: allwinner,sun4i-a10-ahci
|
||||||
|
|
||||||
|
reg:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
clocks:
|
||||||
|
items:
|
||||||
|
- description: AHCI Bus Clock
|
||||||
|
- description: AHCI Module Clock
|
||||||
|
|
||||||
|
interrupts:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
target-supply:
|
||||||
|
description: Regulator for SATA target power
|
||||||
|
|
||||||
|
required:
|
||||||
|
- compatible
|
||||||
|
- reg
|
||||||
|
- clocks
|
||||||
|
- interrupts
|
||||||
|
|
||||||
|
additionalProperties: false
|
||||||
|
|
||||||
|
examples:
|
||||||
|
- |
|
||||||
|
ahci: sata@1c18000 {
|
||||||
|
compatible = "allwinner,sun4i-a10-ahci";
|
||||||
|
reg = <0x01c18000 0x1000>;
|
||||||
|
interrupts = <56>;
|
||||||
|
clocks = <&pll6 0>, <&ahb_gates 25>;
|
||||||
|
target-supply = <&reg_ahci_5v>;
|
||||||
|
};
|
@@ -0,0 +1,67 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-2.0
|
||||||
|
%YAML 1.2
|
||||||
|
---
|
||||||
|
$id: http://devicetree.org/schemas/ata/allwinner,sun8i-r40-ahci.yaml#
|
||||||
|
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||||
|
|
||||||
|
title: Allwinner R40 AHCI SATA Controller bindings
|
||||||
|
|
||||||
|
maintainers:
|
||||||
|
- Chen-Yu Tsai <wens@csie.org>
|
||||||
|
- Maxime Ripard <mripard@kernel.org>
|
||||||
|
|
||||||
|
properties:
|
||||||
|
compatible:
|
||||||
|
const: allwinner,sun8i-r40-ahci
|
||||||
|
|
||||||
|
reg:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
clocks:
|
||||||
|
items:
|
||||||
|
- description: AHCI Bus Clock
|
||||||
|
- description: AHCI Module Clock
|
||||||
|
|
||||||
|
interrupts:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
resets:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
reset-names:
|
||||||
|
const: ahci
|
||||||
|
|
||||||
|
ahci-supply:
|
||||||
|
description: Regulator for the AHCI controller
|
||||||
|
|
||||||
|
phy-supply:
|
||||||
|
description: Regulator for the SATA PHY power
|
||||||
|
|
||||||
|
required:
|
||||||
|
- compatible
|
||||||
|
- reg
|
||||||
|
- clocks
|
||||||
|
- interrupts
|
||||||
|
- resets
|
||||||
|
- reset-names
|
||||||
|
|
||||||
|
additionalProperties: false
|
||||||
|
|
||||||
|
examples:
|
||||||
|
- |
|
||||||
|
#include <dt-bindings/interrupt-controller/arm-gic.h>
|
||||||
|
#include <dt-bindings/clock/sun8i-r40-ccu.h>
|
||||||
|
#include <dt-bindings/reset/sun8i-r40-ccu.h>
|
||||||
|
|
||||||
|
ahci: sata@1c18000 {
|
||||||
|
compatible = "allwinner,sun8i-r40-ahci";
|
||||||
|
reg = <0x01c18000 0x1000>;
|
||||||
|
interrupts = <GIC_SPI 56 IRQ_TYPE_LEVEL_HIGH>;
|
||||||
|
clocks = <&ccu CLK_BUS_SATA>, <&ccu CLK_SATA>;
|
||||||
|
resets = <&ccu RST_BUS_SATA>;
|
||||||
|
reset-names = "ahci";
|
||||||
|
ahci-supply = <&reg_dldo4>;
|
||||||
|
phy-supply = <&reg_eldo3>;
|
||||||
|
};
|
||||||
|
|
||||||
|
...
|
@@ -5,6 +5,7 @@ Each SATA controller should have its own node.
|
|||||||
|
|
||||||
Required properties:
|
Required properties:
|
||||||
- compatible : should be one or more of
|
- compatible : should be one or more of
|
||||||
|
"brcm,bcm7216-ahci"
|
||||||
"brcm,bcm7425-ahci"
|
"brcm,bcm7425-ahci"
|
||||||
"brcm,bcm7445-ahci"
|
"brcm,bcm7445-ahci"
|
||||||
"brcm,bcm-nsp-ahci"
|
"brcm,bcm-nsp-ahci"
|
||||||
@@ -14,6 +15,12 @@ Required properties:
|
|||||||
- reg-names : "ahci" and "top-ctrl"
|
- reg-names : "ahci" and "top-ctrl"
|
||||||
- interrupts : interrupt mapping for SATA IRQ
|
- interrupts : interrupt mapping for SATA IRQ
|
||||||
|
|
||||||
|
Optional properties:
|
||||||
|
|
||||||
|
- reset: for "brcm,bcm7216-ahci" must be a valid reset phandle
|
||||||
|
pointing to the RESCAL reset controller provider node.
|
||||||
|
- reset-names: for "brcm,bcm7216-ahci", must be "rescal".
|
||||||
|
|
||||||
Also see ahci-platform.txt.
|
Also see ahci-platform.txt.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
@@ -1,38 +0,0 @@
|
|||||||
* Faraday Technology FTIDE010 PATA controller
|
|
||||||
|
|
||||||
This controller is the first Faraday IDE interface block, used in the
|
|
||||||
StorLink SL2312 and SL3516, later known as the Cortina Systems Gemini
|
|
||||||
platform. The controller can do PIO modes 0 through 4, Multi-word DMA
|
|
||||||
(MWDM)modes 0 through 2 and Ultra DMA modes 0 through 6.
|
|
||||||
|
|
||||||
On the Gemini platform, this PATA block is accompanied by a PATA to
|
|
||||||
SATA bridge in order to support SATA. This is why a phandle to that
|
|
||||||
controller is compulsory on that platform.
|
|
||||||
|
|
||||||
The timing properties are unique per-SoC, not per-board.
|
|
||||||
|
|
||||||
Required properties:
|
|
||||||
- compatible: should be one of
|
|
||||||
"cortina,gemini-pata", "faraday,ftide010"
|
|
||||||
"faraday,ftide010"
|
|
||||||
- interrupts: interrupt for the block
|
|
||||||
- reg: registers and size for the block
|
|
||||||
|
|
||||||
Optional properties:
|
|
||||||
- clocks: a SoC clock running the peripheral.
|
|
||||||
- clock-names: should be set to "PCLK" for the peripheral clock.
|
|
||||||
|
|
||||||
Required properties for "cortina,gemini-pata" compatible:
|
|
||||||
- sata: a phandle to the Gemini PATA to SATA bridge, see
|
|
||||||
cortina,gemini-sata-bridge.txt for details.
|
|
||||||
|
|
||||||
Example:
|
|
||||||
|
|
||||||
ata@63000000 {
|
|
||||||
compatible = "cortina,gemini-pata", "faraday,ftide010";
|
|
||||||
reg = <0x63000000 0x100>;
|
|
||||||
interrupts = <4 IRQ_TYPE_EDGE_RISING>;
|
|
||||||
clocks = <&gcc GEMINI_CLK_GATE_IDE>;
|
|
||||||
clock-names = "PCLK";
|
|
||||||
sata = <&sata>;
|
|
||||||
};
|
|
89
Documentation/devicetree/bindings/ata/faraday,ftide010.yaml
Normal file
89
Documentation/devicetree/bindings/ata/faraday,ftide010.yaml
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
|
||||||
|
%YAML 1.2
|
||||||
|
---
|
||||||
|
$id: http://devicetree.org/schemas/ata/faraday,ftide010.yaml#
|
||||||
|
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||||
|
|
||||||
|
title: Faraday Technology FTIDE010 PATA controller
|
||||||
|
|
||||||
|
maintainers:
|
||||||
|
- Linus Walleij <linus.walleij@linaro.org>
|
||||||
|
|
||||||
|
description: |
|
||||||
|
This controller is the first Faraday IDE interface block, used in the
|
||||||
|
StorLink SL3512 and SL3516, later known as the Cortina Systems Gemini
|
||||||
|
platform. The controller can do PIO modes 0 through 4, Multi-word DMA
|
||||||
|
(MWDM) modes 0 through 2 and Ultra DMA modes 0 through 6.
|
||||||
|
|
||||||
|
On the Gemini platform, this PATA block is accompanied by a PATA to
|
||||||
|
SATA bridge in order to support SATA. This is why a phandle to that
|
||||||
|
controller is compulsory on that platform.
|
||||||
|
|
||||||
|
The timing properties are unique per-SoC, not per-board.
|
||||||
|
|
||||||
|
properties:
|
||||||
|
compatible:
|
||||||
|
oneOf:
|
||||||
|
- const: faraday,ftide010
|
||||||
|
- items:
|
||||||
|
- const: cortina,gemini-pata
|
||||||
|
- const: faraday,ftide010
|
||||||
|
|
||||||
|
reg:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
interrupts:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
clocks:
|
||||||
|
minItems: 1
|
||||||
|
|
||||||
|
clock-names:
|
||||||
|
const: PCLK
|
||||||
|
|
||||||
|
sata:
|
||||||
|
description:
|
||||||
|
phandle to the Gemini PATA to SATA bridge, if available
|
||||||
|
$ref: /schemas/types.yaml#/definitions/phandle
|
||||||
|
|
||||||
|
required:
|
||||||
|
- compatible
|
||||||
|
- reg
|
||||||
|
- interrupts
|
||||||
|
|
||||||
|
allOf:
|
||||||
|
- $ref: pata-common.yaml#
|
||||||
|
|
||||||
|
- if:
|
||||||
|
properties:
|
||||||
|
compatible:
|
||||||
|
contains:
|
||||||
|
const: cortina,gemini-pata
|
||||||
|
|
||||||
|
then:
|
||||||
|
required:
|
||||||
|
- sata
|
||||||
|
|
||||||
|
examples:
|
||||||
|
- |
|
||||||
|
#include <dt-bindings/interrupt-controller/irq.h>
|
||||||
|
#include <dt-bindings/clock/cortina,gemini-clock.h>
|
||||||
|
|
||||||
|
ide@63000000 {
|
||||||
|
compatible = "cortina,gemini-pata", "faraday,ftide010";
|
||||||
|
reg = <0x63000000 0x100>;
|
||||||
|
interrupts = <4 IRQ_TYPE_EDGE_RISING>;
|
||||||
|
clocks = <&gcc GEMINI_CLK_GATE_IDE>;
|
||||||
|
clock-names = "PCLK";
|
||||||
|
sata = <&sata>;
|
||||||
|
#address-cells = <1>;
|
||||||
|
#size-cells = <0>;
|
||||||
|
ide-port@0 {
|
||||||
|
reg = <0>;
|
||||||
|
};
|
||||||
|
ide-port@1 {
|
||||||
|
reg = <1>;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
...
|
50
Documentation/devicetree/bindings/ata/pata-common.yaml
Normal file
50
Documentation/devicetree/bindings/ata/pata-common.yaml
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
|
||||||
|
%YAML 1.2
|
||||||
|
---
|
||||||
|
$id: http://devicetree.org/schemas/ata/pata-common.yaml#
|
||||||
|
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||||
|
|
||||||
|
title: Common Properties for Parallel AT attachment (PATA) controllers
|
||||||
|
|
||||||
|
maintainers:
|
||||||
|
- Linus Walleij <linus.walleij@linaro.org>
|
||||||
|
|
||||||
|
description: |
|
||||||
|
This document defines device tree properties common to most Parallel
|
||||||
|
ATA (PATA, also known as IDE) AT attachment storage devices.
|
||||||
|
It doesn't constitute a device tree binding specification by itself but is
|
||||||
|
meant to be referenced by device tree bindings.
|
||||||
|
|
||||||
|
The PATA (IDE) controller-specific device tree bindings are responsible for
|
||||||
|
defining whether each property is required or optional.
|
||||||
|
|
||||||
|
properties:
|
||||||
|
$nodename:
|
||||||
|
pattern: "^ide(@.*)?$"
|
||||||
|
description:
|
||||||
|
Specifies the host controller node. PATA host controller nodes are named
|
||||||
|
"ide".
|
||||||
|
|
||||||
|
"#address-cells":
|
||||||
|
const: 1
|
||||||
|
|
||||||
|
"#size-cells":
|
||||||
|
const: 0
|
||||||
|
|
||||||
|
patternProperties:
|
||||||
|
"^ide-port@[0-1]$":
|
||||||
|
description: |
|
||||||
|
DT nodes for ports connected on the PATA host. The master drive will have
|
||||||
|
ID number 0 and the slave drive will have ID number 1. The PATA port
|
||||||
|
nodes will be named "ide-port".
|
||||||
|
type: object
|
||||||
|
|
||||||
|
properties:
|
||||||
|
reg:
|
||||||
|
minimum: 0
|
||||||
|
maximum: 1
|
||||||
|
description:
|
||||||
|
The ID number of the drive port, 0 for the master port and 1 for the
|
||||||
|
slave port.
|
||||||
|
|
||||||
|
...
|
50
Documentation/devicetree/bindings/ata/sata-common.yaml
Normal file
50
Documentation/devicetree/bindings/ata/sata-common.yaml
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
|
||||||
|
%YAML 1.2
|
||||||
|
---
|
||||||
|
$id: http://devicetree.org/schemas/ata/sata-common.yaml#
|
||||||
|
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||||
|
|
||||||
|
title: Common Properties for Serial AT attachment (SATA) controllers
|
||||||
|
|
||||||
|
maintainers:
|
||||||
|
- Linus Walleij <linus.walleij@linaro.org>
|
||||||
|
|
||||||
|
description: |
|
||||||
|
This document defines device tree properties common to most Serial
|
||||||
|
AT attachment (SATA) storage devices. It doesn't constitute a device tree
|
||||||
|
binding specification by itself but is meant to be referenced by device
|
||||||
|
tree bindings.
|
||||||
|
|
||||||
|
The SATA controller-specific device tree bindings are responsible for
|
||||||
|
defining whether each property is required or optional.
|
||||||
|
|
||||||
|
properties:
|
||||||
|
$nodename:
|
||||||
|
pattern: "^sata(@.*)?$"
|
||||||
|
description:
|
||||||
|
Specifies the host controller node. SATA host controller nodes are named
|
||||||
|
"sata"
|
||||||
|
|
||||||
|
"#address-cells":
|
||||||
|
const: 1
|
||||||
|
|
||||||
|
"#size-cells":
|
||||||
|
const: 0
|
||||||
|
|
||||||
|
patternProperties:
|
||||||
|
"^sata-port@[0-9a-e]$":
|
||||||
|
description: |
|
||||||
|
DT nodes for ports connected on the SATA host. The SATA port
|
||||||
|
nodes will be named "sata-port".
|
||||||
|
type: object
|
||||||
|
|
||||||
|
properties:
|
||||||
|
reg:
|
||||||
|
minimum: 0
|
||||||
|
maximum: 14
|
||||||
|
description:
|
||||||
|
The ID number of the drive port. SATA can potentially use a port
|
||||||
|
multiplier making it possible to connect up to 15 disks to a single
|
||||||
|
SATA port.
|
||||||
|
|
||||||
|
...
|
@@ -0,0 +1,108 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-2.0
|
||||||
|
%YAML 1.2
|
||||||
|
---
|
||||||
|
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-ahb-clk.yaml#
|
||||||
|
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||||
|
|
||||||
|
title: Allwinner A10 AHB Clock Device Tree Bindings
|
||||||
|
|
||||||
|
maintainers:
|
||||||
|
- Chen-Yu Tsai <wens@csie.org>
|
||||||
|
- Maxime Ripard <mripard@kernel.org>
|
||||||
|
|
||||||
|
deprecated: true
|
||||||
|
|
||||||
|
properties:
|
||||||
|
"#clock-cells":
|
||||||
|
const: 0
|
||||||
|
|
||||||
|
compatible:
|
||||||
|
enum:
|
||||||
|
- allwinner,sun4i-a10-ahb-clk
|
||||||
|
- allwinner,sun6i-a31-ahb1-clk
|
||||||
|
- allwinner,sun8i-h3-ahb2-clk
|
||||||
|
|
||||||
|
reg:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
clocks:
|
||||||
|
minItems: 1
|
||||||
|
maxItems: 4
|
||||||
|
description: >
|
||||||
|
The parent order must match the hardware programming order.
|
||||||
|
|
||||||
|
clock-output-names:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
required:
|
||||||
|
- "#clock-cells"
|
||||||
|
- compatible
|
||||||
|
- reg
|
||||||
|
- clocks
|
||||||
|
- clock-output-names
|
||||||
|
|
||||||
|
additionalProperties: false
|
||||||
|
|
||||||
|
allOf:
|
||||||
|
- if:
|
||||||
|
properties:
|
||||||
|
compatible:
|
||||||
|
contains:
|
||||||
|
const: allwinner,sun4i-a10-ahb-clk
|
||||||
|
|
||||||
|
then:
|
||||||
|
properties:
|
||||||
|
clocks:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
- if:
|
||||||
|
properties:
|
||||||
|
compatible:
|
||||||
|
contains:
|
||||||
|
const: allwinner,sun6i-a31-ahb1-clk
|
||||||
|
|
||||||
|
then:
|
||||||
|
properties:
|
||||||
|
clocks:
|
||||||
|
maxItems: 4
|
||||||
|
|
||||||
|
- if:
|
||||||
|
properties:
|
||||||
|
compatible:
|
||||||
|
contains:
|
||||||
|
const: allwinner,sun8i-h3-ahb2-clk
|
||||||
|
|
||||||
|
then:
|
||||||
|
properties:
|
||||||
|
clocks:
|
||||||
|
maxItems: 2
|
||||||
|
|
||||||
|
examples:
|
||||||
|
- |
|
||||||
|
ahb@1c20054 {
|
||||||
|
#clock-cells = <0>;
|
||||||
|
compatible = "allwinner,sun4i-a10-ahb-clk";
|
||||||
|
reg = <0x01c20054 0x4>;
|
||||||
|
clocks = <&axi>;
|
||||||
|
clock-output-names = "ahb";
|
||||||
|
};
|
||||||
|
|
||||||
|
- |
|
||||||
|
ahb1@1c20054 {
|
||||||
|
#clock-cells = <0>;
|
||||||
|
compatible = "allwinner,sun6i-a31-ahb1-clk";
|
||||||
|
reg = <0x01c20054 0x4>;
|
||||||
|
clocks = <&osc32k>, <&osc24M>, <&axi>, <&pll6 0>;
|
||||||
|
clock-output-names = "ahb1";
|
||||||
|
};
|
||||||
|
|
||||||
|
- |
|
||||||
|
ahb2_clk@1c2005c {
|
||||||
|
#clock-cells = <0>;
|
||||||
|
compatible = "allwinner,sun8i-h3-ahb2-clk";
|
||||||
|
reg = <0x01c2005c 0x4>;
|
||||||
|
clocks = <&ahb1>, <&pll6d2>;
|
||||||
|
clock-output-names = "ahb2";
|
||||||
|
};
|
||||||
|
|
||||||
|
...
|
@@ -0,0 +1,50 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-2.0
|
||||||
|
%YAML 1.2
|
||||||
|
---
|
||||||
|
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-apb0-clk.yaml#
|
||||||
|
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||||
|
|
||||||
|
title: Allwinner A10 APB0 Bus Clock Device Tree Bindings
|
||||||
|
|
||||||
|
maintainers:
|
||||||
|
- Chen-Yu Tsai <wens@csie.org>
|
||||||
|
- Maxime Ripard <mripard@kernel.org>
|
||||||
|
|
||||||
|
deprecated: true
|
||||||
|
|
||||||
|
properties:
|
||||||
|
"#clock-cells":
|
||||||
|
const: 0
|
||||||
|
|
||||||
|
compatible:
|
||||||
|
const: allwinner,sun4i-a10-apb0-clk
|
||||||
|
|
||||||
|
reg:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
clocks:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
clock-output-names:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
required:
|
||||||
|
- "#clock-cells"
|
||||||
|
- compatible
|
||||||
|
- reg
|
||||||
|
- clocks
|
||||||
|
- clock-output-names
|
||||||
|
|
||||||
|
additionalProperties: false
|
||||||
|
|
||||||
|
examples:
|
||||||
|
- |
|
||||||
|
apb0@1c20054 {
|
||||||
|
#clock-cells = <0>;
|
||||||
|
compatible = "allwinner,sun4i-a10-apb0-clk";
|
||||||
|
reg = <0x01c20054 0x4>;
|
||||||
|
clocks = <&ahb>;
|
||||||
|
clock-output-names = "apb0";
|
||||||
|
};
|
||||||
|
|
||||||
|
...
|
@@ -0,0 +1,52 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-2.0
|
||||||
|
%YAML 1.2
|
||||||
|
---
|
||||||
|
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-apb1-clk.yaml#
|
||||||
|
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||||
|
|
||||||
|
title: Allwinner A10 APB1 Bus Clock Device Tree Bindings
|
||||||
|
|
||||||
|
maintainers:
|
||||||
|
- Chen-Yu Tsai <wens@csie.org>
|
||||||
|
- Maxime Ripard <mripard@kernel.org>
|
||||||
|
|
||||||
|
deprecated: true
|
||||||
|
|
||||||
|
properties:
|
||||||
|
"#clock-cells":
|
||||||
|
const: 0
|
||||||
|
|
||||||
|
compatible:
|
||||||
|
const: allwinner,sun4i-a10-apb1-clk
|
||||||
|
|
||||||
|
reg:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
clocks:
|
||||||
|
maxItems: 3
|
||||||
|
description: >
|
||||||
|
The parent order must match the hardware programming order.
|
||||||
|
|
||||||
|
clock-output-names:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
required:
|
||||||
|
- "#clock-cells"
|
||||||
|
- compatible
|
||||||
|
- reg
|
||||||
|
- clocks
|
||||||
|
- clock-output-names
|
||||||
|
|
||||||
|
additionalProperties: false
|
||||||
|
|
||||||
|
examples:
|
||||||
|
- |
|
||||||
|
clk@1c20058 {
|
||||||
|
#clock-cells = <0>;
|
||||||
|
compatible = "allwinner,sun4i-a10-apb1-clk";
|
||||||
|
reg = <0x01c20058 0x4>;
|
||||||
|
clocks = <&osc24M>, <&pll6 1>, <&osc32k>;
|
||||||
|
clock-output-names = "apb1";
|
||||||
|
};
|
||||||
|
|
||||||
|
...
|
@@ -0,0 +1,61 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-2.0
|
||||||
|
%YAML 1.2
|
||||||
|
---
|
||||||
|
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-axi-clk.yaml#
|
||||||
|
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||||
|
|
||||||
|
title: Allwinner A10 AXI Clock Device Tree Bindings
|
||||||
|
|
||||||
|
maintainers:
|
||||||
|
- Chen-Yu Tsai <wens@csie.org>
|
||||||
|
- Maxime Ripard <mripard@kernel.org>
|
||||||
|
|
||||||
|
deprecated: true
|
||||||
|
|
||||||
|
properties:
|
||||||
|
"#clock-cells":
|
||||||
|
const: 0
|
||||||
|
|
||||||
|
compatible:
|
||||||
|
enum:
|
||||||
|
- allwinner,sun4i-a10-axi-clk
|
||||||
|
- allwinner,sun8i-a23-axi-clk
|
||||||
|
|
||||||
|
reg:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
clocks:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
clock-output-names:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
required:
|
||||||
|
- "#clock-cells"
|
||||||
|
- compatible
|
||||||
|
- reg
|
||||||
|
- clocks
|
||||||
|
- clock-output-names
|
||||||
|
|
||||||
|
additionalProperties: false
|
||||||
|
|
||||||
|
examples:
|
||||||
|
- |
|
||||||
|
axi@1c20054 {
|
||||||
|
#clock-cells = <0>;
|
||||||
|
compatible = "allwinner,sun4i-a10-axi-clk";
|
||||||
|
reg = <0x01c20054 0x4>;
|
||||||
|
clocks = <&cpu>;
|
||||||
|
clock-output-names = "axi";
|
||||||
|
};
|
||||||
|
|
||||||
|
- |
|
||||||
|
axi_clk@1c20050 {
|
||||||
|
#clock-cells = <0>;
|
||||||
|
compatible = "allwinner,sun8i-a23-axi-clk";
|
||||||
|
reg = <0x01c20050 0x4>;
|
||||||
|
clocks = <&cpu>;
|
||||||
|
clock-output-names = "axi";
|
||||||
|
};
|
||||||
|
|
||||||
|
...
|
@@ -0,0 +1,52 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-2.0
|
||||||
|
%YAML 1.2
|
||||||
|
---
|
||||||
|
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-cpu-clk.yaml#
|
||||||
|
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||||
|
|
||||||
|
title: Allwinner A10 CPU Clock Device Tree Bindings
|
||||||
|
|
||||||
|
maintainers:
|
||||||
|
- Chen-Yu Tsai <wens@csie.org>
|
||||||
|
- Maxime Ripard <mripard@kernel.org>
|
||||||
|
|
||||||
|
deprecated: true
|
||||||
|
|
||||||
|
properties:
|
||||||
|
"#clock-cells":
|
||||||
|
const: 0
|
||||||
|
|
||||||
|
compatible:
|
||||||
|
const: allwinner,sun4i-a10-cpu-clk
|
||||||
|
|
||||||
|
reg:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
clocks:
|
||||||
|
maxItems: 4
|
||||||
|
description: >
|
||||||
|
The parent order must match the hardware programming order.
|
||||||
|
|
||||||
|
clock-output-names:
|
||||||
|
maxItems: 1
|
||||||
|
|
||||||
|
required:
|
||||||
|
- "#clock-cells"
|
||||||
|
- compatible
|
||||||
|
- reg
|
||||||
|
- clocks
|
||||||
|
- clock-output-names
|
||||||
|
|
||||||
|
additionalProperties: false
|
||||||
|
|
||||||
|
examples:
|
||||||
|
- |
|
||||||
|
cpu@1c20054 {
|
||||||
|
#clock-cells = <0>;
|
||||||
|
compatible = "allwinner,sun4i-a10-cpu-clk";
|
||||||
|
reg = <0x01c20054 0x4>;
|
||||||
|
clocks = <&osc32k>, <&osc24M>, <&pll1>, <&dummy>;
|
||||||
|
clock-output-names = "cpu";
|
||||||
|
};
|
||||||
|
|
||||||
|
...
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user