mirror of
https://github.com/tbsdtv/linux_media.git
synced 2025-07-23 20:51:03 +02:00
Merge branch 'for-5.7/appleir' into for-linus
- small code cleanups in hid-appleir from Lucas Tanure
This commit is contained in:
4
.gitignore
vendored
4
.gitignore
vendored
@@ -100,6 +100,10 @@ modules.order
|
||||
/include/ksym/
|
||||
/arch/*/include/generated/
|
||||
|
||||
# Generated lkdtm tests
|
||||
/tools/testing/selftests/lkdtm/*.sh
|
||||
!/tools/testing/selftests/lkdtm/run.sh
|
||||
|
||||
# stgit generated dirs
|
||||
patches-*
|
||||
|
||||
|
11
.mailmap
11
.mailmap
@@ -18,6 +18,7 @@ Aleksey Gorelov <aleksey_gorelov@phoenix.com>
|
||||
Aleksandar Markovic <aleksandar.markovic@mips.com> <aleksandar.markovic@imgtec.com>
|
||||
Alex Shi <alex.shi@linux.alibaba.com> <alex.shi@intel.com>
|
||||
Alex Shi <alex.shi@linux.alibaba.com> <alex.shi@linaro.org>
|
||||
Alexandre Belloni <alexandre.belloni@bootlin.com> <alexandre.belloni@free-electrons.com>
|
||||
Alexei Starovoitov <ast@kernel.org> <ast@plumgrid.com>
|
||||
Alexei Starovoitov <ast@kernel.org> <alexei.starovoitov@gmail.com>
|
||||
Alexei Starovoitov <ast@kernel.org> <ast@fb.com>
|
||||
@@ -27,6 +28,8 @@ Andi Shyti <andi@etezian.org> <andi.shyti@samsung.com>
|
||||
Andreas Herrmann <aherrman@de.ibm.com>
|
||||
Andrey Ryabinin <ryabinin.a.a@gmail.com> <a.ryabinin@samsung.com>
|
||||
Andrew Morton <akpm@linux-foundation.org>
|
||||
Andrew Murray <amurray@thegoodpenguin.co.uk> <andrew.murray@arm.com>
|
||||
Andrew Murray <amurray@thegoodpenguin.co.uk> <amurray@embedded-bits.co.uk>
|
||||
Andrew Vasquez <andrew.vasquez@qlogic.com>
|
||||
Andy Adamson <andros@citi.umich.edu>
|
||||
Antoine Tenart <antoine.tenart@free-electrons.com>
|
||||
@@ -74,6 +77,7 @@ Dmitry Safonov <0x7f454c46@gmail.com> <dima@arista.com>
|
||||
Domen Puncer <domen@coderock.org>
|
||||
Douglas Gilbert <dougg@torque.net>
|
||||
Ed L. Cashin <ecashin@coraid.com>
|
||||
Erik Kaneda <erik.kaneda@intel.com> <erik.schmauss@intel.com>
|
||||
Evgeniy Polyakov <johnpol@2ka.mipt.ru>
|
||||
Felipe W Damasio <felipewd@terra.com.br>
|
||||
Felix Kuhling <fxkuehl@gmx.de>
|
||||
@@ -138,6 +142,7 @@ Juha Yrjola <at solidboot.com>
|
||||
Juha Yrjola <juha.yrjola@nokia.com>
|
||||
Juha Yrjola <juha.yrjola@solidboot.com>
|
||||
Julien Thierry <julien.thierry.kdev@gmail.com> <julien.thierry@arm.com>
|
||||
Kamil Konieczny <k.konieczny@samsung.com> <k.konieczny@partner.samsung.com>
|
||||
Kay Sievers <kay.sievers@vrfy.org>
|
||||
Kenneth W Chen <kenneth.w.chen@intel.com>
|
||||
Konstantin Khlebnikov <koct9i@gmail.com> <k.khlebnikov@samsung.com>
|
||||
@@ -209,6 +214,10 @@ Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
|
||||
Patrick Mochel <mochel@digitalimplant.org>
|
||||
Paul Burton <paulburton@kernel.org> <paul.burton@imgtec.com>
|
||||
Paul Burton <paulburton@kernel.org> <paul.burton@mips.com>
|
||||
Paul E. McKenney <paulmck@kernel.org> <paulmck@linux.ibm.com>
|
||||
Paul E. McKenney <paulmck@kernel.org> <paulmck@linux.vnet.ibm.com>
|
||||
Paul E. McKenney <paulmck@kernel.org> <paul.mckenney@linaro.org>
|
||||
Paul E. McKenney <paulmck@kernel.org> <paulmck@us.ibm.com>
|
||||
Peter A Jonsson <pj@ludd.ltu.se>
|
||||
Peter Oruba <peter@oruba.de>
|
||||
Peter Oruba <peter.oruba@amd.com>
|
||||
@@ -217,6 +226,7 @@ Praveen BP <praveenbp@ti.com>
|
||||
Punit Agrawal <punitagrawal@gmail.com> <punit.agrawal@arm.com>
|
||||
Qais Yousef <qsyousef@gmail.com> <qais.yousef@imgtec.com>
|
||||
Quentin Perret <qperret@qperret.net> <quentin.perret@arm.com>
|
||||
Rafael J. Wysocki <rjw@rjwysocki.net> <rjw@sisk.pl>
|
||||
Rajesh Shah <rajesh.shah@intel.com>
|
||||
Ralf Baechle <ralf@linux-mips.org>
|
||||
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
|
||||
@@ -252,6 +262,7 @@ Sumit Semwal <sumit.semwal@ti.com>
|
||||
Tejun Heo <htejun@gmail.com>
|
||||
Thomas Graf <tgraf@suug.ch>
|
||||
Thomas Pedersen <twp@codeaurora.org>
|
||||
Tiezhu Yang <yangtiezhu@loongson.cn> <kernelpatch@126.com>
|
||||
Todor Tomov <todor.too@gmail.com> <todor.tomov@linaro.org>
|
||||
Tony Luck <tony.luck@intel.com>
|
||||
TripleX Chung <xxx.phy@gmail.com> <zhongyu@18mail.cn>
|
||||
|
2
COPYING
2
COPYING
@@ -16,3 +16,5 @@ In addition, other licenses may also apply. Please see:
|
||||
Documentation/process/license-rules.rst
|
||||
|
||||
for more details.
|
||||
|
||||
All contributions to the Linux Kernel are subject to this COPYING file.
|
||||
|
9
CREDITS
9
CREDITS
@@ -567,6 +567,11 @@ D: Original author of Amiga FFS filesystem
|
||||
S: Orlando, Florida
|
||||
S: USA
|
||||
|
||||
N: Paul Burton
|
||||
E: paulburton@kernel.org
|
||||
W: https://pburton.com
|
||||
D: MIPS maintainer 2018-2020
|
||||
|
||||
N: Lennert Buytenhek
|
||||
E: kernel@wantstofly.org
|
||||
D: Original (2.4) rewrite of the ethernet bridging code
|
||||
@@ -3302,7 +3307,9 @@ S: France
|
||||
N: Aleksa Sarai
|
||||
E: cyphar@cyphar.com
|
||||
W: https://www.cyphar.com/
|
||||
D: `pids` cgroup subsystem
|
||||
D: /sys/fs/cgroup/pids
|
||||
D: openat2(2)
|
||||
S: Sydney, Australia
|
||||
|
||||
N: Dipankar Sarma
|
||||
E: dipankar@in.ibm.com
|
||||
|
26
Documentation/ABI/obsolete/sysfs-selinux-disable
Normal file
26
Documentation/ABI/obsolete/sysfs-selinux-disable
Normal file
@@ -0,0 +1,26 @@
|
||||
What: /sys/fs/selinux/disable
|
||||
Date: April 2005 (predates git)
|
||||
KernelVersion: 2.6.12-rc2 (predates git)
|
||||
Contact: selinux@vger.kernel.org
|
||||
Description:
|
||||
|
||||
The selinuxfs "disable" node allows SELinux to be disabled at runtime
|
||||
prior to a policy being loaded into the kernel. If disabled via this
|
||||
mechanism, SELinux will remain disabled until the system is rebooted.
|
||||
|
||||
The preferred method of disabling SELinux is via the "selinux=0" boot
|
||||
parameter, but the selinuxfs "disable" node was created to make it
|
||||
easier for systems with primitive bootloaders that did not allow for
|
||||
easy modification of the kernel command line. Unfortunately, allowing
|
||||
for SELinux to be disabled at runtime makes it difficult to secure the
|
||||
kernel's LSM hooks using the "__ro_after_init" feature.
|
||||
|
||||
Thankfully, the need for the SELinux runtime disable appears to be
|
||||
gone, the default Kconfig configuration disables this selinuxfs node,
|
||||
and only one of the major distributions, Fedora, supports disabling
|
||||
SELinux at runtime. Fedora is in the process of removing the
|
||||
selinuxfs "disable" node and once that is complete we will start the
|
||||
slow process of removing this code from the kernel.
|
||||
|
||||
More information on /sys/fs/selinux/disable can be found under the
|
||||
CONFIG_SECURITY_SELINUX_DISABLE Kconfig option.
|
171
Documentation/ABI/stable/sysfs-driver-dma-idxd
Normal file
171
Documentation/ABI/stable/sysfs-driver-dma-idxd
Normal file
@@ -0,0 +1,171 @@
|
||||
What: sys/bus/dsa/devices/dsa<m>/cdev_major
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The major number that the character device driver assigned to
|
||||
this device.
|
||||
|
||||
What: sys/bus/dsa/devices/dsa<m>/errors
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The error information for this device.
|
||||
|
||||
What: sys/bus/dsa/devices/dsa<m>/max_batch_size
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The largest number of work descriptors in a batch.
|
||||
|
||||
What: sys/bus/dsa/devices/dsa<m>/max_work_queues_size
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The maximum work queue size supported by this device.
|
||||
|
||||
What: sys/bus/dsa/devices/dsa<m>/max_engines
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The maximum number of engines supported by this device.
|
||||
|
||||
What: sys/bus/dsa/devices/dsa<m>/max_groups
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The maximum number of groups can be created under this device.
|
||||
|
||||
What: sys/bus/dsa/devices/dsa<m>/max_tokens
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The total number of bandwidth tokens supported by this device.
|
||||
The bandwidth tokens represent resources within the DSA
|
||||
implementation, and these resources are allocated by engines to
|
||||
support operations.
|
||||
|
||||
What: sys/bus/dsa/devices/dsa<m>/max_transfer_size
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The number of bytes to be read from the source address to
|
||||
perform the operation. The maximum transfer size is dependent on
|
||||
the workqueue the descriptor was submitted to.
|
||||
|
||||
What: sys/bus/dsa/devices/dsa<m>/max_work_queues
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The maximum work queue number that this device supports.
|
||||
|
||||
What: sys/bus/dsa/devices/dsa<m>/numa_node
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The numa node number for this device.
|
||||
|
||||
What: sys/bus/dsa/devices/dsa<m>/op_cap
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The operation capability bit mask specify the operation types
|
||||
supported by the this device.
|
||||
|
||||
What: sys/bus/dsa/devices/dsa<m>/state
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The state information of this device. It can be either enabled
|
||||
or disabled.
|
||||
|
||||
What: sys/bus/dsa/devices/dsa<m>/group<m>.<n>
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The assigned group under this device.
|
||||
|
||||
What: sys/bus/dsa/devices/dsa<m>/engine<m>.<n>
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The assigned engine under this device.
|
||||
|
||||
What: sys/bus/dsa/devices/dsa<m>/wq<m>.<n>
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The assigned work queue under this device.
|
||||
|
||||
What: sys/bus/dsa/devices/dsa<m>/configurable
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: To indicate if this device is configurable or not.
|
||||
|
||||
What: sys/bus/dsa/devices/dsa<m>/token_limit
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The maximum number of bandwidth tokens that may be in use at
|
||||
one time by operations that access low bandwidth memory in the
|
||||
device.
|
||||
|
||||
What: sys/bus/dsa/devices/wq<m>.<n>/group_id
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The group id that this work queue belongs to.
|
||||
|
||||
What: sys/bus/dsa/devices/wq<m>.<n>/size
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The work queue size for this work queue.
|
||||
|
||||
What: sys/bus/dsa/devices/wq<m>.<n>/type
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The type of this work queue, it can be "kernel" type for work
|
||||
queue usages in the kernel space or "user" type for work queue
|
||||
usages by applications in user space.
|
||||
|
||||
What: sys/bus/dsa/devices/wq<m>.<n>/cdev_minor
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The minor number assigned to this work queue by the character
|
||||
device driver.
|
||||
|
||||
What: sys/bus/dsa/devices/wq<m>.<n>/mode
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The work queue mode type for this work queue.
|
||||
|
||||
What: sys/bus/dsa/devices/wq<m>.<n>/priority
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The priority value of this work queue, it is a vlue relative to
|
||||
other work queue in the same group to control quality of service
|
||||
for dispatching work from multiple workqueues in the same group.
|
||||
|
||||
What: sys/bus/dsa/devices/wq<m>.<n>/state
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The current state of the work queue.
|
||||
|
||||
What: sys/bus/dsa/devices/wq<m>.<n>/threshold
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The number of entries in this work queue that may be filled
|
||||
via a limited portal.
|
||||
|
||||
What: sys/bus/dsa/devices/engine<m>.<n>/group_id
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The group that this engine belongs to.
|
@@ -16,6 +16,10 @@ Description:
|
||||
write UDC's name found in /sys/class/udc/*
|
||||
to bind a gadget, empty string "" to unbind.
|
||||
|
||||
max_speed - maximum speed the driver supports. Valid
|
||||
names are super-speed-plus, super-speed,
|
||||
high-speed, full-speed, and low-speed.
|
||||
|
||||
bDeviceClass - USB device class code
|
||||
bDeviceSubClass - USB device subclass code
|
||||
bDeviceProtocol - USB device protocol code
|
||||
|
@@ -25,11 +25,11 @@ Description:
|
||||
lsm: [[subj_user=] [subj_role=] [subj_type=]
|
||||
[obj_user=] [obj_role=] [obj_type=]]
|
||||
option: [[appraise_type=]] [template=] [permit_directio]
|
||||
[appraise_flag=]
|
||||
[appraise_flag=] [keyrings=]
|
||||
base: func:= [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK]
|
||||
[FIRMWARE_CHECK]
|
||||
[KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK]
|
||||
[KEXEC_CMDLINE]
|
||||
[KEXEC_CMDLINE] [KEY_CHECK]
|
||||
mask:= [[^]MAY_READ] [[^]MAY_WRITE] [[^]MAY_APPEND]
|
||||
[[^]MAY_EXEC]
|
||||
fsmagic:= hex value
|
||||
@@ -42,6 +42,9 @@ Description:
|
||||
appraise_flag:= [check_blacklist]
|
||||
Currently, blacklist check is only for files signed with appended
|
||||
signature.
|
||||
keyrings:= list of keyrings
|
||||
(eg, .builtin_trusted_keys|.ima). Only valid
|
||||
when action is "measure" and func is KEY_CHECK.
|
||||
template:= name of a defined IMA template type
|
||||
(eg, ima-ng). Only valid when action is "measure".
|
||||
pcr:= decimal value
|
||||
@@ -113,3 +116,12 @@ Description:
|
||||
Example of appraise rule allowing modsig appended signatures:
|
||||
|
||||
appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig
|
||||
|
||||
Example of measure rule using KEY_CHECK to measure all keys:
|
||||
|
||||
measure func=KEY_CHECK
|
||||
|
||||
Example of measure rule using KEY_CHECK to only measure
|
||||
keys added to .builtin_trusted_keys or .ima keyring:
|
||||
|
||||
measure func=KEY_CHECK keyrings=.builtin_trusted_keys|.ima
|
||||
|
@@ -33,6 +33,14 @@ Description:
|
||||
Requires a separate RTC_PIE_ON call to enable the periodic
|
||||
interrupts.
|
||||
|
||||
* RTC_VL_READ: Read the voltage inputs status of the RTC when
|
||||
supported. The value is a bit field of RTC_VL_*, giving the
|
||||
status of the main and backup voltages.
|
||||
|
||||
* RTC_VL_CLEAR: Clear the voltage status of the RTC. Some RTCs
|
||||
need user interaction when the backup power provider is
|
||||
replaced or charged to be able to clear the status.
|
||||
|
||||
The ioctl() calls supported by the older /dev/rtc interface are
|
||||
also supported by the newer RTC class framework. However,
|
||||
because the chips and systems are not standardized, some PC/AT
|
||||
|
@@ -1726,3 +1726,16 @@ Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
List of valid periods (in seconds) for which the light intensity
|
||||
must be above the threshold level before interrupt is asserted.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_filter_notch_center_frequency
|
||||
KernelVersion: 5.5
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Center frequency in Hz for a notch filter. Used i.e. for line
|
||||
noise suppression.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_temp_thermocouple_type
|
||||
KernelVersion: 5.5
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
One of the following thermocouple types: B, E, J, K, N, R, S, T.
|
||||
|
19
Documentation/ABI/testing/sysfs-bus-iio-dma-buffer
Normal file
19
Documentation/ABI/testing/sysfs-bus-iio-dma-buffer
Normal file
@@ -0,0 +1,19 @@
|
||||
What: /sys/bus/iio/devices/iio:deviceX/buffer/length_align_bytes
|
||||
KernelVersion: 5.4
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
DMA buffers tend to have a alignment requirement for the
|
||||
buffers. If this alignment requirement is not met samples might
|
||||
be dropped from the buffer.
|
||||
|
||||
This property reports the alignment requirements in bytes.
|
||||
This means that the buffer size in bytes needs to be a integer
|
||||
multiple of the number reported by this file.
|
||||
|
||||
The alignment requirements in number of sample sets will depend
|
||||
on the enabled channels and the bytes per channel. This means
|
||||
that the alignment requirement in samples sets might change
|
||||
depending on which and how many channels are enabled. Whereas
|
||||
the alignment requirement reported in bytes by this property
|
||||
will remain static and does not depend on which channels are
|
||||
enabled.
|
63
Documentation/ABI/testing/sysfs-bus-mdio
Normal file
63
Documentation/ABI/testing/sysfs-bus-mdio
Normal file
@@ -0,0 +1,63 @@
|
||||
What: /sys/bus/mdio_bus/devices/.../statistics/
|
||||
Date: January 2020
|
||||
KernelVersion: 5.6
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
This folder contains statistics about global and per
|
||||
MDIO bus address statistics.
|
||||
|
||||
What: /sys/bus/mdio_bus/devices/.../statistics/transfers
|
||||
Date: January 2020
|
||||
KernelVersion: 5.6
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
Total number of transfers for this MDIO bus.
|
||||
|
||||
What: /sys/bus/mdio_bus/devices/.../statistics/errors
|
||||
Date: January 2020
|
||||
KernelVersion: 5.6
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
Total number of transfer errors for this MDIO bus.
|
||||
|
||||
What: /sys/bus/mdio_bus/devices/.../statistics/writes
|
||||
Date: January 2020
|
||||
KernelVersion: 5.6
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
Total number of write transactions for this MDIO bus.
|
||||
|
||||
What: /sys/bus/mdio_bus/devices/.../statistics/reads
|
||||
Date: January 2020
|
||||
KernelVersion: 5.6
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
Total number of read transactions for this MDIO bus.
|
||||
|
||||
What: /sys/bus/mdio_bus/devices/.../statistics/transfers_<addr>
|
||||
Date: January 2020
|
||||
KernelVersion: 5.6
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
Total number of transfers for this MDIO bus address.
|
||||
|
||||
What: /sys/bus/mdio_bus/devices/.../statistics/errors_<addr>
|
||||
Date: January 2020
|
||||
KernelVersion: 5.6
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
Total number of transfer errors for this MDIO bus address.
|
||||
|
||||
What: /sys/bus/mdio_bus/devices/.../statistics/writes_<addr>
|
||||
Date: January 2020
|
||||
KernelVersion: 5.6
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
Total number of write transactions for this MDIO bus address.
|
||||
|
||||
What: /sys/bus/mdio_bus/devices/.../statistics/reads_<addr>
|
||||
Date: January 2020
|
||||
KernelVersion: 5.6
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
Total number of read transactions for this MDIO bus address.
|
@@ -7,6 +7,13 @@ Description:
|
||||
The name of devfreq object denoted as ... is same as the
|
||||
name of device using devfreq.
|
||||
|
||||
What: /sys/class/devfreq/.../name
|
||||
Date: November 2019
|
||||
Contact: Chanwoo Choi <cw00.choi@samsung.com>
|
||||
Description:
|
||||
The /sys/class/devfreq/.../name shows the name of device
|
||||
of the corresponding devfreq object.
|
||||
|
||||
What: /sys/class/devfreq/.../governor
|
||||
Date: September 2011
|
||||
Contact: MyungJoo Ham <myungjoo.ham@samsung.com>
|
||||
@@ -48,12 +55,15 @@ What: /sys/class/devfreq/.../trans_stat
|
||||
Date: October 2012
|
||||
Contact: MyungJoo Ham <myungjoo.ham@samsung.com>
|
||||
Description:
|
||||
This ABI shows the statistics of devfreq behavior on a
|
||||
specific device. It shows the time spent in each state and
|
||||
the number of transitions between states.
|
||||
This ABI shows or clears the statistics of devfreq behavior
|
||||
on a specific device. It shows the time spent in each state
|
||||
and the number of transitions between states.
|
||||
In order to activate this ABI, the devfreq target device
|
||||
driver should provide the list of available frequencies
|
||||
with its profile.
|
||||
with its profile. If need to reset the statistics of devfreq
|
||||
behavior on a specific device, enter 0(zero) to 'trans_stat'
|
||||
as following:
|
||||
echo 0 > /sys/class/devfreq/.../trans_stat
|
||||
|
||||
What: /sys/class/devfreq/.../userspace/set_freq
|
||||
Date: September 2011
|
||||
|
@@ -189,7 +189,8 @@ Description:
|
||||
Access: Read
|
||||
Valid values: "Unknown", "Good", "Overheat", "Dead",
|
||||
"Over voltage", "Unspecified failure", "Cold",
|
||||
"Watchdog timer expire", "Safety timer expire"
|
||||
"Watchdog timer expire", "Safety timer expire",
|
||||
"Over current"
|
||||
|
||||
What: /sys/class/power_supply/<supply_name>/precharge_current
|
||||
Date: June 2017
|
||||
|
@@ -196,6 +196,12 @@ Description:
|
||||
does not reflect it. Likewise, if one enables a deep state but a
|
||||
lighter state still is disabled, then this has no effect.
|
||||
|
||||
What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/default_status
|
||||
Date: December 2019
|
||||
KernelVersion: v5.6
|
||||
Contact: Linux power management list <linux-pm@vger.kernel.org>
|
||||
Description:
|
||||
(RO) The default status of this state, "enabled" or "disabled".
|
||||
|
||||
What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/residency
|
||||
Date: March 2014
|
||||
|
@@ -11,3 +11,16 @@ Description:
|
||||
#echo 00:19.0-E0:2:FF > /sys/bus/pci/drivers/pciback/quirks
|
||||
will allow the guest to read and write to the configuration
|
||||
register 0x0E.
|
||||
|
||||
What: /sys/bus/pci/drivers/pciback/allow_interrupt_control
|
||||
Date: Jan 2020
|
||||
KernelVersion: 5.6
|
||||
Contact: xen-devel@lists.xenproject.org
|
||||
Description:
|
||||
List of devices which can have interrupt control flag (INTx,
|
||||
MSI, MSI-X) set by a connected guest. It is meant to be set
|
||||
only when the guest is a stubdomain hosting device model (qemu)
|
||||
and the actual device is assigned to a HVM. It is not safe
|
||||
(similar to permissive attribute) to set for a devices assigned
|
||||
to a PV guest. The device is automatically removed from this
|
||||
list when the connected pcifront terminates.
|
||||
|
@@ -25,3 +25,13 @@ Description:
|
||||
allocated without being in use. The time is in
|
||||
seconds, 0 means indefinitely long.
|
||||
The default is 60 seconds.
|
||||
|
||||
What: /sys/module/xen_blkback/parameters/buffer_squeeze_duration_ms
|
||||
Date: December 2019
|
||||
KernelVersion: 5.6
|
||||
Contact: SeongJae Park <sjpark@amazon.de>
|
||||
Description:
|
||||
When memory pressure is reported to blkback this option
|
||||
controls the duration in milliseconds that blkback will not
|
||||
cache any page not backed by a grant mapping.
|
||||
The default is 10ms.
|
||||
|
@@ -1,37 +1,40 @@
|
||||
What: /sys/fs/f2fs/<disk>/gc_max_sleep_time
|
||||
Date: July 2013
|
||||
Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
|
||||
Description:
|
||||
Controls the maximun sleep time for gc_thread. Time
|
||||
is in milliseconds.
|
||||
Description: Controls the maximum sleep time for gc_thread. Time
|
||||
is in milliseconds.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/gc_min_sleep_time
|
||||
Date: July 2013
|
||||
Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
|
||||
Description:
|
||||
Controls the minimum sleep time for gc_thread. Time
|
||||
is in milliseconds.
|
||||
Description: Controls the minimum sleep time for gc_thread. Time
|
||||
is in milliseconds.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/gc_no_gc_sleep_time
|
||||
Date: July 2013
|
||||
Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
|
||||
Description:
|
||||
Controls the default sleep time for gc_thread. Time
|
||||
is in milliseconds.
|
||||
Description: Controls the default sleep time for gc_thread. Time
|
||||
is in milliseconds.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/gc_idle
|
||||
Date: July 2013
|
||||
Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
|
||||
Description:
|
||||
Controls the victim selection policy for garbage collection.
|
||||
Description: Controls the victim selection policy for garbage collection.
|
||||
Setting gc_idle = 0(default) will disable this option. Setting
|
||||
gc_idle = 1 will select the Cost Benefit approach & setting
|
||||
gc_idle = 2 will select the greedy approach.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/reclaim_segments
|
||||
Date: October 2013
|
||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||
Description:
|
||||
Controls the issue rate of segment discard commands.
|
||||
Description: This parameter controls the number of prefree segments to be
|
||||
reclaimed. If the number of prefree segments is larger than
|
||||
the number of segments in the proportion to the percentage
|
||||
over total volume size, f2fs tries to conduct checkpoint to
|
||||
reclaim the prefree segments to free segments.
|
||||
By default, 5% over total # of segments.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/max_blkaddr
|
||||
What: /sys/fs/f2fs/<disk>/main_blkaddr
|
||||
Date: November 2019
|
||||
Contact: "Ramon Pantin" <pantin@google.com>
|
||||
Description:
|
||||
@@ -40,227 +43,278 @@ Description:
|
||||
What: /sys/fs/f2fs/<disk>/ipu_policy
|
||||
Date: November 2013
|
||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||
Description:
|
||||
Controls the in-place-update policy.
|
||||
Description: Controls the in-place-update policy.
|
||||
updates in f2fs. User can set:
|
||||
0x01: F2FS_IPU_FORCE, 0x02: F2FS_IPU_SSR,
|
||||
0x04: F2FS_IPU_UTIL, 0x08: F2FS_IPU_SSR_UTIL,
|
||||
0x10: F2FS_IPU_FSYNC, 0x20: F2FS_IPU_ASYNC,
|
||||
0x40: F2FS_IPU_NOCACHE.
|
||||
Refer segment.h for details.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/min_ipu_util
|
||||
Date: November 2013
|
||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||
Description:
|
||||
Controls the FS utilization condition for the in-place-update
|
||||
policies.
|
||||
Description: Controls the FS utilization condition for the in-place-update
|
||||
policies. It is used by F2FS_IPU_UTIL and F2FS_IPU_SSR_UTIL policies.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/min_fsync_blocks
|
||||
Date: September 2014
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description:
|
||||
Controls the dirty page count condition for the in-place-update
|
||||
policies.
|
||||
Description: Controls the dirty page count condition for the in-place-update
|
||||
policies.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/min_seq_blocks
|
||||
Date: August 2018
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description:
|
||||
Controls the dirty page count condition for batched sequential
|
||||
writes in ->writepages.
|
||||
|
||||
Description: Controls the dirty page count condition for batched sequential
|
||||
writes in writepages.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/min_hot_blocks
|
||||
Date: March 2017
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description:
|
||||
Controls the dirty page count condition for redefining hot data.
|
||||
Description: Controls the dirty page count condition for redefining hot data.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/min_ssr_sections
|
||||
Date: October 2017
|
||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||
Description:
|
||||
Controls the fee section threshold to trigger SSR allocation.
|
||||
Description: Controls the free section threshold to trigger SSR allocation.
|
||||
If this is large, SSR mode will be enabled early.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/max_small_discards
|
||||
Date: November 2013
|
||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||
Description:
|
||||
Controls the issue rate of small discard commands.
|
||||
Description: Controls the issue rate of discard commands that consist of small
|
||||
blocks less than 2MB. The candidates to be discarded are cached until
|
||||
checkpoint is triggered, and issued during the checkpoint.
|
||||
By default, it is disabled with 0.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/discard_granularity
|
||||
Date: July 2017
|
||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||
Description:
|
||||
Controls discard granularity of inner discard thread, inner thread
|
||||
What: /sys/fs/f2fs/<disk>/discard_granularity
|
||||
Date: July 2017
|
||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||
Description: Controls discard granularity of inner discard thread. Inner thread
|
||||
will not issue discards with size that is smaller than granularity.
|
||||
The unit size is one block, now only support configuring in range
|
||||
of [1, 512].
|
||||
The unit size is one block(4KB), now only support configuring
|
||||
in range of [1, 512]. Default value is 4(=16KB).
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/umount_discard_timeout
|
||||
Date: January 2019
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description:
|
||||
Set timeout to issue discard commands during umount.
|
||||
Default: 5 secs
|
||||
What: /sys/fs/f2fs/<disk>/umount_discard_timeout
|
||||
Date: January 2019
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description: Set timeout to issue discard commands during umount.
|
||||
Default: 5 secs
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/max_victim_search
|
||||
Date: January 2014
|
||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||
Description:
|
||||
Controls the number of trials to find a victim segment.
|
||||
Description: Controls the number of trials to find a victim segment
|
||||
when conducting SSR and cleaning operations. The default value
|
||||
is 4096 which covers 8GB block address range.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/migration_granularity
|
||||
Date: October 2018
|
||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||
Description:
|
||||
Controls migration granularity of garbage collection on large
|
||||
section, it can let GC move partial segment{s} of one section
|
||||
in one GC cycle, so that dispersing heavy overhead GC to
|
||||
multiple lightweight one.
|
||||
Description: Controls migration granularity of garbage collection on large
|
||||
section, it can let GC move partial segment{s} of one section
|
||||
in one GC cycle, so that dispersing heavy overhead GC to
|
||||
multiple lightweight one.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/dir_level
|
||||
Date: March 2014
|
||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||
Description:
|
||||
Controls the directory level for large directory.
|
||||
Description: Controls the directory level for large directory. If a
|
||||
directory has a number of files, it can reduce the file lookup
|
||||
latency by increasing this dir_level value. Otherwise, it
|
||||
needs to decrease this value to reduce the space overhead.
|
||||
The default value is 0.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/ram_thresh
|
||||
Date: March 2014
|
||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||
Description:
|
||||
Controls the memory footprint used by f2fs.
|
||||
Description: Controls the memory footprint used by free nids and cached
|
||||
nat entries. By default, 1 is set, which indicates
|
||||
10 MB / 1 GB RAM.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/batched_trim_sections
|
||||
Date: February 2015
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description:
|
||||
Controls the trimming rate in batch mode.
|
||||
<deprecated>
|
||||
Description: Controls the trimming rate in batch mode.
|
||||
<deprecated>
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/cp_interval
|
||||
Date: October 2015
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description:
|
||||
Controls the checkpoint timing.
|
||||
Description: Controls the checkpoint timing, set to 60 seconds by default.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/idle_interval
|
||||
Date: January 2016
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description:
|
||||
Controls the idle timing for all paths other than
|
||||
discard and gc path.
|
||||
Description: Controls the idle timing of system, if there is no FS operation
|
||||
during given interval.
|
||||
Set to 5 seconds by default.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/discard_idle_interval
|
||||
Date: September 2018
|
||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||
Contact: "Sahitya Tummala" <stummala@codeaurora.org>
|
||||
Description:
|
||||
Controls the idle timing for discard path.
|
||||
Description: Controls the idle timing of discard thread given
|
||||
this time interval.
|
||||
Default is 5 secs.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/gc_idle_interval
|
||||
Date: September 2018
|
||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||
Contact: "Sahitya Tummala" <stummala@codeaurora.org>
|
||||
Description:
|
||||
Controls the idle timing for gc path.
|
||||
Description: Controls the idle timing for gc path. Set to 5 seconds by default.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/iostat_enable
|
||||
Date: August 2017
|
||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||
Description:
|
||||
Controls to enable/disable IO stat.
|
||||
Description: Controls to enable/disable IO stat.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/ra_nid_pages
|
||||
Date: October 2015
|
||||
Contact: "Chao Yu" <chao2.yu@samsung.com>
|
||||
Description:
|
||||
Controls the count of nid pages to be readaheaded.
|
||||
Description: Controls the count of nid pages to be readaheaded.
|
||||
When building free nids, F2FS reads NAT blocks ahead for
|
||||
speed up. Default is 0.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/dirty_nats_ratio
|
||||
Date: January 2016
|
||||
Contact: "Chao Yu" <chao2.yu@samsung.com>
|
||||
Description:
|
||||
Controls dirty nat entries ratio threshold, if current
|
||||
ratio exceeds configured threshold, checkpoint will
|
||||
be triggered for flushing dirty nat entries.
|
||||
Description: Controls dirty nat entries ratio threshold, if current
|
||||
ratio exceeds configured threshold, checkpoint will
|
||||
be triggered for flushing dirty nat entries.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/lifetime_write_kbytes
|
||||
Date: January 2016
|
||||
Contact: "Shuoran Liu" <liushuoran@huawei.com>
|
||||
Description:
|
||||
Shows total written kbytes issued to disk.
|
||||
Description: Shows total written kbytes issued to disk.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/features
|
||||
Date: July 2017
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description:
|
||||
Shows all enabled features in current device.
|
||||
Description: Shows all enabled features in current device.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/inject_rate
|
||||
Date: May 2016
|
||||
Contact: "Sheng Yong" <shengyong1@huawei.com>
|
||||
Description:
|
||||
Controls the injection rate.
|
||||
Description: Controls the injection rate of arbitrary faults.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/inject_type
|
||||
Date: May 2016
|
||||
Contact: "Sheng Yong" <shengyong1@huawei.com>
|
||||
Description:
|
||||
Controls the injection type.
|
||||
Description: Controls the injection type of arbitrary faults.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/dirty_segments
|
||||
Date: October 2017
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description: Shows the number of dirty segments.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/reserved_blocks
|
||||
Date: June 2017
|
||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||
Description:
|
||||
Controls target reserved blocks in system, the threshold
|
||||
is soft, it could exceed current available user space.
|
||||
Description: Controls target reserved blocks in system, the threshold
|
||||
is soft, it could exceed current available user space.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/current_reserved_blocks
|
||||
Date: October 2017
|
||||
Contact: "Yunlong Song" <yunlong.song@huawei.com>
|
||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||
Description:
|
||||
Shows current reserved blocks in system, it may be temporarily
|
||||
smaller than target_reserved_blocks, but will gradually
|
||||
increase to target_reserved_blocks when more free blocks are
|
||||
freed by user later.
|
||||
Description: Shows current reserved blocks in system, it may be temporarily
|
||||
smaller than target_reserved_blocks, but will gradually
|
||||
increase to target_reserved_blocks when more free blocks are
|
||||
freed by user later.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/gc_urgent
|
||||
Date: August 2017
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description:
|
||||
Do background GC agressively
|
||||
Description: Do background GC agressively when set. When gc_urgent = 1,
|
||||
background thread starts to do GC by given gc_urgent_sleep_time
|
||||
interval. It is set to 0 by default.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/gc_urgent_sleep_time
|
||||
Date: August 2017
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description:
|
||||
Controls sleep time of GC urgent mode
|
||||
Description: Controls sleep time of GC urgent mode. Set to 500ms by default.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/readdir_ra
|
||||
Date: November 2017
|
||||
Contact: "Sheng Yong" <shengyong1@huawei.com>
|
||||
Description:
|
||||
Controls readahead inode block in readdir.
|
||||
Description: Controls readahead inode block in readdir. Enabled by default.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/gc_pin_file_thresh
|
||||
Date: January 2018
|
||||
Contact: Jaegeuk Kim <jaegeuk@kernel.org>
|
||||
Description: This indicates how many GC can be failed for the pinned
|
||||
file. If it exceeds this, F2FS doesn't guarantee its pinning
|
||||
state. 2048 trials is set by default.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/extension_list
|
||||
Date: Feburary 2018
|
||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||
Description:
|
||||
Used to control configure extension list:
|
||||
- Query: cat /sys/fs/f2fs/<disk>/extension_list
|
||||
- Add: echo '[h/c]extension' > /sys/fs/f2fs/<disk>/extension_list
|
||||
- Del: echo '[h/c]!extension' > /sys/fs/f2fs/<disk>/extension_list
|
||||
- [h] means add/del hot file extension
|
||||
- [c] means add/del cold file extension
|
||||
Description: Used to control configure extension list:
|
||||
- Query: cat /sys/fs/f2fs/<disk>/extension_list
|
||||
- Add: echo '[h/c]extension' > /sys/fs/f2fs/<disk>/extension_list
|
||||
- Del: echo '[h/c]!extension' > /sys/fs/f2fs/<disk>/extension_list
|
||||
- [h] means add/del hot file extension
|
||||
- [c] means add/del cold file extension
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/unusable
|
||||
Date April 2019
|
||||
Contact: "Daniel Rosenberg" <drosen@google.com>
|
||||
Description:
|
||||
If checkpoint=disable, it displays the number of blocks that are unusable.
|
||||
If checkpoint=enable it displays the enumber of blocks that would be unusable
|
||||
if checkpoint=disable were to be set.
|
||||
Description: If checkpoint=disable, it displays the number of blocks that
|
||||
are unusable.
|
||||
If checkpoint=enable it displays the enumber of blocks that
|
||||
would be unusable if checkpoint=disable were to be set.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/encoding
|
||||
Date July 2019
|
||||
Contact: "Daniel Rosenberg" <drosen@google.com>
|
||||
Description:
|
||||
Displays name and version of the encoding set for the filesystem.
|
||||
If no encoding is set, displays (none)
|
||||
Description: Displays name and version of the encoding set for the filesystem.
|
||||
If no encoding is set, displays (none)
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/free_segments
|
||||
Date: September 2019
|
||||
Contact: "Hridya Valsaraju" <hridya@google.com>
|
||||
Description: Number of free segments in disk.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/cp_foreground_calls
|
||||
Date: September 2019
|
||||
Contact: "Hridya Valsaraju" <hridya@google.com>
|
||||
Description: Number of checkpoint operations performed on demand. Available when
|
||||
CONFIG_F2FS_STAT_FS=y.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/cp_background_calls
|
||||
Date: September 2019
|
||||
Contact: "Hridya Valsaraju" <hridya@google.com>
|
||||
Description: Number of checkpoint operations performed in the background to
|
||||
free segments. Available when CONFIG_F2FS_STAT_FS=y.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/gc_foreground_calls
|
||||
Date: September 2019
|
||||
Contact: "Hridya Valsaraju" <hridya@google.com>
|
||||
Description: Number of garbage collection operations performed on demand.
|
||||
Available when CONFIG_F2FS_STAT_FS=y.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/gc_background_calls
|
||||
Date: September 2019
|
||||
Contact: "Hridya Valsaraju" <hridya@google.com>
|
||||
Description: Number of garbage collection operations triggered in background.
|
||||
Available when CONFIG_F2FS_STAT_FS=y.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/moved_blocks_foreground
|
||||
Date: September 2019
|
||||
Contact: "Hridya Valsaraju" <hridya@google.com>
|
||||
Description: Number of blocks moved by garbage collection in foreground.
|
||||
Available when CONFIG_F2FS_STAT_FS=y.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/moved_blocks_background
|
||||
Date: September 2019
|
||||
Contact: "Hridya Valsaraju" <hridya@google.com>
|
||||
Description: Number of blocks moved by garbage collection in background.
|
||||
Available when CONFIG_F2FS_STAT_FS=y.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/avg_vblocks
|
||||
Date: September 2019
|
||||
Contact: "Hridya Valsaraju" <hridya@google.com>
|
||||
Description: Average number of valid blocks.
|
||||
Available when CONFIG_F2FS_STAT_FS=y.
|
||||
|
@@ -407,3 +407,16 @@ Contact: Kalesh Singh <kaleshsingh96@gmail.com>
|
||||
Description:
|
||||
The /sys/power/suspend_stats/last_failed_step file contains
|
||||
the last failed step in the suspend/resume path.
|
||||
|
||||
What: /sys/power/sync_on_suspend
|
||||
Date: October 2019
|
||||
Contact: Jonas Meurer <jonas@freesources.org>
|
||||
Description:
|
||||
This file controls whether or not the kernel will sync()
|
||||
filesystems during system suspend (after freezing user space
|
||||
and before suspending devices).
|
||||
|
||||
Writing a "1" to this file enables the sync() and writing a "0"
|
||||
disables it. Reads from the file return the current value.
|
||||
The default is "1" if the build-time "SUSPEND_SKIP_SYNC" config
|
||||
flag is unset, or "0" otherwise.
|
||||
|
46
Documentation/ABI/testing/usb-charger-uevent
Normal file
46
Documentation/ABI/testing/usb-charger-uevent
Normal file
@@ -0,0 +1,46 @@
|
||||
What: Raise a uevent when a USB charger is inserted or removed
|
||||
Date: 2020-01-14
|
||||
KernelVersion: 5.6
|
||||
Contact: linux-usb@vger.kernel.org
|
||||
Description: There are two USB charger states:
|
||||
USB_CHARGER_ABSENT
|
||||
USB_CHARGER_PRESENT
|
||||
There are five USB charger types:
|
||||
USB_CHARGER_UNKNOWN_TYPE: Charger type is unknown
|
||||
USB_CHARGER_SDP_TYPE: Standard Downstream Port
|
||||
USB_CHARGER_CDP_TYPE: Charging Downstream Port
|
||||
USB_CHARGER_DCP_TYPE: Dedicated Charging Port
|
||||
USB_CHARGER_ACA_TYPE: Accessory Charging Adapter
|
||||
https://www.usb.org/document-library/battery-charging-v12-spec-and-adopters-agreement
|
||||
|
||||
Here are two examples taken using udevadm monitor -p when
|
||||
USB charger is online:
|
||||
UDEV change /devices/soc0/usbphynop1 (platform)
|
||||
ACTION=change
|
||||
DEVPATH=/devices/soc0/usbphynop1
|
||||
DRIVER=usb_phy_generic
|
||||
MODALIAS=of:Nusbphynop1T(null)Cusb-nop-xceiv
|
||||
OF_COMPATIBLE_0=usb-nop-xceiv
|
||||
OF_COMPATIBLE_N=1
|
||||
OF_FULLNAME=/usbphynop1
|
||||
OF_NAME=usbphynop1
|
||||
SEQNUM=2493
|
||||
SUBSYSTEM=platform
|
||||
USB_CHARGER_STATE=USB_CHARGER_PRESENT
|
||||
USB_CHARGER_TYPE=USB_CHARGER_SDP_TYPE
|
||||
USEC_INITIALIZED=227422826
|
||||
|
||||
USB charger is offline:
|
||||
KERNEL change /devices/soc0/usbphynop1 (platform)
|
||||
ACTION=change
|
||||
DEVPATH=/devices/soc0/usbphynop1
|
||||
DRIVER=usb_phy_generic
|
||||
MODALIAS=of:Nusbphynop1T(null)Cusb-nop-xceiv
|
||||
OF_COMPATIBLE_0=usb-nop-xceiv
|
||||
OF_COMPATIBLE_N=1
|
||||
OF_FULLNAME=/usbphynop1
|
||||
OF_NAME=usbphynop1
|
||||
SEQNUM=2494
|
||||
SUBSYSTEM=platform
|
||||
USB_CHARGER_STATE=USB_CHARGER_ABSENT
|
||||
USB_CHARGER_TYPE=USB_CHARGER_UNKNOWN_TYPE
|
@@ -283,5 +283,5 @@ or disabled (0). If 0 is found in any of the msi_bus files belonging
|
||||
to bridges between the PCI root and the device, MSIs are disabled.
|
||||
|
||||
It is also worth checking the device driver to see whether it supports MSIs.
|
||||
For example, it may contain calls to pci_irq_alloc_vectors() with the
|
||||
For example, it may contain calls to pci_alloc_irq_vectors() with the
|
||||
PCI_IRQ_MSI or PCI_IRQ_MSIX flags.
|
||||
|
@@ -1,4 +1,7 @@
|
||||
.. _NMI_rcu_doc:
|
||||
|
||||
Using RCU to Protect Dynamic NMI Handlers
|
||||
=========================================
|
||||
|
||||
|
||||
Although RCU is usually used to protect read-mostly data structures,
|
||||
@@ -9,7 +12,7 @@ work in "arch/x86/oprofile/nmi_timer_int.c" and in
|
||||
"arch/x86/kernel/traps.c".
|
||||
|
||||
The relevant pieces of code are listed below, each followed by a
|
||||
brief explanation.
|
||||
brief explanation::
|
||||
|
||||
static int dummy_nmi_callback(struct pt_regs *regs, int cpu)
|
||||
{
|
||||
@@ -18,12 +21,12 @@ brief explanation.
|
||||
|
||||
The dummy_nmi_callback() function is a "dummy" NMI handler that does
|
||||
nothing, but returns zero, thus saying that it did nothing, allowing
|
||||
the NMI handler to take the default machine-specific action.
|
||||
the NMI handler to take the default machine-specific action::
|
||||
|
||||
static nmi_callback_t nmi_callback = dummy_nmi_callback;
|
||||
|
||||
This nmi_callback variable is a global function pointer to the current
|
||||
NMI handler.
|
||||
NMI handler::
|
||||
|
||||
void do_nmi(struct pt_regs * regs, long error_code)
|
||||
{
|
||||
@@ -53,11 +56,12 @@ anyway. However, in practice it is a good documentation aid, particularly
|
||||
for anyone attempting to do something similar on Alpha or on systems
|
||||
with aggressive optimizing compilers.
|
||||
|
||||
Quick Quiz: Why might the rcu_dereference_sched() be necessary on Alpha,
|
||||
given that the code referenced by the pointer is read-only?
|
||||
Quick Quiz:
|
||||
Why might the rcu_dereference_sched() be necessary on Alpha, given that the code referenced by the pointer is read-only?
|
||||
|
||||
:ref:`Answer to Quick Quiz <answer_quick_quiz_NMI>`
|
||||
|
||||
Back to the discussion of NMI and RCU...
|
||||
Back to the discussion of NMI and RCU::
|
||||
|
||||
void set_nmi_callback(nmi_callback_t callback)
|
||||
{
|
||||
@@ -68,7 +72,7 @@ The set_nmi_callback() function registers an NMI handler. Note that any
|
||||
data that is to be used by the callback must be initialized up -before-
|
||||
the call to set_nmi_callback(). On architectures that do not order
|
||||
writes, the rcu_assign_pointer() ensures that the NMI handler sees the
|
||||
initialized values.
|
||||
initialized values::
|
||||
|
||||
void unset_nmi_callback(void)
|
||||
{
|
||||
@@ -82,7 +86,7 @@ up any data structures used by the old NMI handler until execution
|
||||
of it completes on all other CPUs.
|
||||
|
||||
One way to accomplish this is via synchronize_rcu(), perhaps as
|
||||
follows:
|
||||
follows::
|
||||
|
||||
unset_nmi_callback();
|
||||
synchronize_rcu();
|
||||
@@ -98,24 +102,23 @@ to free up the handler's data as soon as synchronize_rcu() returns.
|
||||
Important note: for this to work, the architecture in question must
|
||||
invoke nmi_enter() and nmi_exit() on NMI entry and exit, respectively.
|
||||
|
||||
.. _answer_quick_quiz_NMI:
|
||||
|
||||
Answer to Quick Quiz
|
||||
Answer to Quick Quiz:
|
||||
Why might the rcu_dereference_sched() be necessary on Alpha, given that the code referenced by the pointer is read-only?
|
||||
|
||||
Why might the rcu_dereference_sched() be necessary on Alpha, given
|
||||
that the code referenced by the pointer is read-only?
|
||||
The caller to set_nmi_callback() might well have
|
||||
initialized some data that is to be used by the new NMI
|
||||
handler. In this case, the rcu_dereference_sched() would
|
||||
be needed, because otherwise a CPU that received an NMI
|
||||
just after the new handler was set might see the pointer
|
||||
to the new NMI handler, but the old pre-initialized
|
||||
version of the handler's data.
|
||||
|
||||
Answer: The caller to set_nmi_callback() might well have
|
||||
initialized some data that is to be used by the new NMI
|
||||
handler. In this case, the rcu_dereference_sched() would
|
||||
be needed, because otherwise a CPU that received an NMI
|
||||
just after the new handler was set might see the pointer
|
||||
to the new NMI handler, but the old pre-initialized
|
||||
version of the handler's data.
|
||||
This same sad story can happen on other CPUs when using
|
||||
a compiler with aggressive pointer-value speculation
|
||||
optimizations.
|
||||
|
||||
This same sad story can happen on other CPUs when using
|
||||
a compiler with aggressive pointer-value speculation
|
||||
optimizations.
|
||||
|
||||
More important, the rcu_dereference_sched() makes it
|
||||
clear to someone reading the code that the pointer is
|
||||
being protected by RCU-sched.
|
||||
More important, the rcu_dereference_sched() makes it
|
||||
clear to someone reading the code that the pointer is
|
||||
being protected by RCU-sched.
|
@@ -1,19 +1,21 @@
|
||||
Using RCU to Protect Read-Mostly Arrays
|
||||
.. _array_rcu_doc:
|
||||
|
||||
Using RCU to Protect Read-Mostly Arrays
|
||||
=======================================
|
||||
|
||||
Although RCU is more commonly used to protect linked lists, it can
|
||||
also be used to protect arrays. Three situations are as follows:
|
||||
|
||||
1. Hash Tables
|
||||
1. :ref:`Hash Tables <hash_tables>`
|
||||
|
||||
2. Static Arrays
|
||||
2. :ref:`Static Arrays <static_arrays>`
|
||||
|
||||
3. Resizeable Arrays
|
||||
3. :ref:`Resizable Arrays <resizable_arrays>`
|
||||
|
||||
Each of these three situations involves an RCU-protected pointer to an
|
||||
array that is separately indexed. It might be tempting to consider use
|
||||
of RCU to instead protect the index into an array, however, this use
|
||||
case is -not- supported. The problem with RCU-protected indexes into
|
||||
case is **not** supported. The problem with RCU-protected indexes into
|
||||
arrays is that compilers can play way too many optimization games with
|
||||
integers, which means that the rules governing handling of these indexes
|
||||
are far more trouble than they are worth. If RCU-protected indexes into
|
||||
@@ -24,16 +26,20 @@ to be safely used.
|
||||
That aside, each of the three RCU-protected pointer situations are
|
||||
described in the following sections.
|
||||
|
||||
.. _hash_tables:
|
||||
|
||||
Situation 1: Hash Tables
|
||||
------------------------
|
||||
|
||||
Hash tables are often implemented as an array, where each array entry
|
||||
has a linked-list hash chain. Each hash chain can be protected by RCU
|
||||
as described in the listRCU.txt document. This approach also applies
|
||||
to other array-of-list situations, such as radix trees.
|
||||
|
||||
.. _static_arrays:
|
||||
|
||||
Situation 2: Static Arrays
|
||||
--------------------------
|
||||
|
||||
Static arrays, where the data (rather than a pointer to the data) is
|
||||
located in each array element, and where the array is never resized,
|
||||
@@ -41,13 +47,17 @@ have not been used with RCU. Rik van Riel recommends using seqlock in
|
||||
this situation, which would also have minimal read-side overhead as long
|
||||
as updates are rare.
|
||||
|
||||
Quick Quiz: Why is it so important that updates be rare when
|
||||
using seqlock?
|
||||
Quick Quiz:
|
||||
Why is it so important that updates be rare when using seqlock?
|
||||
|
||||
:ref:`Answer to Quick Quiz <answer_quick_quiz_seqlock>`
|
||||
|
||||
Situation 3: Resizeable Arrays
|
||||
.. _resizable_arrays:
|
||||
|
||||
Use of RCU for resizeable arrays is demonstrated by the grow_ary()
|
||||
Situation 3: Resizable Arrays
|
||||
------------------------------
|
||||
|
||||
Use of RCU for resizable arrays is demonstrated by the grow_ary()
|
||||
function formerly used by the System V IPC code. The array is used
|
||||
to map from semaphore, message-queue, and shared-memory IDs to the data
|
||||
structure that represents the corresponding IPC construct. The grow_ary()
|
||||
@@ -60,7 +70,7 @@ the remainder of the new, updates the ids->entries pointer to point to
|
||||
the new array, and invokes ipc_rcu_putref() to free up the old array.
|
||||
Note that rcu_assign_pointer() is used to update the ids->entries pointer,
|
||||
which includes any memory barriers required on whatever architecture
|
||||
you are running on.
|
||||
you are running on::
|
||||
|
||||
static int grow_ary(struct ipc_ids* ids, int newsize)
|
||||
{
|
||||
@@ -112,7 +122,7 @@ a simple check suffices. The pointer to the structure corresponding
|
||||
to the desired IPC object is placed in "out", with NULL indicating
|
||||
a non-existent entry. After acquiring "out->lock", the "out->deleted"
|
||||
flag indicates whether the IPC object is in the process of being
|
||||
deleted, and, if not, the pointer is returned.
|
||||
deleted, and, if not, the pointer is returned::
|
||||
|
||||
struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id)
|
||||
{
|
||||
@@ -144,8 +154,10 @@ deleted, and, if not, the pointer is returned.
|
||||
return out;
|
||||
}
|
||||
|
||||
.. _answer_quick_quiz_seqlock:
|
||||
|
||||
Answer to Quick Quiz:
|
||||
Why is it so important that updates be rare when using seqlock?
|
||||
|
||||
The reason that it is important that updates be rare when
|
||||
using seqlock is that frequent updates can livelock readers.
|
@@ -7,8 +7,13 @@ RCU concepts
|
||||
.. toctree::
|
||||
:maxdepth: 3
|
||||
|
||||
arrayRCU
|
||||
rcubarrier
|
||||
rcu_dereference
|
||||
whatisRCU
|
||||
rcu
|
||||
listRCU
|
||||
NMI-RCU
|
||||
UP
|
||||
|
||||
Design/Memory-Ordering/Tree-RCU-Memory-Ordering
|
||||
|
@@ -99,7 +99,7 @@ With this change, the rcu_dereference() is always within an RCU
|
||||
read-side critical section, which again would have suppressed the
|
||||
above lockdep-RCU splat.
|
||||
|
||||
But in this particular case, we don't actually deference the pointer
|
||||
But in this particular case, we don't actually dereference the pointer
|
||||
returned from rcu_dereference(). Instead, that pointer is just compared
|
||||
to the cic pointer, which means that the rcu_dereference() can be replaced
|
||||
by rcu_access_pointer() as follows:
|
||||
|
@@ -1,4 +1,7 @@
|
||||
.. _rcu_dereference_doc:
|
||||
|
||||
PROPER CARE AND FEEDING OF RETURN VALUES FROM rcu_dereference()
|
||||
===============================================================
|
||||
|
||||
Most of the time, you can use values from rcu_dereference() or one of
|
||||
the similar primitives without worries. Dereferencing (prefix "*"),
|
||||
@@ -8,7 +11,7 @@ subtraction of constants, and casts all work quite naturally and safely.
|
||||
It is nevertheless possible to get into trouble with other operations.
|
||||
Follow these rules to keep your RCU code working properly:
|
||||
|
||||
o You must use one of the rcu_dereference() family of primitives
|
||||
- You must use one of the rcu_dereference() family of primitives
|
||||
to load an RCU-protected pointer, otherwise CONFIG_PROVE_RCU
|
||||
will complain. Worse yet, your code can see random memory-corruption
|
||||
bugs due to games that compilers and DEC Alpha can play.
|
||||
@@ -25,24 +28,24 @@ o You must use one of the rcu_dereference() family of primitives
|
||||
for an example where the compiler can in fact deduce the exact
|
||||
value of the pointer, and thus cause misordering.
|
||||
|
||||
o You are only permitted to use rcu_dereference on pointer values.
|
||||
- You are only permitted to use rcu_dereference on pointer values.
|
||||
The compiler simply knows too much about integral values to
|
||||
trust it to carry dependencies through integer operations.
|
||||
There are a very few exceptions, namely that you can temporarily
|
||||
cast the pointer to uintptr_t in order to:
|
||||
|
||||
o Set bits and clear bits down in the must-be-zero low-order
|
||||
- Set bits and clear bits down in the must-be-zero low-order
|
||||
bits of that pointer. This clearly means that the pointer
|
||||
must have alignment constraints, for example, this does
|
||||
-not- work in general for char* pointers.
|
||||
|
||||
o XOR bits to translate pointers, as is done in some
|
||||
- XOR bits to translate pointers, as is done in some
|
||||
classic buddy-allocator algorithms.
|
||||
|
||||
It is important to cast the value back to pointer before
|
||||
doing much of anything else with it.
|
||||
|
||||
o Avoid cancellation when using the "+" and "-" infix arithmetic
|
||||
- Avoid cancellation when using the "+" and "-" infix arithmetic
|
||||
operators. For example, for a given variable "x", avoid
|
||||
"(x-(uintptr_t)x)" for char* pointers. The compiler is within its
|
||||
rights to substitute zero for this sort of expression, so that
|
||||
@@ -54,16 +57,16 @@ o Avoid cancellation when using the "+" and "-" infix arithmetic
|
||||
"p+a-b" is safe because its value still necessarily depends on
|
||||
the rcu_dereference(), thus maintaining proper ordering.
|
||||
|
||||
o If you are using RCU to protect JITed functions, so that the
|
||||
- If you are using RCU to protect JITed functions, so that the
|
||||
"()" function-invocation operator is applied to a value obtained
|
||||
(directly or indirectly) from rcu_dereference(), you may need to
|
||||
interact directly with the hardware to flush instruction caches.
|
||||
This issue arises on some systems when a newly JITed function is
|
||||
using the same memory that was used by an earlier JITed function.
|
||||
|
||||
o Do not use the results from relational operators ("==", "!=",
|
||||
- Do not use the results from relational operators ("==", "!=",
|
||||
">", ">=", "<", or "<=") when dereferencing. For example,
|
||||
the following (quite strange) code is buggy:
|
||||
the following (quite strange) code is buggy::
|
||||
|
||||
int *p;
|
||||
int *q;
|
||||
@@ -81,11 +84,11 @@ o Do not use the results from relational operators ("==", "!=",
|
||||
after such branches, but can speculate loads, which can again
|
||||
result in misordering bugs.
|
||||
|
||||
o Be very careful about comparing pointers obtained from
|
||||
- Be very careful about comparing pointers obtained from
|
||||
rcu_dereference() against non-NULL values. As Linus Torvalds
|
||||
explained, if the two pointers are equal, the compiler could
|
||||
substitute the pointer you are comparing against for the pointer
|
||||
obtained from rcu_dereference(). For example:
|
||||
obtained from rcu_dereference(). For example::
|
||||
|
||||
p = rcu_dereference(gp);
|
||||
if (p == &default_struct)
|
||||
@@ -93,7 +96,7 @@ o Be very careful about comparing pointers obtained from
|
||||
|
||||
Because the compiler now knows that the value of "p" is exactly
|
||||
the address of the variable "default_struct", it is free to
|
||||
transform this code into the following:
|
||||
transform this code into the following::
|
||||
|
||||
p = rcu_dereference(gp);
|
||||
if (p == &default_struct)
|
||||
@@ -105,14 +108,14 @@ o Be very careful about comparing pointers obtained from
|
||||
|
||||
However, comparisons are OK in the following cases:
|
||||
|
||||
o The comparison was against the NULL pointer. If the
|
||||
- The comparison was against the NULL pointer. If the
|
||||
compiler knows that the pointer is NULL, you had better
|
||||
not be dereferencing it anyway. If the comparison is
|
||||
non-equal, the compiler is none the wiser. Therefore,
|
||||
it is safe to compare pointers from rcu_dereference()
|
||||
against NULL pointers.
|
||||
|
||||
o The pointer is never dereferenced after being compared.
|
||||
- The pointer is never dereferenced after being compared.
|
||||
Since there are no subsequent dereferences, the compiler
|
||||
cannot use anything it learned from the comparison
|
||||
to reorder the non-existent subsequent dereferences.
|
||||
@@ -124,31 +127,31 @@ o Be very careful about comparing pointers obtained from
|
||||
dereferenced, rcu_access_pointer() should be used in place
|
||||
of rcu_dereference().
|
||||
|
||||
o The comparison is against a pointer that references memory
|
||||
- The comparison is against a pointer that references memory
|
||||
that was initialized "a long time ago." The reason
|
||||
this is safe is that even if misordering occurs, the
|
||||
misordering will not affect the accesses that follow
|
||||
the comparison. So exactly how long ago is "a long
|
||||
time ago"? Here are some possibilities:
|
||||
|
||||
o Compile time.
|
||||
- Compile time.
|
||||
|
||||
o Boot time.
|
||||
- Boot time.
|
||||
|
||||
o Module-init time for module code.
|
||||
- Module-init time for module code.
|
||||
|
||||
o Prior to kthread creation for kthread code.
|
||||
- Prior to kthread creation for kthread code.
|
||||
|
||||
o During some prior acquisition of the lock that
|
||||
- During some prior acquisition of the lock that
|
||||
we now hold.
|
||||
|
||||
o Before mod_timer() time for a timer handler.
|
||||
- Before mod_timer() time for a timer handler.
|
||||
|
||||
There are many other possibilities involving the Linux
|
||||
kernel's wide array of primitives that cause code to
|
||||
be invoked at a later time.
|
||||
|
||||
o The pointer being compared against also came from
|
||||
- The pointer being compared against also came from
|
||||
rcu_dereference(). In this case, both pointers depend
|
||||
on one rcu_dereference() or another, so you get proper
|
||||
ordering either way.
|
||||
@@ -159,13 +162,13 @@ o Be very careful about comparing pointers obtained from
|
||||
of such an RCU usage bug is shown in the section titled
|
||||
"EXAMPLE OF AMPLIFIED RCU-USAGE BUG".
|
||||
|
||||
o All of the accesses following the comparison are stores,
|
||||
- All of the accesses following the comparison are stores,
|
||||
so that a control dependency preserves the needed ordering.
|
||||
That said, it is easy to get control dependencies wrong.
|
||||
Please see the "CONTROL DEPENDENCIES" section of
|
||||
Documentation/memory-barriers.txt for more details.
|
||||
|
||||
o The pointers are not equal -and- the compiler does
|
||||
- The pointers are not equal -and- the compiler does
|
||||
not have enough information to deduce the value of the
|
||||
pointer. Note that the volatile cast in rcu_dereference()
|
||||
will normally prevent the compiler from knowing too much.
|
||||
@@ -175,7 +178,7 @@ o Be very careful about comparing pointers obtained from
|
||||
comparison will provide exactly the information that the
|
||||
compiler needs to deduce the value of the pointer.
|
||||
|
||||
o Disable any value-speculation optimizations that your compiler
|
||||
- Disable any value-speculation optimizations that your compiler
|
||||
might provide, especially if you are making use of feedback-based
|
||||
optimizations that take data collected from prior runs. Such
|
||||
value-speculation optimizations reorder operations by design.
|
||||
@@ -188,11 +191,12 @@ o Disable any value-speculation optimizations that your compiler
|
||||
|
||||
|
||||
EXAMPLE OF AMPLIFIED RCU-USAGE BUG
|
||||
----------------------------------
|
||||
|
||||
Because updaters can run concurrently with RCU readers, RCU readers can
|
||||
see stale and/or inconsistent values. If RCU readers need fresh or
|
||||
consistent values, which they sometimes do, they need to take proper
|
||||
precautions. To see this, consider the following code fragment:
|
||||
precautions. To see this, consider the following code fragment::
|
||||
|
||||
struct foo {
|
||||
int a;
|
||||
@@ -244,7 +248,7 @@ to some reordering from the compiler and CPUs is beside the point.
|
||||
|
||||
But suppose that the reader needs a consistent view?
|
||||
|
||||
Then one approach is to use locking, for example, as follows:
|
||||
Then one approach is to use locking, for example, as follows::
|
||||
|
||||
struct foo {
|
||||
int a;
|
||||
@@ -299,6 +303,7 @@ As always, use the right tool for the job!
|
||||
|
||||
|
||||
EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH
|
||||
-----------------------------------------
|
||||
|
||||
If a pointer obtained from rcu_dereference() compares not-equal to some
|
||||
other pointer, the compiler normally has no clue what the value of the
|
||||
@@ -308,7 +313,7 @@ guarantees that RCU depends on. And the volatile cast in rcu_dereference()
|
||||
should prevent the compiler from guessing the value.
|
||||
|
||||
But without rcu_dereference(), the compiler knows more than you might
|
||||
expect. Consider the following code fragment:
|
||||
expect. Consider the following code fragment::
|
||||
|
||||
struct foo {
|
||||
int a;
|
||||
@@ -354,6 +359,7 @@ dereference the resulting pointer.
|
||||
|
||||
|
||||
WHICH MEMBER OF THE rcu_dereference() FAMILY SHOULD YOU USE?
|
||||
------------------------------------------------------------
|
||||
|
||||
First, please avoid using rcu_dereference_raw() and also please avoid
|
||||
using rcu_dereference_check() and rcu_dereference_protected() with a
|
||||
@@ -370,7 +376,7 @@ member of the rcu_dereference() to use in various situations:
|
||||
|
||||
2. If the access might be within an RCU read-side critical section
|
||||
on the one hand, or protected by (say) my_lock on the other,
|
||||
use rcu_dereference_check(), for example:
|
||||
use rcu_dereference_check(), for example::
|
||||
|
||||
p1 = rcu_dereference_check(p->rcu_protected_pointer,
|
||||
lockdep_is_held(&my_lock));
|
||||
@@ -378,14 +384,14 @@ member of the rcu_dereference() to use in various situations:
|
||||
|
||||
3. If the access might be within an RCU read-side critical section
|
||||
on the one hand, or protected by either my_lock or your_lock on
|
||||
the other, again use rcu_dereference_check(), for example:
|
||||
the other, again use rcu_dereference_check(), for example::
|
||||
|
||||
p1 = rcu_dereference_check(p->rcu_protected_pointer,
|
||||
lockdep_is_held(&my_lock) ||
|
||||
lockdep_is_held(&your_lock));
|
||||
|
||||
4. If the access is on the update side, so that it is always protected
|
||||
by my_lock, use rcu_dereference_protected():
|
||||
by my_lock, use rcu_dereference_protected()::
|
||||
|
||||
p1 = rcu_dereference_protected(p->rcu_protected_pointer,
|
||||
lockdep_is_held(&my_lock));
|
||||
@@ -410,18 +416,19 @@ member of the rcu_dereference() to use in various situations:
|
||||
|
||||
|
||||
SPARSE CHECKING OF RCU-PROTECTED POINTERS
|
||||
-----------------------------------------
|
||||
|
||||
The sparse static-analysis tool checks for direct access to RCU-protected
|
||||
pointers, which can result in "interesting" bugs due to compiler
|
||||
optimizations involving invented loads and perhaps also load tearing.
|
||||
For example, suppose someone mistakenly does something like this:
|
||||
For example, suppose someone mistakenly does something like this::
|
||||
|
||||
p = q->rcu_protected_pointer;
|
||||
do_something_with(p->a);
|
||||
do_something_else_with(p->b);
|
||||
|
||||
If register pressure is high, the compiler might optimize "p" out
|
||||
of existence, transforming the code to something like this:
|
||||
of existence, transforming the code to something like this::
|
||||
|
||||
do_something_with(q->rcu_protected_pointer->a);
|
||||
do_something_else_with(q->rcu_protected_pointer->b);
|
||||
@@ -435,7 +442,7 @@ Load tearing could of course result in dereferencing a mashup of a pair
|
||||
of pointers, which also might fatally disappoint your code.
|
||||
|
||||
These problems could have been avoided simply by making the code instead
|
||||
read as follows:
|
||||
read as follows::
|
||||
|
||||
p = rcu_dereference(q->rcu_protected_pointer);
|
||||
do_something_with(p->a);
|
||||
@@ -448,7 +455,7 @@ or as a formal parameter, with "__rcu", which tells sparse to complain if
|
||||
this pointer is accessed directly. It will also cause sparse to complain
|
||||
if a pointer not marked with "__rcu" is accessed using rcu_dereference()
|
||||
and friends. For example, ->rcu_protected_pointer might be declared as
|
||||
follows:
|
||||
follows::
|
||||
|
||||
struct foo __rcu *rcu_protected_pointer;
|
||||
|
@@ -1,4 +1,7 @@
|
||||
.. _rcu_barrier:
|
||||
|
||||
RCU and Unloadable Modules
|
||||
==========================
|
||||
|
||||
[Originally published in LWN Jan. 14, 2007: http://lwn.net/Articles/217484/]
|
||||
|
||||
@@ -21,7 +24,7 @@ given that readers might well leave absolutely no trace of their
|
||||
presence? There is a synchronize_rcu() primitive that blocks until all
|
||||
pre-existing readers have completed. An updater wishing to delete an
|
||||
element p from a linked list might do the following, while holding an
|
||||
appropriate lock, of course:
|
||||
appropriate lock, of course::
|
||||
|
||||
list_del_rcu(p);
|
||||
synchronize_rcu();
|
||||
@@ -32,13 +35,13 @@ primitive must be used instead. This primitive takes a pointer to an
|
||||
rcu_head struct placed within the RCU-protected data structure and
|
||||
another pointer to a function that may be invoked later to free that
|
||||
structure. Code to delete an element p from the linked list from IRQ
|
||||
context might then be as follows:
|
||||
context might then be as follows::
|
||||
|
||||
list_del_rcu(p);
|
||||
call_rcu(&p->rcu, p_callback);
|
||||
|
||||
Since call_rcu() never blocks, this code can safely be used from within
|
||||
IRQ context. The function p_callback() might be defined as follows:
|
||||
IRQ context. The function p_callback() might be defined as follows::
|
||||
|
||||
static void p_callback(struct rcu_head *rp)
|
||||
{
|
||||
@@ -49,6 +52,7 @@ IRQ context. The function p_callback() might be defined as follows:
|
||||
|
||||
|
||||
Unloading Modules That Use call_rcu()
|
||||
-------------------------------------
|
||||
|
||||
But what if p_callback is defined in an unloadable module?
|
||||
|
||||
@@ -69,10 +73,11 @@ in realtime kernels in order to avoid excessive scheduling latencies.
|
||||
|
||||
|
||||
rcu_barrier()
|
||||
-------------
|
||||
|
||||
We instead need the rcu_barrier() primitive. Rather than waiting for
|
||||
a grace period to elapse, rcu_barrier() waits for all outstanding RCU
|
||||
callbacks to complete. Please note that rcu_barrier() does -not- imply
|
||||
callbacks to complete. Please note that rcu_barrier() does **not** imply
|
||||
synchronize_rcu(), in particular, if there are no RCU callbacks queued
|
||||
anywhere, rcu_barrier() is within its rights to return immediately,
|
||||
without waiting for a grace period to elapse.
|
||||
@@ -88,79 +93,79 @@ must match the flavor of rcu_barrier() with that of call_rcu(). If your
|
||||
module uses multiple flavors of call_rcu(), then it must also use multiple
|
||||
flavors of rcu_barrier() when unloading that module. For example, if
|
||||
it uses call_rcu(), call_srcu() on srcu_struct_1, and call_srcu() on
|
||||
srcu_struct_2(), then the following three lines of code will be required
|
||||
when unloading:
|
||||
srcu_struct_2, then the following three lines of code will be required
|
||||
when unloading::
|
||||
|
||||
1 rcu_barrier();
|
||||
2 srcu_barrier(&srcu_struct_1);
|
||||
3 srcu_barrier(&srcu_struct_2);
|
||||
|
||||
The rcutorture module makes use of rcu_barrier() in its exit function
|
||||
as follows:
|
||||
as follows::
|
||||
|
||||
1 static void
|
||||
2 rcu_torture_cleanup(void)
|
||||
3 {
|
||||
4 int i;
|
||||
1 static void
|
||||
2 rcu_torture_cleanup(void)
|
||||
3 {
|
||||
4 int i;
|
||||
5
|
||||
6 fullstop = 1;
|
||||
7 if (shuffler_task != NULL) {
|
||||
6 fullstop = 1;
|
||||
7 if (shuffler_task != NULL) {
|
||||
8 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
|
||||
9 kthread_stop(shuffler_task);
|
||||
10 }
|
||||
11 shuffler_task = NULL;
|
||||
12
|
||||
13 if (writer_task != NULL) {
|
||||
14 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
|
||||
15 kthread_stop(writer_task);
|
||||
16 }
|
||||
17 writer_task = NULL;
|
||||
18
|
||||
19 if (reader_tasks != NULL) {
|
||||
20 for (i = 0; i < nrealreaders; i++) {
|
||||
21 if (reader_tasks[i] != NULL) {
|
||||
22 VERBOSE_PRINTK_STRING(
|
||||
23 "Stopping rcu_torture_reader task");
|
||||
24 kthread_stop(reader_tasks[i]);
|
||||
25 }
|
||||
26 reader_tasks[i] = NULL;
|
||||
27 }
|
||||
28 kfree(reader_tasks);
|
||||
29 reader_tasks = NULL;
|
||||
30 }
|
||||
31 rcu_torture_current = NULL;
|
||||
32
|
||||
33 if (fakewriter_tasks != NULL) {
|
||||
34 for (i = 0; i < nfakewriters; i++) {
|
||||
35 if (fakewriter_tasks[i] != NULL) {
|
||||
36 VERBOSE_PRINTK_STRING(
|
||||
37 "Stopping rcu_torture_fakewriter task");
|
||||
38 kthread_stop(fakewriter_tasks[i]);
|
||||
39 }
|
||||
40 fakewriter_tasks[i] = NULL;
|
||||
41 }
|
||||
42 kfree(fakewriter_tasks);
|
||||
43 fakewriter_tasks = NULL;
|
||||
44 }
|
||||
45
|
||||
46 if (stats_task != NULL) {
|
||||
47 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
|
||||
48 kthread_stop(stats_task);
|
||||
49 }
|
||||
50 stats_task = NULL;
|
||||
51
|
||||
52 /* Wait for all RCU callbacks to fire. */
|
||||
53 rcu_barrier();
|
||||
54
|
||||
55 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
|
||||
56
|
||||
57 if (cur_ops->cleanup != NULL)
|
||||
58 cur_ops->cleanup();
|
||||
59 if (atomic_read(&n_rcu_torture_error))
|
||||
60 rcu_torture_print_module_parms("End of test: FAILURE");
|
||||
61 else
|
||||
62 rcu_torture_print_module_parms("End of test: SUCCESS");
|
||||
63 }
|
||||
10 }
|
||||
11 shuffler_task = NULL;
|
||||
12
|
||||
13 if (writer_task != NULL) {
|
||||
14 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
|
||||
15 kthread_stop(writer_task);
|
||||
16 }
|
||||
17 writer_task = NULL;
|
||||
18
|
||||
19 if (reader_tasks != NULL) {
|
||||
20 for (i = 0; i < nrealreaders; i++) {
|
||||
21 if (reader_tasks[i] != NULL) {
|
||||
22 VERBOSE_PRINTK_STRING(
|
||||
23 "Stopping rcu_torture_reader task");
|
||||
24 kthread_stop(reader_tasks[i]);
|
||||
25 }
|
||||
26 reader_tasks[i] = NULL;
|
||||
27 }
|
||||
28 kfree(reader_tasks);
|
||||
29 reader_tasks = NULL;
|
||||
30 }
|
||||
31 rcu_torture_current = NULL;
|
||||
32
|
||||
33 if (fakewriter_tasks != NULL) {
|
||||
34 for (i = 0; i < nfakewriters; i++) {
|
||||
35 if (fakewriter_tasks[i] != NULL) {
|
||||
36 VERBOSE_PRINTK_STRING(
|
||||
37 "Stopping rcu_torture_fakewriter task");
|
||||
38 kthread_stop(fakewriter_tasks[i]);
|
||||
39 }
|
||||
40 fakewriter_tasks[i] = NULL;
|
||||
41 }
|
||||
42 kfree(fakewriter_tasks);
|
||||
43 fakewriter_tasks = NULL;
|
||||
44 }
|
||||
45
|
||||
46 if (stats_task != NULL) {
|
||||
47 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
|
||||
48 kthread_stop(stats_task);
|
||||
49 }
|
||||
50 stats_task = NULL;
|
||||
51
|
||||
52 /* Wait for all RCU callbacks to fire. */
|
||||
53 rcu_barrier();
|
||||
54
|
||||
55 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
|
||||
56
|
||||
57 if (cur_ops->cleanup != NULL)
|
||||
58 cur_ops->cleanup();
|
||||
59 if (atomic_read(&n_rcu_torture_error))
|
||||
60 rcu_torture_print_module_parms("End of test: FAILURE");
|
||||
61 else
|
||||
62 rcu_torture_print_module_parms("End of test: SUCCESS");
|
||||
63 }
|
||||
|
||||
Line 6 sets a global variable that prevents any RCU callbacks from
|
||||
re-posting themselves. This will not be necessary in most cases, since
|
||||
@@ -176,9 +181,14 @@ for any pre-existing callbacks to complete.
|
||||
Then lines 55-62 print status and do operation-specific cleanup, and
|
||||
then return, permitting the module-unload operation to be completed.
|
||||
|
||||
Quick Quiz #1: Is there any other situation where rcu_barrier() might
|
||||
.. _rcubarrier_quiz_1:
|
||||
|
||||
Quick Quiz #1:
|
||||
Is there any other situation where rcu_barrier() might
|
||||
be required?
|
||||
|
||||
:ref:`Answer to Quick Quiz #1 <answer_rcubarrier_quiz_1>`
|
||||
|
||||
Your module might have additional complications. For example, if your
|
||||
module invokes call_rcu() from timers, you will need to first cancel all
|
||||
the timers, and only then invoke rcu_barrier() to wait for any remaining
|
||||
@@ -188,11 +198,12 @@ Of course, if you module uses call_rcu(), you will need to invoke
|
||||
rcu_barrier() before unloading. Similarly, if your module uses
|
||||
call_srcu(), you will need to invoke srcu_barrier() before unloading,
|
||||
and on the same srcu_struct structure. If your module uses call_rcu()
|
||||
-and- call_srcu(), then you will need to invoke rcu_barrier() -and-
|
||||
**and** call_srcu(), then you will need to invoke rcu_barrier() **and**
|
||||
srcu_barrier().
|
||||
|
||||
|
||||
Implementing rcu_barrier()
|
||||
--------------------------
|
||||
|
||||
Dipankar Sarma's implementation of rcu_barrier() makes use of the fact
|
||||
that RCU callbacks are never reordered once queued on one of the per-CPU
|
||||
@@ -200,19 +211,19 @@ queues. His implementation queues an RCU callback on each of the per-CPU
|
||||
callback queues, and then waits until they have all started executing, at
|
||||
which point, all earlier RCU callbacks are guaranteed to have completed.
|
||||
|
||||
The original code for rcu_barrier() was as follows:
|
||||
The original code for rcu_barrier() was as follows::
|
||||
|
||||
1 void rcu_barrier(void)
|
||||
2 {
|
||||
3 BUG_ON(in_interrupt());
|
||||
4 /* Take cpucontrol mutex to protect against CPU hotplug */
|
||||
5 mutex_lock(&rcu_barrier_mutex);
|
||||
6 init_completion(&rcu_barrier_completion);
|
||||
7 atomic_set(&rcu_barrier_cpu_count, 0);
|
||||
8 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
|
||||
9 wait_for_completion(&rcu_barrier_completion);
|
||||
10 mutex_unlock(&rcu_barrier_mutex);
|
||||
11 }
|
||||
1 void rcu_barrier(void)
|
||||
2 {
|
||||
3 BUG_ON(in_interrupt());
|
||||
4 /* Take cpucontrol mutex to protect against CPU hotplug */
|
||||
5 mutex_lock(&rcu_barrier_mutex);
|
||||
6 init_completion(&rcu_barrier_completion);
|
||||
7 atomic_set(&rcu_barrier_cpu_count, 0);
|
||||
8 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
|
||||
9 wait_for_completion(&rcu_barrier_completion);
|
||||
10 mutex_unlock(&rcu_barrier_mutex);
|
||||
11 }
|
||||
|
||||
Line 3 verifies that the caller is in process context, and lines 5 and 10
|
||||
use rcu_barrier_mutex to ensure that only one rcu_barrier() is using the
|
||||
@@ -226,18 +237,18 @@ This code was rewritten in 2008 and several times thereafter, but this
|
||||
still gives the general idea.
|
||||
|
||||
The rcu_barrier_func() runs on each CPU, where it invokes call_rcu()
|
||||
to post an RCU callback, as follows:
|
||||
to post an RCU callback, as follows::
|
||||
|
||||
1 static void rcu_barrier_func(void *notused)
|
||||
2 {
|
||||
3 int cpu = smp_processor_id();
|
||||
4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
|
||||
5 struct rcu_head *head;
|
||||
1 static void rcu_barrier_func(void *notused)
|
||||
2 {
|
||||
3 int cpu = smp_processor_id();
|
||||
4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
|
||||
5 struct rcu_head *head;
|
||||
6
|
||||
7 head = &rdp->barrier;
|
||||
8 atomic_inc(&rcu_barrier_cpu_count);
|
||||
9 call_rcu(head, rcu_barrier_callback);
|
||||
10 }
|
||||
7 head = &rdp->barrier;
|
||||
8 atomic_inc(&rcu_barrier_cpu_count);
|
||||
9 call_rcu(head, rcu_barrier_callback);
|
||||
10 }
|
||||
|
||||
Lines 3 and 4 locate RCU's internal per-CPU rcu_data structure,
|
||||
which contains the struct rcu_head that needed for the later call to
|
||||
@@ -248,20 +259,25 @@ the current CPU's queue.
|
||||
|
||||
The rcu_barrier_callback() function simply atomically decrements the
|
||||
rcu_barrier_cpu_count variable and finalizes the completion when it
|
||||
reaches zero, as follows:
|
||||
reaches zero, as follows::
|
||||
|
||||
1 static void rcu_barrier_callback(struct rcu_head *notused)
|
||||
2 {
|
||||
3 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
|
||||
4 complete(&rcu_barrier_completion);
|
||||
3 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
|
||||
4 complete(&rcu_barrier_completion);
|
||||
5 }
|
||||
|
||||
Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes
|
||||
.. _rcubarrier_quiz_2:
|
||||
|
||||
Quick Quiz #2:
|
||||
What happens if CPU 0's rcu_barrier_func() executes
|
||||
immediately (thus incrementing rcu_barrier_cpu_count to the
|
||||
value one), but the other CPU's rcu_barrier_func() invocations
|
||||
are delayed for a full grace period? Couldn't this result in
|
||||
rcu_barrier() returning prematurely?
|
||||
|
||||
:ref:`Answer to Quick Quiz #2 <answer_rcubarrier_quiz_2>`
|
||||
|
||||
The current rcu_barrier() implementation is more complex, due to the need
|
||||
to avoid disturbing idle CPUs (especially on battery-powered systems)
|
||||
and the need to minimally disturb non-idle CPUs in real-time systems.
|
||||
@@ -269,6 +285,7 @@ However, the code above illustrates the concepts.
|
||||
|
||||
|
||||
rcu_barrier() Summary
|
||||
---------------------
|
||||
|
||||
The rcu_barrier() primitive has seen relatively little use, since most
|
||||
code using RCU is in the core kernel rather than in modules. However, if
|
||||
@@ -277,8 +294,12 @@ so that your module may be safely unloaded.
|
||||
|
||||
|
||||
Answers to Quick Quizzes
|
||||
------------------------
|
||||
|
||||
Quick Quiz #1: Is there any other situation where rcu_barrier() might
|
||||
.. _answer_rcubarrier_quiz_1:
|
||||
|
||||
Quick Quiz #1:
|
||||
Is there any other situation where rcu_barrier() might
|
||||
be required?
|
||||
|
||||
Answer: Interestingly enough, rcu_barrier() was not originally
|
||||
@@ -292,7 +313,12 @@ Answer: Interestingly enough, rcu_barrier() was not originally
|
||||
implementing rcutorture, and found that rcu_barrier() solves
|
||||
this problem as well.
|
||||
|
||||
Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes
|
||||
:ref:`Back to Quick Quiz #1 <rcubarrier_quiz_1>`
|
||||
|
||||
.. _answer_rcubarrier_quiz_2:
|
||||
|
||||
Quick Quiz #2:
|
||||
What happens if CPU 0's rcu_barrier_func() executes
|
||||
immediately (thus incrementing rcu_barrier_cpu_count to the
|
||||
value one), but the other CPU's rcu_barrier_func() invocations
|
||||
are delayed for a full grace period? Couldn't this result in
|
||||
@@ -323,3 +349,5 @@ Answer: This cannot happen. The reason is that on_each_cpu() has its last
|
||||
is to add an rcu_read_lock() before line 8 of rcu_barrier()
|
||||
and an rcu_read_unlock() after line 8 of this same function. If
|
||||
you can think of a better change, please let me know!
|
||||
|
||||
:ref:`Back to Quick Quiz #2 <rcubarrier_quiz_2>`
|
@@ -225,18 +225,13 @@ an estimate of the total number of RCU callbacks queued across all CPUs
|
||||
In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed
|
||||
for each CPU:
|
||||
|
||||
0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 Nonlazy posted: ..D
|
||||
0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 dyntick_enabled: 1
|
||||
|
||||
The "last_accelerate:" prints the low-order 16 bits (in hex) of the
|
||||
jiffies counter when this CPU last invoked rcu_try_advance_all_cbs()
|
||||
from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from
|
||||
rcu_prepare_for_idle(). The "Nonlazy posted:" indicates lazy-callback
|
||||
status, so that an "l" indicates that all callbacks were lazy at the start
|
||||
of the last idle period and an "L" indicates that there are currently
|
||||
no non-lazy callbacks (in both cases, "." is printed otherwise, as
|
||||
shown above) and "D" indicates that dyntick-idle processing is enabled
|
||||
("." is printed otherwise, for example, if disabled via the "nohz="
|
||||
kernel boot parameter).
|
||||
rcu_prepare_for_idle(). "dyntick_enabled: 1" indicates that dyntick-idle
|
||||
processing is enabled.
|
||||
|
||||
If the grace period ends just as the stall warning starts printing,
|
||||
there will be a spurious stall-warning message, which will include
|
||||
|
@@ -1,15 +1,18 @@
|
||||
.. _whatisrcu_doc:
|
||||
|
||||
What is RCU? -- "Read, Copy, Update"
|
||||
======================================
|
||||
|
||||
Please note that the "What is RCU?" LWN series is an excellent place
|
||||
to start learning about RCU:
|
||||
|
||||
1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/
|
||||
2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/
|
||||
3. RCU part 3: the RCU API http://lwn.net/Articles/264090/
|
||||
4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/
|
||||
2010 Big API Table http://lwn.net/Articles/419086/
|
||||
5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/
|
||||
2014 Big API Table http://lwn.net/Articles/609973/
|
||||
| 1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/
|
||||
| 2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/
|
||||
| 3. RCU part 3: the RCU API http://lwn.net/Articles/264090/
|
||||
| 4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/
|
||||
| 2010 Big API Table http://lwn.net/Articles/419086/
|
||||
| 5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/
|
||||
| 2014 Big API Table http://lwn.net/Articles/609973/
|
||||
|
||||
|
||||
What is RCU?
|
||||
@@ -24,14 +27,21 @@ the experience has been that different people must take different paths
|
||||
to arrive at an understanding of RCU. This document provides several
|
||||
different paths, as follows:
|
||||
|
||||
1. RCU OVERVIEW
|
||||
2. WHAT IS RCU'S CORE API?
|
||||
3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API?
|
||||
4. WHAT IF MY UPDATING THREAD CANNOT BLOCK?
|
||||
5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU?
|
||||
6. ANALOGY WITH READER-WRITER LOCKING
|
||||
7. FULL LIST OF RCU APIs
|
||||
8. ANSWERS TO QUICK QUIZZES
|
||||
:ref:`1. RCU OVERVIEW <1_whatisRCU>`
|
||||
|
||||
:ref:`2. WHAT IS RCU'S CORE API? <2_whatisRCU>`
|
||||
|
||||
:ref:`3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API? <3_whatisRCU>`
|
||||
|
||||
:ref:`4. WHAT IF MY UPDATING THREAD CANNOT BLOCK? <4_whatisRCU>`
|
||||
|
||||
:ref:`5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU? <5_whatisRCU>`
|
||||
|
||||
:ref:`6. ANALOGY WITH READER-WRITER LOCKING <6_whatisRCU>`
|
||||
|
||||
:ref:`7. FULL LIST OF RCU APIs <7_whatisRCU>`
|
||||
|
||||
:ref:`8. ANSWERS TO QUICK QUIZZES <8_whatisRCU>`
|
||||
|
||||
People who prefer starting with a conceptual overview should focus on
|
||||
Section 1, though most readers will profit by reading this section at
|
||||
@@ -49,8 +59,10 @@ everything, feel free to read the whole thing -- but if you are really
|
||||
that type of person, you have perused the source code and will therefore
|
||||
never need this document anyway. ;-)
|
||||
|
||||
.. _1_whatisRCU:
|
||||
|
||||
1. RCU OVERVIEW
|
||||
----------------
|
||||
|
||||
The basic idea behind RCU is to split updates into "removal" and
|
||||
"reclamation" phases. The removal phase removes references to data items
|
||||
@@ -116,8 +128,10 @@ So how the heck can a reclaimer tell when a reader is done, given
|
||||
that readers are not doing any sort of synchronization operations???
|
||||
Read on to learn about how RCU's API makes this easy.
|
||||
|
||||
.. _2_whatisRCU:
|
||||
|
||||
2. WHAT IS RCU'S CORE API?
|
||||
---------------------------
|
||||
|
||||
The core RCU API is quite small:
|
||||
|
||||
@@ -136,7 +150,7 @@ later. See the kernel docbook documentation for more info, or look directly
|
||||
at the function header comments.
|
||||
|
||||
rcu_read_lock()
|
||||
|
||||
^^^^^^^^^^^^^^^
|
||||
void rcu_read_lock(void);
|
||||
|
||||
Used by a reader to inform the reclaimer that the reader is
|
||||
@@ -150,7 +164,7 @@ rcu_read_lock()
|
||||
longer-term references to data structures.
|
||||
|
||||
rcu_read_unlock()
|
||||
|
||||
^^^^^^^^^^^^^^^^^
|
||||
void rcu_read_unlock(void);
|
||||
|
||||
Used by a reader to inform the reclaimer that the reader is
|
||||
@@ -158,15 +172,15 @@ rcu_read_unlock()
|
||||
read-side critical sections may be nested and/or overlapping.
|
||||
|
||||
synchronize_rcu()
|
||||
|
||||
^^^^^^^^^^^^^^^^^
|
||||
void synchronize_rcu(void);
|
||||
|
||||
Marks the end of updater code and the beginning of reclaimer
|
||||
code. It does this by blocking until all pre-existing RCU
|
||||
read-side critical sections on all CPUs have completed.
|
||||
Note that synchronize_rcu() will -not- necessarily wait for
|
||||
Note that synchronize_rcu() will **not** necessarily wait for
|
||||
any subsequent RCU read-side critical sections to complete.
|
||||
For example, consider the following sequence of events:
|
||||
For example, consider the following sequence of events::
|
||||
|
||||
CPU 0 CPU 1 CPU 2
|
||||
----------------- ------------------------- ---------------
|
||||
@@ -182,7 +196,7 @@ synchronize_rcu()
|
||||
any that begin after synchronize_rcu() is invoked.
|
||||
|
||||
Of course, synchronize_rcu() does not necessarily return
|
||||
-immediately- after the last pre-existing RCU read-side critical
|
||||
**immediately** after the last pre-existing RCU read-side critical
|
||||
section completes. For one thing, there might well be scheduling
|
||||
delays. For another thing, many RCU implementations process
|
||||
requests in batches in order to improve efficiencies, which can
|
||||
@@ -211,10 +225,10 @@ synchronize_rcu()
|
||||
checklist.txt for some approaches to limiting the update rate.
|
||||
|
||||
rcu_assign_pointer()
|
||||
|
||||
^^^^^^^^^^^^^^^^^^^^
|
||||
void rcu_assign_pointer(p, typeof(p) v);
|
||||
|
||||
Yes, rcu_assign_pointer() -is- implemented as a macro, though it
|
||||
Yes, rcu_assign_pointer() **is** implemented as a macro, though it
|
||||
would be cool to be able to declare a function in this manner.
|
||||
(Compiler experts will no doubt disagree.)
|
||||
|
||||
@@ -231,7 +245,7 @@ rcu_assign_pointer()
|
||||
the _rcu list-manipulation primitives such as list_add_rcu().
|
||||
|
||||
rcu_dereference()
|
||||
|
||||
^^^^^^^^^^^^^^^^^
|
||||
typeof(p) rcu_dereference(p);
|
||||
|
||||
Like rcu_assign_pointer(), rcu_dereference() must be implemented
|
||||
@@ -248,13 +262,13 @@ rcu_dereference()
|
||||
|
||||
Common coding practice uses rcu_dereference() to copy an
|
||||
RCU-protected pointer to a local variable, then dereferences
|
||||
this local variable, for example as follows:
|
||||
this local variable, for example as follows::
|
||||
|
||||
p = rcu_dereference(head.next);
|
||||
return p->data;
|
||||
|
||||
However, in this case, one could just as easily combine these
|
||||
into one statement:
|
||||
into one statement::
|
||||
|
||||
return rcu_dereference(head.next)->data;
|
||||
|
||||
@@ -266,8 +280,8 @@ rcu_dereference()
|
||||
unnecessary overhead on Alpha CPUs.
|
||||
|
||||
Note that the value returned by rcu_dereference() is valid
|
||||
only within the enclosing RCU read-side critical section [1].
|
||||
For example, the following is -not- legal:
|
||||
only within the enclosing RCU read-side critical section [1]_.
|
||||
For example, the following is **not** legal::
|
||||
|
||||
rcu_read_lock();
|
||||
p = rcu_dereference(head.next);
|
||||
@@ -290,9 +304,9 @@ rcu_dereference()
|
||||
at any time, including immediately after the rcu_dereference().
|
||||
And, again like rcu_assign_pointer(), rcu_dereference() is
|
||||
typically used indirectly, via the _rcu list-manipulation
|
||||
primitives, such as list_for_each_entry_rcu() [2].
|
||||
primitives, such as list_for_each_entry_rcu() [2]_.
|
||||
|
||||
[1] The variant rcu_dereference_protected() can be used outside
|
||||
.. [1] The variant rcu_dereference_protected() can be used outside
|
||||
of an RCU read-side critical section as long as the usage is
|
||||
protected by locks acquired by the update-side code. This variant
|
||||
avoids the lockdep warning that would happen when using (for
|
||||
@@ -305,7 +319,7 @@ rcu_dereference()
|
||||
a lockdep splat is emitted. See Documentation/RCU/Design/Requirements/Requirements.rst
|
||||
and the API's code comments for more details and example usage.
|
||||
|
||||
[2] If the list_for_each_entry_rcu() instance might be used by
|
||||
.. [2] If the list_for_each_entry_rcu() instance might be used by
|
||||
update-side code as well as by RCU readers, then an additional
|
||||
lockdep expression can be added to its list of arguments.
|
||||
For example, given an additional "lock_is_held(&mylock)" argument,
|
||||
@@ -315,6 +329,7 @@ rcu_dereference()
|
||||
|
||||
The following diagram shows how each API communicates among the
|
||||
reader, updater, and reclaimer.
|
||||
::
|
||||
|
||||
|
||||
rcu_assign_pointer()
|
||||
@@ -375,12 +390,16 @@ c. RCU applied to scheduler and interrupt/NMI-handler tasks.
|
||||
Again, most uses will be of (a). The (b) and (c) cases are important
|
||||
for specialized uses, but are relatively uncommon.
|
||||
|
||||
.. _3_whatisRCU:
|
||||
|
||||
3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API?
|
||||
-----------------------------------------------
|
||||
|
||||
This section shows a simple use of the core RCU API to protect a
|
||||
global pointer to a dynamically allocated structure. More-typical
|
||||
uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt.
|
||||
uses of RCU may be found in :ref:`listRCU.rst <list_rcu_doc>`,
|
||||
:ref:`arrayRCU.rst <array_rcu_doc>`, and :ref:`NMI-RCU.rst <NMI_rcu_doc>`.
|
||||
::
|
||||
|
||||
struct foo {
|
||||
int a;
|
||||
@@ -440,40 +459,43 @@ uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt.
|
||||
|
||||
So, to sum up:
|
||||
|
||||
o Use rcu_read_lock() and rcu_read_unlock() to guard RCU
|
||||
- Use rcu_read_lock() and rcu_read_unlock() to guard RCU
|
||||
read-side critical sections.
|
||||
|
||||
o Within an RCU read-side critical section, use rcu_dereference()
|
||||
- Within an RCU read-side critical section, use rcu_dereference()
|
||||
to dereference RCU-protected pointers.
|
||||
|
||||
o Use some solid scheme (such as locks or semaphores) to
|
||||
- Use some solid scheme (such as locks or semaphores) to
|
||||
keep concurrent updates from interfering with each other.
|
||||
|
||||
o Use rcu_assign_pointer() to update an RCU-protected pointer.
|
||||
- Use rcu_assign_pointer() to update an RCU-protected pointer.
|
||||
This primitive protects concurrent readers from the updater,
|
||||
-not- concurrent updates from each other! You therefore still
|
||||
**not** concurrent updates from each other! You therefore still
|
||||
need to use locking (or something similar) to keep concurrent
|
||||
rcu_assign_pointer() primitives from interfering with each other.
|
||||
|
||||
o Use synchronize_rcu() -after- removing a data element from an
|
||||
RCU-protected data structure, but -before- reclaiming/freeing
|
||||
- Use synchronize_rcu() **after** removing a data element from an
|
||||
RCU-protected data structure, but **before** reclaiming/freeing
|
||||
the data element, in order to wait for the completion of all
|
||||
RCU read-side critical sections that might be referencing that
|
||||
data item.
|
||||
|
||||
See checklist.txt for additional rules to follow when using RCU.
|
||||
And again, more-typical uses of RCU may be found in listRCU.txt,
|
||||
arrayRCU.txt, and NMI-RCU.txt.
|
||||
And again, more-typical uses of RCU may be found in :ref:`listRCU.rst
|
||||
<list_rcu_doc>`, :ref:`arrayRCU.rst <array_rcu_doc>`, and :ref:`NMI-RCU.rst
|
||||
<NMI_rcu_doc>`.
|
||||
|
||||
.. _4_whatisRCU:
|
||||
|
||||
4. WHAT IF MY UPDATING THREAD CANNOT BLOCK?
|
||||
--------------------------------------------
|
||||
|
||||
In the example above, foo_update_a() blocks until a grace period elapses.
|
||||
This is quite simple, but in some cases one cannot afford to wait so
|
||||
long -- there might be other high-priority work to be done.
|
||||
|
||||
In such cases, one uses call_rcu() rather than synchronize_rcu().
|
||||
The call_rcu() API is as follows:
|
||||
The call_rcu() API is as follows::
|
||||
|
||||
void call_rcu(struct rcu_head * head,
|
||||
void (*func)(struct rcu_head *head));
|
||||
@@ -481,7 +503,7 @@ The call_rcu() API is as follows:
|
||||
This function invokes func(head) after a grace period has elapsed.
|
||||
This invocation might happen from either softirq or process context,
|
||||
so the function is not permitted to block. The foo struct needs to
|
||||
have an rcu_head structure added, perhaps as follows:
|
||||
have an rcu_head structure added, perhaps as follows::
|
||||
|
||||
struct foo {
|
||||
int a;
|
||||
@@ -490,7 +512,7 @@ have an rcu_head structure added, perhaps as follows:
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
The foo_update_a() function might then be written as follows:
|
||||
The foo_update_a() function might then be written as follows::
|
||||
|
||||
/*
|
||||
* Create a new struct foo that is the same as the one currently
|
||||
@@ -520,7 +542,7 @@ The foo_update_a() function might then be written as follows:
|
||||
call_rcu(&old_fp->rcu, foo_reclaim);
|
||||
}
|
||||
|
||||
The foo_reclaim() function might appear as follows:
|
||||
The foo_reclaim() function might appear as follows::
|
||||
|
||||
void foo_reclaim(struct rcu_head *rp)
|
||||
{
|
||||
@@ -544,7 +566,7 @@ namely foo_reclaim().
|
||||
The summary of advice is the same as for the previous section, except
|
||||
that we are now using call_rcu() rather than synchronize_rcu():
|
||||
|
||||
o Use call_rcu() -after- removing a data element from an
|
||||
- Use call_rcu() **after** removing a data element from an
|
||||
RCU-protected data structure in order to register a callback
|
||||
function that will be invoked after the completion of all RCU
|
||||
read-side critical sections that might be referencing that
|
||||
@@ -552,14 +574,16 @@ o Use call_rcu() -after- removing a data element from an
|
||||
|
||||
If the callback for call_rcu() is not doing anything more than calling
|
||||
kfree() on the structure, you can use kfree_rcu() instead of call_rcu()
|
||||
to avoid having to write your own callback:
|
||||
to avoid having to write your own callback::
|
||||
|
||||
kfree_rcu(old_fp, rcu);
|
||||
|
||||
Again, see checklist.txt for additional rules governing the use of RCU.
|
||||
|
||||
.. _5_whatisRCU:
|
||||
|
||||
5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU?
|
||||
------------------------------------------------
|
||||
|
||||
One of the nice things about RCU is that it has extremely simple "toy"
|
||||
implementations that are a good first step towards understanding the
|
||||
@@ -579,7 +603,7 @@ more details on the current implementation as of early 2004.
|
||||
|
||||
|
||||
5A. "TOY" IMPLEMENTATION #1: LOCKING
|
||||
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
This section presents a "toy" RCU implementation that is based on
|
||||
familiar locking primitives. Its overhead makes it a non-starter for
|
||||
real-life use, as does its lack of scalability. It is also unsuitable
|
||||
@@ -591,7 +615,7 @@ you allow nested rcu_read_lock() calls, you can deadlock.
|
||||
However, it is probably the easiest implementation to relate to, so is
|
||||
a good starting point.
|
||||
|
||||
It is extremely simple:
|
||||
It is extremely simple::
|
||||
|
||||
static DEFINE_RWLOCK(rcu_gp_mutex);
|
||||
|
||||
@@ -614,7 +638,7 @@ It is extremely simple:
|
||||
|
||||
[You can ignore rcu_assign_pointer() and rcu_dereference() without missing
|
||||
much. But here are simplified versions anyway. And whatever you do,
|
||||
don't forget about them when submitting patches making use of RCU!]
|
||||
don't forget about them when submitting patches making use of RCU!]::
|
||||
|
||||
#define rcu_assign_pointer(p, v) \
|
||||
({ \
|
||||
@@ -647,18 +671,23 @@ that the only thing that can block rcu_read_lock() is a synchronize_rcu().
|
||||
But synchronize_rcu() does not acquire any locks while holding rcu_gp_mutex,
|
||||
so there can be no deadlock cycle.
|
||||
|
||||
Quick Quiz #1: Why is this argument naive? How could a deadlock
|
||||
.. _quiz_1:
|
||||
|
||||
Quick Quiz #1:
|
||||
Why is this argument naive? How could a deadlock
|
||||
occur when using this algorithm in a real-world Linux
|
||||
kernel? How could this deadlock be avoided?
|
||||
|
||||
:ref:`Answers to Quick Quiz <8_whatisRCU>`
|
||||
|
||||
5B. "TOY" EXAMPLE #2: CLASSIC RCU
|
||||
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
This section presents a "toy" RCU implementation that is based on
|
||||
"classic RCU". It is also short on performance (but only for updates) and
|
||||
on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT
|
||||
kernels. The definitions of rcu_dereference() and rcu_assign_pointer()
|
||||
are the same as those shown in the preceding section, so they are omitted.
|
||||
::
|
||||
|
||||
void rcu_read_lock(void) { }
|
||||
|
||||
@@ -683,14 +712,14 @@ CPU in turn. The run_on() primitive can be implemented straightforwardly
|
||||
in terms of the sched_setaffinity() primitive. Of course, a somewhat less
|
||||
"toy" implementation would restore the affinity upon completion rather
|
||||
than just leaving all tasks running on the last CPU, but when I said
|
||||
"toy", I meant -toy-!
|
||||
"toy", I meant **toy**!
|
||||
|
||||
So how the heck is this supposed to work???
|
||||
|
||||
Remember that it is illegal to block while in an RCU read-side critical
|
||||
section. Therefore, if a given CPU executes a context switch, we know
|
||||
that it must have completed all preceding RCU read-side critical sections.
|
||||
Once -all- CPUs have executed a context switch, then -all- preceding
|
||||
Once **all** CPUs have executed a context switch, then **all** preceding
|
||||
RCU read-side critical sections will have completed.
|
||||
|
||||
So, suppose that we remove a data item from its structure and then invoke
|
||||
@@ -698,19 +727,32 @@ synchronize_rcu(). Once synchronize_rcu() returns, we are guaranteed
|
||||
that there are no RCU read-side critical sections holding a reference
|
||||
to that data item, so we can safely reclaim it.
|
||||
|
||||
Quick Quiz #2: Give an example where Classic RCU's read-side
|
||||
overhead is -negative-.
|
||||
.. _quiz_2:
|
||||
|
||||
Quick Quiz #3: If it is illegal to block in an RCU read-side
|
||||
Quick Quiz #2:
|
||||
Give an example where Classic RCU's read-side
|
||||
overhead is **negative**.
|
||||
|
||||
:ref:`Answers to Quick Quiz <8_whatisRCU>`
|
||||
|
||||
.. _quiz_3:
|
||||
|
||||
Quick Quiz #3:
|
||||
If it is illegal to block in an RCU read-side
|
||||
critical section, what the heck do you do in
|
||||
PREEMPT_RT, where normal spinlocks can block???
|
||||
|
||||
:ref:`Answers to Quick Quiz <8_whatisRCU>`
|
||||
|
||||
.. _6_whatisRCU:
|
||||
|
||||
6. ANALOGY WITH READER-WRITER LOCKING
|
||||
--------------------------------------
|
||||
|
||||
Although RCU can be used in many different ways, a very common use of
|
||||
RCU is analogous to reader-writer locking. The following unified
|
||||
diff shows how closely related RCU and reader-writer locking can be.
|
||||
::
|
||||
|
||||
@@ -5,5 +5,5 @@ struct el {
|
||||
int data;
|
||||
@@ -762,7 +804,7 @@ diff shows how closely related RCU and reader-writer locking can be.
|
||||
return 0;
|
||||
}
|
||||
|
||||
Or, for those who prefer a side-by-side listing:
|
||||
Or, for those who prefer a side-by-side listing::
|
||||
|
||||
1 struct el { 1 struct el {
|
||||
2 struct list_head list; 2 struct list_head list;
|
||||
@@ -774,40 +816,44 @@ Or, for those who prefer a side-by-side listing:
|
||||
8 rwlock_t listmutex; 8 spinlock_t listmutex;
|
||||
9 struct el head; 9 struct el head;
|
||||
|
||||
1 int search(long key, int *result) 1 int search(long key, int *result)
|
||||
2 { 2 {
|
||||
3 struct list_head *lp; 3 struct list_head *lp;
|
||||
4 struct el *p; 4 struct el *p;
|
||||
5 5
|
||||
6 read_lock(&listmutex); 6 rcu_read_lock();
|
||||
7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) {
|
||||
8 if (p->key == key) { 8 if (p->key == key) {
|
||||
9 *result = p->data; 9 *result = p->data;
|
||||
10 read_unlock(&listmutex); 10 rcu_read_unlock();
|
||||
11 return 1; 11 return 1;
|
||||
12 } 12 }
|
||||
13 } 13 }
|
||||
14 read_unlock(&listmutex); 14 rcu_read_unlock();
|
||||
15 return 0; 15 return 0;
|
||||
16 } 16 }
|
||||
::
|
||||
|
||||
1 int delete(long key) 1 int delete(long key)
|
||||
2 { 2 {
|
||||
3 struct el *p; 3 struct el *p;
|
||||
4 4
|
||||
5 write_lock(&listmutex); 5 spin_lock(&listmutex);
|
||||
6 list_for_each_entry(p, head, lp) { 6 list_for_each_entry(p, head, lp) {
|
||||
7 if (p->key == key) { 7 if (p->key == key) {
|
||||
8 list_del(&p->list); 8 list_del_rcu(&p->list);
|
||||
9 write_unlock(&listmutex); 9 spin_unlock(&listmutex);
|
||||
10 synchronize_rcu();
|
||||
10 kfree(p); 11 kfree(p);
|
||||
11 return 1; 12 return 1;
|
||||
12 } 13 }
|
||||
13 } 14 }
|
||||
14 write_unlock(&listmutex); 15 spin_unlock(&listmutex);
|
||||
15 return 0; 16 return 0;
|
||||
16 } 17 }
|
||||
1 int search(long key, int *result) 1 int search(long key, int *result)
|
||||
2 { 2 {
|
||||
3 struct list_head *lp; 3 struct list_head *lp;
|
||||
4 struct el *p; 4 struct el *p;
|
||||
5 5
|
||||
6 read_lock(&listmutex); 6 rcu_read_lock();
|
||||
7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) {
|
||||
8 if (p->key == key) { 8 if (p->key == key) {
|
||||
9 *result = p->data; 9 *result = p->data;
|
||||
10 read_unlock(&listmutex); 10 rcu_read_unlock();
|
||||
11 return 1; 11 return 1;
|
||||
12 } 12 }
|
||||
13 } 13 }
|
||||
14 read_unlock(&listmutex); 14 rcu_read_unlock();
|
||||
15 return 0; 15 return 0;
|
||||
16 } 16 }
|
||||
|
||||
::
|
||||
|
||||
1 int delete(long key) 1 int delete(long key)
|
||||
2 { 2 {
|
||||
3 struct el *p; 3 struct el *p;
|
||||
4 4
|
||||
5 write_lock(&listmutex); 5 spin_lock(&listmutex);
|
||||
6 list_for_each_entry(p, head, lp) { 6 list_for_each_entry(p, head, lp) {
|
||||
7 if (p->key == key) { 7 if (p->key == key) {
|
||||
8 list_del(&p->list); 8 list_del_rcu(&p->list);
|
||||
9 write_unlock(&listmutex); 9 spin_unlock(&listmutex);
|
||||
10 synchronize_rcu();
|
||||
10 kfree(p); 11 kfree(p);
|
||||
11 return 1; 12 return 1;
|
||||
12 } 13 }
|
||||
13 } 14 }
|
||||
14 write_unlock(&listmutex); 15 spin_unlock(&listmutex);
|
||||
15 return 0; 16 return 0;
|
||||
16 } 17 }
|
||||
|
||||
Either way, the differences are quite small. Read-side locking moves
|
||||
to rcu_read_lock() and rcu_read_unlock, update-side locking moves from
|
||||
@@ -825,22 +871,27 @@ delete() can now block. If this is a problem, there is a callback-based
|
||||
mechanism that never blocks, namely call_rcu() or kfree_rcu(), that can
|
||||
be used in place of synchronize_rcu().
|
||||
|
||||
.. _7_whatisRCU:
|
||||
|
||||
7. FULL LIST OF RCU APIs
|
||||
-------------------------
|
||||
|
||||
The RCU APIs are documented in docbook-format header comments in the
|
||||
Linux-kernel source code, but it helps to have a full list of the
|
||||
APIs, since there does not appear to be a way to categorize them
|
||||
in docbook. Here is the list, by category.
|
||||
|
||||
RCU list traversal:
|
||||
RCU list traversal::
|
||||
|
||||
list_entry_rcu
|
||||
list_entry_lockless
|
||||
list_first_entry_rcu
|
||||
list_next_rcu
|
||||
list_for_each_entry_rcu
|
||||
list_for_each_entry_continue_rcu
|
||||
list_for_each_entry_from_rcu
|
||||
list_first_or_null_rcu
|
||||
list_next_or_null_rcu
|
||||
hlist_first_rcu
|
||||
hlist_next_rcu
|
||||
hlist_pprev_rcu
|
||||
@@ -854,7 +905,7 @@ RCU list traversal:
|
||||
hlist_bl_first_rcu
|
||||
hlist_bl_for_each_entry_rcu
|
||||
|
||||
RCU pointer/list update:
|
||||
RCU pointer/list update::
|
||||
|
||||
rcu_assign_pointer
|
||||
list_add_rcu
|
||||
@@ -864,10 +915,12 @@ RCU pointer/list update:
|
||||
hlist_add_behind_rcu
|
||||
hlist_add_before_rcu
|
||||
hlist_add_head_rcu
|
||||
hlist_add_tail_rcu
|
||||
hlist_del_rcu
|
||||
hlist_del_init_rcu
|
||||
hlist_replace_rcu
|
||||
list_splice_init_rcu()
|
||||
list_splice_init_rcu
|
||||
list_splice_tail_init_rcu
|
||||
hlist_nulls_del_init_rcu
|
||||
hlist_nulls_del_rcu
|
||||
hlist_nulls_add_head_rcu
|
||||
@@ -876,7 +929,9 @@ RCU pointer/list update:
|
||||
hlist_bl_del_rcu
|
||||
hlist_bl_set_first_rcu
|
||||
|
||||
RCU: Critical sections Grace period Barrier
|
||||
RCU::
|
||||
|
||||
Critical sections Grace period Barrier
|
||||
|
||||
rcu_read_lock synchronize_net rcu_barrier
|
||||
rcu_read_unlock synchronize_rcu
|
||||
@@ -885,7 +940,9 @@ RCU: Critical sections Grace period Barrier
|
||||
rcu_dereference_check kfree_rcu
|
||||
rcu_dereference_protected
|
||||
|
||||
bh: Critical sections Grace period Barrier
|
||||
bh::
|
||||
|
||||
Critical sections Grace period Barrier
|
||||
|
||||
rcu_read_lock_bh call_rcu rcu_barrier
|
||||
rcu_read_unlock_bh synchronize_rcu
|
||||
@@ -896,7 +953,9 @@ bh: Critical sections Grace period Barrier
|
||||
rcu_dereference_bh_protected
|
||||
rcu_read_lock_bh_held
|
||||
|
||||
sched: Critical sections Grace period Barrier
|
||||
sched::
|
||||
|
||||
Critical sections Grace period Barrier
|
||||
|
||||
rcu_read_lock_sched call_rcu rcu_barrier
|
||||
rcu_read_unlock_sched synchronize_rcu
|
||||
@@ -910,7 +969,9 @@ sched: Critical sections Grace period Barrier
|
||||
rcu_read_lock_sched_held
|
||||
|
||||
|
||||
SRCU: Critical sections Grace period Barrier
|
||||
SRCU::
|
||||
|
||||
Critical sections Grace period Barrier
|
||||
|
||||
srcu_read_lock call_srcu srcu_barrier
|
||||
srcu_read_unlock synchronize_srcu
|
||||
@@ -918,13 +979,14 @@ SRCU: Critical sections Grace period Barrier
|
||||
srcu_dereference_check
|
||||
srcu_read_lock_held
|
||||
|
||||
SRCU: Initialization/cleanup
|
||||
SRCU: Initialization/cleanup::
|
||||
|
||||
DEFINE_SRCU
|
||||
DEFINE_STATIC_SRCU
|
||||
init_srcu_struct
|
||||
cleanup_srcu_struct
|
||||
|
||||
All: lockdep-checked RCU-protected pointer access
|
||||
All: lockdep-checked RCU-protected pointer access::
|
||||
|
||||
rcu_access_pointer
|
||||
rcu_dereference_raw
|
||||
@@ -974,15 +1036,19 @@ g. Otherwise, use RCU.
|
||||
Of course, this all assumes that you have determined that RCU is in fact
|
||||
the right tool for your job.
|
||||
|
||||
.. _8_whatisRCU:
|
||||
|
||||
8. ANSWERS TO QUICK QUIZZES
|
||||
----------------------------
|
||||
|
||||
Quick Quiz #1: Why is this argument naive? How could a deadlock
|
||||
Quick Quiz #1:
|
||||
Why is this argument naive? How could a deadlock
|
||||
occur when using this algorithm in a real-world Linux
|
||||
kernel? [Referring to the lock-based "toy" RCU
|
||||
algorithm.]
|
||||
|
||||
Answer: Consider the following sequence of events:
|
||||
Answer:
|
||||
Consider the following sequence of events:
|
||||
|
||||
1. CPU 0 acquires some unrelated lock, call it
|
||||
"problematic_lock", disabling irq via
|
||||
@@ -1021,10 +1087,14 @@ Answer: Consider the following sequence of events:
|
||||
approach where tasks in RCU read-side critical sections
|
||||
cannot be blocked by tasks executing synchronize_rcu().
|
||||
|
||||
Quick Quiz #2: Give an example where Classic RCU's read-side
|
||||
overhead is -negative-.
|
||||
:ref:`Back to Quick Quiz #1 <quiz_1>`
|
||||
|
||||
Answer: Imagine a single-CPU system with a non-CONFIG_PREEMPT
|
||||
Quick Quiz #2:
|
||||
Give an example where Classic RCU's read-side
|
||||
overhead is **negative**.
|
||||
|
||||
Answer:
|
||||
Imagine a single-CPU system with a non-CONFIG_PREEMPT
|
||||
kernel where a routing table is used by process-context
|
||||
code, but can be updated by irq-context code (for example,
|
||||
by an "ICMP REDIRECT" packet). The usual way of handling
|
||||
@@ -1046,11 +1116,15 @@ Answer: Imagine a single-CPU system with a non-CONFIG_PREEMPT
|
||||
even the theoretical possibility of negative overhead for
|
||||
a synchronization primitive is a bit unexpected. ;-)
|
||||
|
||||
Quick Quiz #3: If it is illegal to block in an RCU read-side
|
||||
:ref:`Back to Quick Quiz #2 <quiz_2>`
|
||||
|
||||
Quick Quiz #3:
|
||||
If it is illegal to block in an RCU read-side
|
||||
critical section, what the heck do you do in
|
||||
PREEMPT_RT, where normal spinlocks can block???
|
||||
|
||||
Answer: Just as PREEMPT_RT permits preemption of spinlock
|
||||
Answer:
|
||||
Just as PREEMPT_RT permits preemption of spinlock
|
||||
critical sections, it permits preemption of RCU
|
||||
read-side critical sections. It also permits
|
||||
spinlocks blocking while in RCU read-side critical
|
||||
@@ -1069,6 +1143,7 @@ Answer: Just as PREEMPT_RT permits preemption of spinlock
|
||||
Besides, how does the computer know what pizza parlor
|
||||
the human being went to???
|
||||
|
||||
:ref:`Back to Quick Quiz #3 <quiz_3>`
|
||||
|
||||
ACKNOWLEDGEMENTS
|
||||
|
62
Documentation/admin-guide/acpi/fan_performance_states.rst
Normal file
62
Documentation/admin-guide/acpi/fan_performance_states.rst
Normal file
@@ -0,0 +1,62 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
===========================
|
||||
ACPI Fan Performance States
|
||||
===========================
|
||||
|
||||
When the optional _FPS object is present under an ACPI device representing a
|
||||
fan (for example, PNP0C0B or INT3404), the ACPI fan driver creates additional
|
||||
"state*" attributes in the sysfs directory of the ACPI device in question.
|
||||
These attributes list properties of fan performance states.
|
||||
|
||||
For more information on _FPS refer to the ACPI specification at:
|
||||
|
||||
http://uefi.org/specifications
|
||||
|
||||
For instance, the contents of the INT3404 ACPI device sysfs directory
|
||||
may look as follows::
|
||||
|
||||
$ ls -l /sys/bus/acpi/devices/INT3404:00/
|
||||
total 0
|
||||
...
|
||||
-r--r--r-- 1 root root 4096 Dec 13 20:38 state0
|
||||
-r--r--r-- 1 root root 4096 Dec 13 20:38 state1
|
||||
-r--r--r-- 1 root root 4096 Dec 13 20:38 state10
|
||||
-r--r--r-- 1 root root 4096 Dec 13 20:38 state11
|
||||
-r--r--r-- 1 root root 4096 Dec 13 20:38 state2
|
||||
-r--r--r-- 1 root root 4096 Dec 13 20:38 state3
|
||||
-r--r--r-- 1 root root 4096 Dec 13 20:38 state4
|
||||
-r--r--r-- 1 root root 4096 Dec 13 20:38 state5
|
||||
-r--r--r-- 1 root root 4096 Dec 13 20:38 state6
|
||||
-r--r--r-- 1 root root 4096 Dec 13 20:38 state7
|
||||
-r--r--r-- 1 root root 4096 Dec 13 20:38 state8
|
||||
-r--r--r-- 1 root root 4096 Dec 13 20:38 state9
|
||||
-r--r--r-- 1 root root 4096 Dec 13 01:00 status
|
||||
...
|
||||
|
||||
where each of the "state*" files represents one performance state of the fan
|
||||
and contains a colon-separated list of 5 integer numbers (fields) with the
|
||||
following interpretation::
|
||||
|
||||
control_percent:trip_point_index:speed_rpm:noise_level_mdb:power_mw
|
||||
|
||||
* ``control_percent``: The percent value to be used to set the fan speed to a
|
||||
specific level using the _FSL object (0-100).
|
||||
|
||||
* ``trip_point_index``: The active cooling trip point number that corresponds
|
||||
to this performance state (0-9).
|
||||
|
||||
* ``speed_rpm``: Speed of the fan in rotations per minute.
|
||||
|
||||
* ``noise_level_mdb``: Audible noise emitted by the fan in this state in
|
||||
millidecibels.
|
||||
|
||||
* ``power_mw``: Power draw of the fan in this state in milliwatts.
|
||||
|
||||
For example::
|
||||
|
||||
$cat /sys/bus/acpi/devices/INT3404:00/state1
|
||||
25:0:3200:12500:1250
|
||||
|
||||
When a given field is not populated or its value provided by the platform
|
||||
firmware is invalid, the "not-defined" string is shown instead of the value.
|
@@ -12,3 +12,4 @@ the Linux ACPI support.
|
||||
dsdt-override
|
||||
ssdt-overlays
|
||||
cppc_sysfs
|
||||
fan_performance_states
|
||||
|
@@ -1,15 +1,15 @@
|
||||
========================================
|
||||
zram: Compressed RAM based block devices
|
||||
zram: Compressed RAM-based block devices
|
||||
========================================
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
||||
The zram module creates RAM based block devices named /dev/zram<id>
|
||||
The zram module creates RAM-based block devices named /dev/zram<id>
|
||||
(<id> = 0, 1, ...). Pages written to these disks are compressed and stored
|
||||
in memory itself. These disks allow very fast I/O and compression provides
|
||||
good amounts of memory savings. Some of the usecases include /tmp storage,
|
||||
use as swap disks, various caches under /var and maybe many more :)
|
||||
good amounts of memory savings. Some of the use cases include /tmp storage,
|
||||
use as swap disks, various caches under /var and maybe many more. :)
|
||||
|
||||
Statistics for individual zram devices are exported through sysfs nodes at
|
||||
/sys/block/zram<id>/
|
||||
@@ -43,17 +43,17 @@ The list of possible return codes:
|
||||
|
||||
======== =============================================================
|
||||
-EBUSY an attempt to modify an attribute that cannot be changed once
|
||||
the device has been initialised. Please reset device first;
|
||||
the device has been initialised. Please reset device first.
|
||||
-ENOMEM zram was not able to allocate enough memory to fulfil your
|
||||
needs;
|
||||
needs.
|
||||
-EINVAL invalid input has been provided.
|
||||
======== =============================================================
|
||||
|
||||
If you use 'echo', the returned value that is changed by 'echo' utility,
|
||||
If you use 'echo', the returned value is set by the 'echo' utility,
|
||||
and, in general case, something like::
|
||||
|
||||
echo 3 > /sys/block/zram0/max_comp_streams
|
||||
if [ $? -ne 0 ];
|
||||
if [ $? -ne 0 ]; then
|
||||
handle_error
|
||||
fi
|
||||
|
||||
@@ -65,7 +65,8 @@ should suffice.
|
||||
::
|
||||
|
||||
modprobe zram num_devices=4
|
||||
This creates 4 devices: /dev/zram{0,1,2,3}
|
||||
|
||||
This creates 4 devices: /dev/zram{0,1,2,3}
|
||||
|
||||
num_devices parameter is optional and tells zram how many devices should be
|
||||
pre-created. Default: 1.
|
||||
@@ -73,12 +74,12 @@ pre-created. Default: 1.
|
||||
2) Set max number of compression streams
|
||||
========================================
|
||||
|
||||
Regardless the value passed to this attribute, ZRAM will always
|
||||
allocate multiple compression streams - one per online CPUs - thus
|
||||
Regardless of the value passed to this attribute, ZRAM will always
|
||||
allocate multiple compression streams - one per online CPU - thus
|
||||
allowing several concurrent compression operations. The number of
|
||||
allocated compression streams goes down when some of the CPUs
|
||||
become offline. There is no single-compression-stream mode anymore,
|
||||
unless you are running a UP system or has only 1 CPU online.
|
||||
unless you are running a UP system or have only 1 CPU online.
|
||||
|
||||
To find out how many streams are currently available::
|
||||
|
||||
@@ -89,7 +90,7 @@ To find out how many streams are currently available::
|
||||
|
||||
Using comp_algorithm device attribute one can see available and
|
||||
currently selected (shown in square brackets) compression algorithms,
|
||||
change selected compression algorithm (once the device is initialised
|
||||
or change the selected compression algorithm (once the device is initialised
|
||||
there is no way to change compression algorithm).
|
||||
|
||||
Examples::
|
||||
@@ -167,9 +168,9 @@ Examples::
|
||||
zram provides a control interface, which enables dynamic (on-demand) device
|
||||
addition and removal.
|
||||
|
||||
In order to add a new /dev/zramX device, perform read operation on hot_add
|
||||
attribute. This will return either new device's device id (meaning that you
|
||||
can use /dev/zram<id>) or error code.
|
||||
In order to add a new /dev/zramX device, perform a read operation on the hot_add
|
||||
attribute. This will return either the new device's device id (meaning that you
|
||||
can use /dev/zram<id>) or an error code.
|
||||
|
||||
Example::
|
||||
|
||||
@@ -186,8 +187,8 @@ execute::
|
||||
|
||||
Per-device statistics are exported as various nodes under /sys/block/zram<id>/
|
||||
|
||||
A brief description of exported device attributes. For more details please
|
||||
read Documentation/ABI/testing/sysfs-block-zram.
|
||||
A brief description of exported device attributes follows. For more details
|
||||
please read Documentation/ABI/testing/sysfs-block-zram.
|
||||
|
||||
====================== ====== ===============================================
|
||||
Name access description
|
||||
@@ -245,7 +246,7 @@ whitespace:
|
||||
|
||||
File /sys/block/zram<id>/mm_stat
|
||||
|
||||
The stat file represents device's mm statistics. It consists of a single
|
||||
The mm_stat file represents the device's mm statistics. It consists of a single
|
||||
line of text and contains the following stats separated by whitespace:
|
||||
|
||||
================ =============================================================
|
||||
@@ -261,7 +262,7 @@ line of text and contains the following stats separated by whitespace:
|
||||
Unit: bytes
|
||||
mem_limit the maximum amount of memory ZRAM can use to store
|
||||
the compressed data
|
||||
mem_used_max the maximum amount of memory zram have consumed to
|
||||
mem_used_max the maximum amount of memory zram has consumed to
|
||||
store the data
|
||||
same_pages the number of same element filled pages written to this disk.
|
||||
No memory is allocated for such pages.
|
||||
@@ -271,7 +272,7 @@ line of text and contains the following stats separated by whitespace:
|
||||
|
||||
File /sys/block/zram<id>/bd_stat
|
||||
|
||||
The stat file represents device's backing device statistics. It consists of
|
||||
The bd_stat file represents a device's backing device statistics. It consists of
|
||||
a single line of text and contains the following stats separated by whitespace:
|
||||
|
||||
============== =============================================================
|
||||
@@ -316,9 +317,9 @@ To use the feature, admin should set up backing device via::
|
||||
echo /dev/sda5 > /sys/block/zramX/backing_dev
|
||||
|
||||
before disksize setting. It supports only partition at this moment.
|
||||
If admin want to use incompressible page writeback, they could do via::
|
||||
If admin wants to use incompressible page writeback, they could do via::
|
||||
|
||||
echo huge > /sys/block/zramX/write
|
||||
echo huge > /sys/block/zramX/writeback
|
||||
|
||||
To use idle page writeback, first, user need to declare zram pages
|
||||
as idle::
|
||||
@@ -326,7 +327,7 @@ as idle::
|
||||
echo all > /sys/block/zramX/idle
|
||||
|
||||
From now on, any pages on zram are idle pages. The idle mark
|
||||
will be removed until someone request access of the block.
|
||||
will be removed until someone requests access of the block.
|
||||
IOW, unless there is access request, those pages are still idle pages.
|
||||
|
||||
Admin can request writeback of those idle pages at right timing via::
|
||||
@@ -341,16 +342,16 @@ to guarantee storage health for entire product life.
|
||||
|
||||
To overcome the concern, zram supports "writeback_limit" feature.
|
||||
The "writeback_limit_enable"'s default value is 0 so that it doesn't limit
|
||||
any writeback. IOW, if admin want to apply writeback budget, he should
|
||||
any writeback. IOW, if admin wants to apply writeback budget, he should
|
||||
enable writeback_limit_enable via::
|
||||
|
||||
$ echo 1 > /sys/block/zramX/writeback_limit_enable
|
||||
|
||||
Once writeback_limit_enable is set, zram doesn't allow any writeback
|
||||
until admin set the budget via /sys/block/zramX/writeback_limit.
|
||||
until admin sets the budget via /sys/block/zramX/writeback_limit.
|
||||
|
||||
(If admin doesn't enable writeback_limit_enable, writeback_limit's value
|
||||
assigned via /sys/block/zramX/writeback_limit is meaninless.)
|
||||
assigned via /sys/block/zramX/writeback_limit is meaningless.)
|
||||
|
||||
If admin want to limit writeback as per-day 400M, he could do it
|
||||
like below::
|
||||
@@ -361,13 +362,13 @@ like below::
|
||||
/sys/block/zram0/writeback_limit.
|
||||
$ echo 1 > /sys/block/zram0/writeback_limit_enable
|
||||
|
||||
If admin want to allow further write again once the bugdet is exausted,
|
||||
If admins want to allow further write again once the bugdet is exhausted,
|
||||
he could do it like below::
|
||||
|
||||
$ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
|
||||
/sys/block/zram0/writeback_limit
|
||||
|
||||
If admin want to see remaining writeback budget since he set::
|
||||
If admin wants to see remaining writeback budget since last set::
|
||||
|
||||
$ cat /sys/block/zramX/writeback_limit
|
||||
|
||||
@@ -375,12 +376,12 @@ If admin want to disable writeback limit, he could do::
|
||||
|
||||
$ echo 0 > /sys/block/zramX/writeback_limit_enable
|
||||
|
||||
The writeback_limit count will reset whenever you reset zram(e.g.,
|
||||
The writeback_limit count will reset whenever you reset zram (e.g.,
|
||||
system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of
|
||||
writeback happened until you reset the zram to allocate extra writeback
|
||||
budget in next setting is user's job.
|
||||
|
||||
If admin want to measure writeback count in a certain period, he could
|
||||
If admin wants to measure writeback count in a certain period, he could
|
||||
know it via /sys/block/zram0/bd_stat's 3rd column.
|
||||
|
||||
memory tracking
|
||||
|
218
Documentation/admin-guide/bootconfig.rst
Normal file
218
Documentation/admin-guide/bootconfig.rst
Normal file
@@ -0,0 +1,218 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
.. _bootconfig:
|
||||
|
||||
==================
|
||||
Boot Configuration
|
||||
==================
|
||||
|
||||
:Author: Masami Hiramatsu <mhiramat@kernel.org>
|
||||
|
||||
Overview
|
||||
========
|
||||
|
||||
The boot configuration expands the current kernel command line to support
|
||||
additional key-value data when booting the kernel in an efficient way.
|
||||
This allows administrators to pass a structured-Key config file.
|
||||
|
||||
Config File Syntax
|
||||
==================
|
||||
|
||||
The boot config syntax is a simple structured key-value. Each key consists
|
||||
of dot-connected-words, and key and value are connected by ``=``. The value
|
||||
has to be terminated by semi-colon (``;``) or newline (``\n``).
|
||||
For array value, array entries are separated by comma (``,``). ::
|
||||
|
||||
KEY[.WORD[...]] = VALUE[, VALUE2[...]][;]
|
||||
|
||||
Unlike the kernel command line syntax, spaces are OK around the comma and ``=``.
|
||||
|
||||
Each key word must contain only alphabets, numbers, dash (``-``) or underscore
|
||||
(``_``). And each value only contains printable characters or spaces except
|
||||
for delimiters such as semi-colon (``;``), new-line (``\n``), comma (``,``),
|
||||
hash (``#``) and closing brace (``}``).
|
||||
|
||||
If you want to use those delimiters in a value, you can use either double-
|
||||
quotes (``"VALUE"``) or single-quotes (``'VALUE'``) to quote it. Note that
|
||||
you can not escape these quotes.
|
||||
|
||||
There can be a key which doesn't have value or has an empty value. Those keys
|
||||
are used for checking if the key exists or not (like a boolean).
|
||||
|
||||
Key-Value Syntax
|
||||
----------------
|
||||
|
||||
The boot config file syntax allows user to merge partially same word keys
|
||||
by brace. For example::
|
||||
|
||||
foo.bar.baz = value1
|
||||
foo.bar.qux.quux = value2
|
||||
|
||||
These can be written also in::
|
||||
|
||||
foo.bar {
|
||||
baz = value1
|
||||
qux.quux = value2
|
||||
}
|
||||
|
||||
Or more shorter, written as following::
|
||||
|
||||
foo.bar { baz = value1; qux.quux = value2 }
|
||||
|
||||
In both styles, same key words are automatically merged when parsing it
|
||||
at boot time. So you can append similar trees or key-values.
|
||||
|
||||
Same-key Values
|
||||
---------------
|
||||
|
||||
It is prohibited that two or more values or arrays share a same-key.
|
||||
For example,::
|
||||
|
||||
foo = bar, baz
|
||||
foo = qux # !ERROR! we can not re-define same key
|
||||
|
||||
If you want to append the value to existing key as an array member,
|
||||
you can use ``+=`` operator. For example::
|
||||
|
||||
foo = bar, baz
|
||||
foo += qux
|
||||
|
||||
In this case, the key ``foo`` has ``bar``, ``baz`` and ``qux``.
|
||||
|
||||
However, a sub-key and a value can not co-exist under a parent key.
|
||||
For example, following config is NOT allowed.::
|
||||
|
||||
foo = value1
|
||||
foo.bar = value2 # !ERROR! subkey "bar" and value "value1" can NOT co-exist
|
||||
|
||||
|
||||
Comments
|
||||
--------
|
||||
|
||||
The config syntax accepts shell-script style comments. The comments starting
|
||||
with hash ("#") until newline ("\n") will be ignored.
|
||||
|
||||
::
|
||||
|
||||
# comment line
|
||||
foo = value # value is set to foo.
|
||||
bar = 1, # 1st element
|
||||
2, # 2nd element
|
||||
3 # 3rd element
|
||||
|
||||
This is parsed as below::
|
||||
|
||||
foo = value
|
||||
bar = 1, 2, 3
|
||||
|
||||
Note that you can not put a comment between value and delimiter(``,`` or
|
||||
``;``). This means following config has a syntax error ::
|
||||
|
||||
key = 1 # comment
|
||||
,2
|
||||
|
||||
|
||||
/proc/bootconfig
|
||||
================
|
||||
|
||||
/proc/bootconfig is a user-space interface of the boot config.
|
||||
Unlike /proc/cmdline, this file shows the key-value style list.
|
||||
Each key-value pair is shown in each line with following style::
|
||||
|
||||
KEY[.WORDS...] = "[VALUE]"[,"VALUE2"...]
|
||||
|
||||
|
||||
Boot Kernel With a Boot Config
|
||||
==============================
|
||||
|
||||
Since the boot configuration file is loaded with initrd, it will be added
|
||||
to the end of the initrd (initramfs) image file with size, checksum and
|
||||
12-byte magic word as below.
|
||||
|
||||
[initrd][bootconfig][size(u32)][checksum(u32)][#BOOTCONFIG\n]
|
||||
|
||||
The Linux kernel decodes the last part of the initrd image in memory to
|
||||
get the boot configuration data.
|
||||
Because of this "piggyback" method, there is no need to change or
|
||||
update the boot loader and the kernel image itself.
|
||||
|
||||
To do this operation, Linux kernel provides "bootconfig" command under
|
||||
tools/bootconfig, which allows admin to apply or delete the config file
|
||||
to/from initrd image. You can build it by the following command::
|
||||
|
||||
# make -C tools/bootconfig
|
||||
|
||||
To add your boot config file to initrd image, run bootconfig as below
|
||||
(Old data is removed automatically if exists)::
|
||||
|
||||
# tools/bootconfig/bootconfig -a your-config /boot/initrd.img-X.Y.Z
|
||||
|
||||
To remove the config from the image, you can use -d option as below::
|
||||
|
||||
# tools/bootconfig/bootconfig -d /boot/initrd.img-X.Y.Z
|
||||
|
||||
Then add "bootconfig" on the normal kernel command line to tell the
|
||||
kernel to look for the bootconfig at the end of the initrd file.
|
||||
|
||||
Config File Limitation
|
||||
======================
|
||||
|
||||
Currently the maximum config size size is 32KB and the total key-words (not
|
||||
key-value entries) must be under 1024 nodes.
|
||||
Note: this is not the number of entries but nodes, an entry must consume
|
||||
more than 2 nodes (a key-word and a value). So theoretically, it will be
|
||||
up to 512 key-value pairs. If keys contains 3 words in average, it can
|
||||
contain 256 key-value pairs. In most cases, the number of config items
|
||||
will be under 100 entries and smaller than 8KB, so it would be enough.
|
||||
If the node number exceeds 1024, parser returns an error even if the file
|
||||
size is smaller than 32KB.
|
||||
Anyway, since bootconfig command verifies it when appending a boot config
|
||||
to initrd image, user can notice it before boot.
|
||||
|
||||
|
||||
Bootconfig APIs
|
||||
===============
|
||||
|
||||
User can query or loop on key-value pairs, also it is possible to find
|
||||
a root (prefix) key node and find key-values under that node.
|
||||
|
||||
If you have a key string, you can query the value directly with the key
|
||||
using xbc_find_value(). If you want to know what keys exist in the boot
|
||||
config, you can use xbc_for_each_key_value() to iterate key-value pairs.
|
||||
Note that you need to use xbc_array_for_each_value() for accessing
|
||||
each array's value, e.g.::
|
||||
|
||||
vnode = NULL;
|
||||
xbc_find_value("key.word", &vnode);
|
||||
if (vnode && xbc_node_is_array(vnode))
|
||||
xbc_array_for_each_value(vnode, value) {
|
||||
printk("%s ", value);
|
||||
}
|
||||
|
||||
If you want to focus on keys which have a prefix string, you can use
|
||||
xbc_find_node() to find a node by the prefix string, and iterate
|
||||
keys under the prefix node with xbc_node_for_each_key_value().
|
||||
|
||||
But the most typical usage is to get the named value under prefix
|
||||
or get the named array under prefix as below::
|
||||
|
||||
root = xbc_find_node("key.prefix");
|
||||
value = xbc_node_find_value(root, "option", &vnode);
|
||||
...
|
||||
xbc_node_for_each_array_value(root, "array-option", value, anode) {
|
||||
...
|
||||
}
|
||||
|
||||
This accesses a value of "key.prefix.option" and an array of
|
||||
"key.prefix.array-option".
|
||||
|
||||
Locking is not needed, since after initialization, the config becomes
|
||||
read-only. All data and keys must be copied if you need to modify it.
|
||||
|
||||
|
||||
Functions and structures
|
||||
========================
|
||||
|
||||
.. kernel-doc:: include/linux/bootconfig.h
|
||||
.. kernel-doc:: lib/bootconfig.c
|
||||
|
@@ -61,6 +61,8 @@ v1 is available under Documentation/admin-guide/cgroup-v1/.
|
||||
5-6. Device
|
||||
5-7. RDMA
|
||||
5-7-1. RDMA Interface Files
|
||||
5-8. HugeTLB
|
||||
5.8-1. HugeTLB Interface Files
|
||||
5-8. Misc
|
||||
5-8-1. perf_event
|
||||
5-N. Non-normative information
|
||||
@@ -2056,6 +2058,33 @@ RDMA Interface Files
|
||||
mlx4_0 hca_handle=1 hca_object=20
|
||||
ocrdma1 hca_handle=1 hca_object=23
|
||||
|
||||
HugeTLB
|
||||
-------
|
||||
|
||||
The HugeTLB controller allows to limit the HugeTLB usage per control group and
|
||||
enforces the controller limit during page fault.
|
||||
|
||||
HugeTLB Interface Files
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
hugetlb.<hugepagesize>.current
|
||||
Show current usage for "hugepagesize" hugetlb. It exists for all
|
||||
the cgroup except root.
|
||||
|
||||
hugetlb.<hugepagesize>.max
|
||||
Set/show the hard limit of "hugepagesize" hugetlb usage.
|
||||
The default value is "max". It exists for all the cgroup except root.
|
||||
|
||||
hugetlb.<hugepagesize>.events
|
||||
A read-only flat-keyed file which exists on non-root cgroups.
|
||||
|
||||
max
|
||||
The number of allocation failure due to HugeTLB limit
|
||||
|
||||
hugetlb.<hugepagesize>.events.local
|
||||
Similar to hugetlb.<hugepagesize>.events but the fields in the file
|
||||
are local to the cgroup i.e. not hierarchical. The file modified event
|
||||
generated on this file reflects only the local events.
|
||||
|
||||
Misc
|
||||
----
|
||||
|
@@ -419,3 +419,5 @@ Version History
|
||||
rebuild errors.
|
||||
1.15.0 Fix size extensions not being synchronized in case of new MD bitmap
|
||||
pages allocated; also fix those not occuring after previous reductions
|
||||
1.15.1 Fix argument count and arguments for rebuild/write_mostly/journal_(dev|mode)
|
||||
on the status line.
|
||||
|
@@ -92,6 +92,8 @@ Currently Available
|
||||
* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
|
||||
the ordering)
|
||||
* Case-insensitive file name lookups
|
||||
* file-based encryption support (fscrypt)
|
||||
* file-based verity support (fsverity)
|
||||
|
||||
[1] Filesystems with a block size of 1k may see a limit imposed by the
|
||||
directory hash tree having a maximum depth of two.
|
||||
|
@@ -64,6 +64,7 @@ configure specific aspects of kernel behavior to your liking.
|
||||
binderfs
|
||||
binfmt-misc
|
||||
blockdev/index
|
||||
bootconfig
|
||||
braille-console
|
||||
btmrvl
|
||||
cgroup-v1/index
|
||||
@@ -76,6 +77,7 @@ configure specific aspects of kernel behavior to your liking.
|
||||
device-mapper/index
|
||||
efi-stub
|
||||
ext4
|
||||
nfs/index
|
||||
gpio/index
|
||||
highuid
|
||||
hw_random
|
||||
|
@@ -437,6 +437,12 @@
|
||||
no delay (0).
|
||||
Format: integer
|
||||
|
||||
bootconfig [KNL]
|
||||
Extended command line options can be added to an initrd
|
||||
and this will cause the kernel to look for it.
|
||||
|
||||
See Documentation/admin-guide/bootconfig.rst
|
||||
|
||||
bert_disable [ACPI]
|
||||
Disable BERT OS support on buggy BIOSes.
|
||||
|
||||
@@ -511,7 +517,7 @@
|
||||
1 -- check protection requested by application.
|
||||
Default value is set via a kernel config option.
|
||||
Value can be changed at runtime via
|
||||
/selinux/checkreqprot.
|
||||
/sys/fs/selinux/checkreqprot.
|
||||
|
||||
cio_ignore= [S390]
|
||||
See Documentation/s390/common_io.rst for details.
|
||||
@@ -834,6 +840,18 @@
|
||||
dump out devices still on the deferred probe list after
|
||||
retrying.
|
||||
|
||||
dfltcc= [HW,S390]
|
||||
Format: { on | off | def_only | inf_only | always }
|
||||
on: s390 zlib hardware support for compression on
|
||||
level 1 and decompression (default)
|
||||
off: No s390 zlib hardware support
|
||||
def_only: s390 zlib hardware support for deflate
|
||||
only (compression on level 1)
|
||||
inf_only: s390 zlib hardware support for inflate
|
||||
only (decompression)
|
||||
always: Same as 'on' but ignores the selected compression
|
||||
level always using hardware support (used for debugging)
|
||||
|
||||
dhash_entries= [KNL]
|
||||
Set number of hash buckets for dentry cache.
|
||||
|
||||
@@ -1165,10 +1183,10 @@
|
||||
|
||||
efi= [EFI]
|
||||
Format: { "old_map", "nochunk", "noruntime", "debug",
|
||||
"nosoftreserve" }
|
||||
"nosoftreserve", "disable_early_pci_dma",
|
||||
"no_disable_early_pci_dma" }
|
||||
old_map [X86-64]: switch to the old ioremap-based EFI
|
||||
runtime services mapping. 32-bit still uses this one by
|
||||
default.
|
||||
runtime services mapping. [Needs CONFIG_X86_UV=y]
|
||||
nochunk: disable reading files in "chunks" in the EFI
|
||||
boot stub, as chunking can cause problems with some
|
||||
firmware implementations.
|
||||
@@ -1180,6 +1198,10 @@
|
||||
claim. Specify efi=nosoftreserve to disable this
|
||||
reservation and treat the memory by its base type
|
||||
(i.e. EFI_CONVENTIONAL_MEMORY / "System RAM").
|
||||
disable_early_pci_dma: Disable the busmaster bit on all
|
||||
PCI bridges while in the EFI boot stub
|
||||
no_disable_early_pci_dma: Leave the busmaster bit set
|
||||
on all PCI bridges while in the EFI boot stub
|
||||
|
||||
efi_no_storage_paranoia [EFI; X86]
|
||||
Using this parameter you can use more than 50% of
|
||||
@@ -1245,7 +1267,8 @@
|
||||
0 -- permissive (log only, no denials).
|
||||
1 -- enforcing (deny and log).
|
||||
Default value is 0.
|
||||
Value can be changed at runtime via /selinux/enforce.
|
||||
Value can be changed at runtime via
|
||||
/sys/fs/selinux/enforce.
|
||||
|
||||
erst_disable [ACPI]
|
||||
Disable Error Record Serialization Table (ERST)
|
||||
@@ -1933,10 +1956,32 @@
|
||||
<cpu number> begins at 0 and the maximum value is
|
||||
"number of CPUs in system - 1".
|
||||
|
||||
managed_irq
|
||||
|
||||
Isolate from being targeted by managed interrupts
|
||||
which have an interrupt mask containing isolated
|
||||
CPUs. The affinity of managed interrupts is
|
||||
handled by the kernel and cannot be changed via
|
||||
the /proc/irq/* interfaces.
|
||||
|
||||
This isolation is best effort and only effective
|
||||
if the automatically assigned interrupt mask of a
|
||||
device queue contains isolated and housekeeping
|
||||
CPUs. If housekeeping CPUs are online then such
|
||||
interrupts are directed to the housekeeping CPU
|
||||
so that IO submitted on the housekeeping CPU
|
||||
cannot disturb the isolated CPU.
|
||||
|
||||
If a queue's affinity mask contains only isolated
|
||||
CPUs then this parameter has no effect on the
|
||||
interrupt routing decision, though interrupts are
|
||||
only delivered when tasks running on those
|
||||
isolated CPUs submit IO. IO submitted on
|
||||
housekeeping CPUs has no influence on those
|
||||
queues.
|
||||
|
||||
The format of <cpu-list> is described above.
|
||||
|
||||
|
||||
|
||||
iucv= [HW,NET]
|
||||
|
||||
ivrs_ioapic [HW,X86_64]
|
||||
@@ -3978,6 +4023,19 @@
|
||||
test until boot completes in order to avoid
|
||||
interference.
|
||||
|
||||
rcuperf.kfree_rcu_test= [KNL]
|
||||
Set to measure performance of kfree_rcu() flooding.
|
||||
|
||||
rcuperf.kfree_nthreads= [KNL]
|
||||
The number of threads running loops of kfree_rcu().
|
||||
|
||||
rcuperf.kfree_alloc_num= [KNL]
|
||||
Number of allocations and frees done in an iteration.
|
||||
|
||||
rcuperf.kfree_loops= [KNL]
|
||||
Number of loops doing rcuperf.kfree_alloc_num number
|
||||
of allocations and frees.
|
||||
|
||||
rcuperf.nreaders= [KNL]
|
||||
Set number of RCU readers. The value -1 selects
|
||||
N, where N is the number of CPUs. A value
|
||||
@@ -4348,9 +4406,7 @@
|
||||
See security/selinux/Kconfig help text.
|
||||
0 -- disable.
|
||||
1 -- enable.
|
||||
Default value is set via kernel config option.
|
||||
If enabled at boot time, /selinux/disable can be used
|
||||
later to disable prior to initial policy load.
|
||||
Default value is 1.
|
||||
|
||||
apparmor= [APPARMOR] Disable or enable AppArmor at boot time
|
||||
Format: { "0" | "1" }
|
||||
|
@@ -1,6 +1,7 @@
|
||||
===================
|
||||
NFS Fault Injection
|
||||
===================
|
||||
|
||||
Fault Injection
|
||||
===============
|
||||
Fault injection is a method for forcing errors that may not normally occur, or
|
||||
may be difficult to reproduce. Forcing these errors in a controlled environment
|
||||
can help the developer find and fix bugs before their code is shipped in a
|
15
Documentation/admin-guide/nfs/index.rst
Normal file
15
Documentation/admin-guide/nfs/index.rst
Normal file
@@ -0,0 +1,15 @@
|
||||
=============
|
||||
NFS
|
||||
=============
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
nfs-client
|
||||
nfsroot
|
||||
nfs-rdma
|
||||
nfsd-admin-interfaces
|
||||
nfs-idmapper
|
||||
pnfs-block-server
|
||||
pnfs-scsi-server
|
||||
fault_injection
|
@@ -1,3 +1,6 @@
|
||||
==========
|
||||
NFS Client
|
||||
==========
|
||||
|
||||
The NFS client
|
||||
==============
|
||||
@@ -59,10 +62,11 @@ The DNS resolver
|
||||
|
||||
NFSv4 allows for one server to refer the NFS client to data that has been
|
||||
migrated onto another server by means of the special "fs_locations"
|
||||
attribute. See
|
||||
http://tools.ietf.org/html/rfc3530#section-6
|
||||
and
|
||||
http://tools.ietf.org/html/draft-ietf-nfsv4-referrals-00
|
||||
attribute. See `RFC3530 Section 6: Filesystem Migration and Replication`_ and
|
||||
`Implementation Guide for Referrals in NFSv4`_.
|
||||
|
||||
.. _RFC3530 Section 6\: Filesystem Migration and Replication: http://tools.ietf.org/html/rfc3530#section-6
|
||||
.. _Implementation Guide for Referrals in NFSv4: http://tools.ietf.org/html/draft-ietf-nfsv4-referrals-00
|
||||
|
||||
The fs_locations information can take the form of either an ip address and
|
||||
a path, or a DNS hostname and a path. The latter requires the NFS client to
|
||||
@@ -78,8 +82,8 @@ Assuming that the user has the 'rpc_pipefs' filesystem mounted in the usual
|
||||
(2) If no valid entry exists, the helper script '/sbin/nfs_cache_getent'
|
||||
(may be changed using the 'nfs.cache_getent' kernel boot parameter)
|
||||
is run, with two arguments:
|
||||
- the cache name, "dns_resolve"
|
||||
- the hostname to resolve
|
||||
- the cache name, "dns_resolve"
|
||||
- the hostname to resolve
|
||||
|
||||
(3) After looking up the corresponding ip address, the helper script
|
||||
writes the result into the rpc_pipefs pseudo-file
|
||||
@@ -94,43 +98,44 @@ Assuming that the user has the 'rpc_pipefs' filesystem mounted in the usual
|
||||
script, and <ttl> is the 'time to live' of this cache entry (in
|
||||
units of seconds).
|
||||
|
||||
Note: If <ip address> is invalid, say the string "0", then a negative
|
||||
entry is created, which will cause the kernel to treat the hostname
|
||||
as having no valid DNS translation.
|
||||
.. note::
|
||||
If <ip address> is invalid, say the string "0", then a negative
|
||||
entry is created, which will cause the kernel to treat the hostname
|
||||
as having no valid DNS translation.
|
||||
|
||||
|
||||
|
||||
|
||||
A basic sample /sbin/nfs_cache_getent
|
||||
=====================================
|
||||
.. code-block:: sh
|
||||
|
||||
#!/bin/bash
|
||||
#
|
||||
ttl=600
|
||||
#
|
||||
cut=/usr/bin/cut
|
||||
getent=/usr/bin/getent
|
||||
rpc_pipefs=/var/lib/nfs/rpc_pipefs
|
||||
#
|
||||
die()
|
||||
{
|
||||
echo "Usage: $0 cache_name entry_name"
|
||||
exit 1
|
||||
}
|
||||
#!/bin/bash
|
||||
#
|
||||
ttl=600
|
||||
#
|
||||
cut=/usr/bin/cut
|
||||
getent=/usr/bin/getent
|
||||
rpc_pipefs=/var/lib/nfs/rpc_pipefs
|
||||
#
|
||||
die()
|
||||
{
|
||||
echo "Usage: $0 cache_name entry_name"
|
||||
exit 1
|
||||
}
|
||||
|
||||
[ $# -lt 2 ] && die
|
||||
cachename="$1"
|
||||
cache_path=${rpc_pipefs}/cache/${cachename}/channel
|
||||
|
||||
case "${cachename}" in
|
||||
dns_resolve)
|
||||
name="$2"
|
||||
result="$(${getent} hosts ${name} | ${cut} -f1 -d\ )"
|
||||
[ -z "${result}" ] && result="0"
|
||||
;;
|
||||
*)
|
||||
die
|
||||
;;
|
||||
esac
|
||||
echo "${result} ${name} ${ttl}" >${cache_path}
|
||||
[ $# -lt 2 ] && die
|
||||
cachename="$1"
|
||||
cache_path=${rpc_pipefs}/cache/${cachename}/channel
|
||||
|
||||
case "${cachename}" in
|
||||
dns_resolve)
|
||||
name="$2"
|
||||
result="$(${getent} hosts ${name} | ${cut} -f1 -d\ )"
|
||||
[ -z "${result}" ] && result="0"
|
||||
;;
|
||||
*)
|
||||
die
|
||||
;;
|
||||
esac
|
||||
echo "${result} ${name} ${ttl}" >${cache_path}
|
@@ -1,7 +1,7 @@
|
||||
=============
|
||||
NFS ID Mapper
|
||||
=============
|
||||
|
||||
=========
|
||||
ID Mapper
|
||||
=========
|
||||
Id mapper is used by NFS to translate user and group ids into names, and to
|
||||
translate user and group names into ids. Part of this translation involves
|
||||
performing an upcall to userspace to request the information. There are two
|
||||
@@ -20,22 +20,24 @@ legacy rpc.idmap daemon for the id mapping. This result will be stored
|
||||
in a custom NFS idmap cache.
|
||||
|
||||
|
||||
===========
|
||||
Configuring
|
||||
===========
|
||||
|
||||
The file /etc/request-key.conf will need to be modified so /sbin/request-key can
|
||||
direct the upcall. The following line should be added:
|
||||
|
||||
#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ...
|
||||
#====== ======= =============== =============== ===============================
|
||||
create id_resolver * * /usr/sbin/nfs.idmap %k %d 600
|
||||
``#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ...``
|
||||
``#====== ======= =============== =============== ===============================``
|
||||
``create id_resolver * * /usr/sbin/nfs.idmap %k %d 600``
|
||||
|
||||
|
||||
This will direct all id_resolver requests to the program /usr/sbin/nfs.idmap.
|
||||
The last parameter, 600, defines how many seconds into the future the key will
|
||||
expire. This parameter is optional for /usr/sbin/nfs.idmap. When the timeout
|
||||
is not specified, nfs.idmap will default to 600 seconds.
|
||||
|
||||
id mapper uses for key descriptions:
|
||||
id mapper uses for key descriptions::
|
||||
|
||||
uid: Find the UID for the given user
|
||||
gid: Find the GID for the given group
|
||||
user: Find the user name for the given UID
|
||||
@@ -45,23 +47,24 @@ You can handle any of these individually, rather than using the generic upcall
|
||||
program. If you would like to use your own program for a uid lookup then you
|
||||
would edit your request-key.conf so it look similar to this:
|
||||
|
||||
#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ...
|
||||
#====== ======= =============== =============== ===============================
|
||||
create id_resolver uid:* * /some/other/program %k %d 600
|
||||
create id_resolver * * /usr/sbin/nfs.idmap %k %d 600
|
||||
``#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ...``
|
||||
``#====== ======= =============== =============== ===============================``
|
||||
``create id_resolver uid:* * /some/other/program %k %d 600``
|
||||
``create id_resolver * * /usr/sbin/nfs.idmap %k %d 600``
|
||||
|
||||
|
||||
Notice that the new line was added above the line for the generic program.
|
||||
request-key will find the first matching line and corresponding program. In
|
||||
this case, /some/other/program will handle all uid lookups and
|
||||
/usr/sbin/nfs.idmap will handle gid, user, and group lookups.
|
||||
|
||||
See <file:Documentation/security/keys/request-key.rst> for more information
|
||||
See Documentation/security/keys/request-key.rst for more information
|
||||
about the request-key function.
|
||||
|
||||
|
||||
=========
|
||||
nfs.idmap
|
||||
=========
|
||||
|
||||
nfs.idmap is designed to be called by request-key, and should not be run "by
|
||||
hand". This program takes two arguments, a serialized key and a key
|
||||
description. The serialized key is first converted into a key_serial_t, and
|
292
Documentation/admin-guide/nfs/nfs-rdma.rst
Normal file
292
Documentation/admin-guide/nfs/nfs-rdma.rst
Normal file
@@ -0,0 +1,292 @@
|
||||
===================
|
||||
Setting up NFS/RDMA
|
||||
===================
|
||||
|
||||
:Author:
|
||||
NetApp and Open Grid Computing (May 29, 2008)
|
||||
|
||||
.. warning::
|
||||
This document is probably obsolete.
|
||||
|
||||
Overview
|
||||
========
|
||||
|
||||
This document describes how to install and setup the Linux NFS/RDMA client
|
||||
and server software.
|
||||
|
||||
The NFS/RDMA client was first included in Linux 2.6.24. The NFS/RDMA server
|
||||
was first included in the following release, Linux 2.6.25.
|
||||
|
||||
In our testing, we have obtained excellent performance results (full 10Gbit
|
||||
wire bandwidth at minimal client CPU) under many workloads. The code passes
|
||||
the full Connectathon test suite and operates over both Infiniband and iWARP
|
||||
RDMA adapters.
|
||||
|
||||
Getting Help
|
||||
============
|
||||
|
||||
If you get stuck, you can ask questions on the
|
||||
nfs-rdma-devel@lists.sourceforge.net mailing list.
|
||||
|
||||
Installation
|
||||
============
|
||||
|
||||
These instructions are a step by step guide to building a machine for
|
||||
use with NFS/RDMA.
|
||||
|
||||
- Install an RDMA device
|
||||
|
||||
Any device supported by the drivers in drivers/infiniband/hw is acceptable.
|
||||
|
||||
Testing has been performed using several Mellanox-based IB cards, the
|
||||
Ammasso AMS1100 iWARP adapter, and the Chelsio cxgb3 iWARP adapter.
|
||||
|
||||
- Install a Linux distribution and tools
|
||||
|
||||
The first kernel release to contain both the NFS/RDMA client and server was
|
||||
Linux 2.6.25 Therefore, a distribution compatible with this and subsequent
|
||||
Linux kernel release should be installed.
|
||||
|
||||
The procedures described in this document have been tested with
|
||||
distributions from Red Hat's Fedora Project (http://fedora.redhat.com/).
|
||||
|
||||
- Install nfs-utils-1.1.2 or greater on the client
|
||||
|
||||
An NFS/RDMA mount point can be obtained by using the mount.nfs command in
|
||||
nfs-utils-1.1.2 or greater (nfs-utils-1.1.1 was the first nfs-utils
|
||||
version with support for NFS/RDMA mounts, but for various reasons we
|
||||
recommend using nfs-utils-1.1.2 or greater). To see which version of
|
||||
mount.nfs you are using, type:
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
$ /sbin/mount.nfs -V
|
||||
|
||||
If the version is less than 1.1.2 or the command does not exist,
|
||||
you should install the latest version of nfs-utils.
|
||||
|
||||
Download the latest package from: http://www.kernel.org/pub/linux/utils/nfs
|
||||
|
||||
Uncompress the package and follow the installation instructions.
|
||||
|
||||
If you will not need the idmapper and gssd executables (you do not need
|
||||
these to create an NFS/RDMA enabled mount command), the installation
|
||||
process can be simplified by disabling these features when running
|
||||
configure:
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
$ ./configure --disable-gss --disable-nfsv4
|
||||
|
||||
To build nfs-utils you will need the tcp_wrappers package installed. For
|
||||
more information on this see the package's README and INSTALL files.
|
||||
|
||||
After building the nfs-utils package, there will be a mount.nfs binary in
|
||||
the utils/mount directory. This binary can be used to initiate NFS v2, v3,
|
||||
or v4 mounts. To initiate a v4 mount, the binary must be called
|
||||
mount.nfs4. The standard technique is to create a symlink called
|
||||
mount.nfs4 to mount.nfs.
|
||||
|
||||
This mount.nfs binary should be installed at /sbin/mount.nfs as follows:
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
$ sudo cp utils/mount/mount.nfs /sbin/mount.nfs
|
||||
|
||||
In this location, mount.nfs will be invoked automatically for NFS mounts
|
||||
by the system mount command.
|
||||
|
||||
.. note::
|
||||
mount.nfs and therefore nfs-utils-1.1.2 or greater is only needed
|
||||
on the NFS client machine. You do not need this specific version of
|
||||
nfs-utils on the server. Furthermore, only the mount.nfs command from
|
||||
nfs-utils-1.1.2 is needed on the client.
|
||||
|
||||
- Install a Linux kernel with NFS/RDMA
|
||||
|
||||
The NFS/RDMA client and server are both included in the mainline Linux
|
||||
kernel version 2.6.25 and later. This and other versions of the Linux
|
||||
kernel can be found at: https://www.kernel.org/pub/linux/kernel/
|
||||
|
||||
Download the sources and place them in an appropriate location.
|
||||
|
||||
- Configure the RDMA stack
|
||||
|
||||
Make sure your kernel configuration has RDMA support enabled. Under
|
||||
Device Drivers -> InfiniBand support, update the kernel configuration
|
||||
to enable InfiniBand support [NOTE: the option name is misleading. Enabling
|
||||
InfiniBand support is required for all RDMA devices (IB, iWARP, etc.)].
|
||||
|
||||
Enable the appropriate IB HCA support (mlx4, mthca, ehca, ipath, etc.) or
|
||||
iWARP adapter support (amso, cxgb3, etc.).
|
||||
|
||||
If you are using InfiniBand, be sure to enable IP-over-InfiniBand support.
|
||||
|
||||
- Configure the NFS client and server
|
||||
|
||||
Your kernel configuration must also have NFS file system support and/or
|
||||
NFS server support enabled. These and other NFS related configuration
|
||||
options can be found under File Systems -> Network File Systems.
|
||||
|
||||
- Build, install, reboot
|
||||
|
||||
The NFS/RDMA code will be enabled automatically if NFS and RDMA
|
||||
are turned on. The NFS/RDMA client and server are configured via the hidden
|
||||
SUNRPC_XPRT_RDMA config option that depends on SUNRPC and INFINIBAND. The
|
||||
value of SUNRPC_XPRT_RDMA will be:
|
||||
|
||||
#. N if either SUNRPC or INFINIBAND are N, in this case the NFS/RDMA client
|
||||
and server will not be built
|
||||
|
||||
#. M if both SUNRPC and INFINIBAND are on (M or Y) and at least one is M,
|
||||
in this case the NFS/RDMA client and server will be built as modules
|
||||
|
||||
#. Y if both SUNRPC and INFINIBAND are Y, in this case the NFS/RDMA client
|
||||
and server will be built into the kernel
|
||||
|
||||
Therefore, if you have followed the steps above and turned no NFS and RDMA,
|
||||
the NFS/RDMA client and server will be built.
|
||||
|
||||
Build a new kernel, install it, boot it.
|
||||
|
||||
Check RDMA and NFS Setup
|
||||
========================
|
||||
|
||||
Before configuring the NFS/RDMA software, it is a good idea to test
|
||||
your new kernel to ensure that the kernel is working correctly.
|
||||
In particular, it is a good idea to verify that the RDMA stack
|
||||
is functioning as expected and standard NFS over TCP/IP and/or UDP/IP
|
||||
is working properly.
|
||||
|
||||
- Check RDMA Setup
|
||||
|
||||
If you built the RDMA components as modules, load them at
|
||||
this time. For example, if you are using a Mellanox Tavor/Sinai/Arbel
|
||||
card:
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
$ modprobe ib_mthca
|
||||
$ modprobe ib_ipoib
|
||||
|
||||
If you are using InfiniBand, make sure there is a Subnet Manager (SM)
|
||||
running on the network. If your IB switch has an embedded SM, you can
|
||||
use it. Otherwise, you will need to run an SM, such as OpenSM, on one
|
||||
of your end nodes.
|
||||
|
||||
If an SM is running on your network, you should see the following:
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
$ cat /sys/class/infiniband/driverX/ports/1/state
|
||||
4: ACTIVE
|
||||
|
||||
where driverX is mthca0, ipath5, ehca3, etc.
|
||||
|
||||
To further test the InfiniBand software stack, use IPoIB (this
|
||||
assumes you have two IB hosts named host1 and host2):
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
host1$ ip link set dev ib0 up
|
||||
host1$ ip address add dev ib0 a.b.c.x
|
||||
host2$ ip link set dev ib0 up
|
||||
host2$ ip address add dev ib0 a.b.c.y
|
||||
host1$ ping a.b.c.y
|
||||
host2$ ping a.b.c.x
|
||||
|
||||
For other device types, follow the appropriate procedures.
|
||||
|
||||
- Check NFS Setup
|
||||
|
||||
For the NFS components enabled above (client and/or server),
|
||||
test their functionality over standard Ethernet using TCP/IP or UDP/IP.
|
||||
|
||||
NFS/RDMA Setup
|
||||
==============
|
||||
|
||||
We recommend that you use two machines, one to act as the client and
|
||||
one to act as the server.
|
||||
|
||||
One time configuration:
|
||||
-----------------------
|
||||
|
||||
- On the server system, configure the /etc/exports file and start the NFS/RDMA server.
|
||||
|
||||
Exports entries with the following formats have been tested::
|
||||
|
||||
/vol0 192.168.0.47(fsid=0,rw,async,insecure,no_root_squash)
|
||||
/vol0 192.168.0.0/255.255.255.0(fsid=0,rw,async,insecure,no_root_squash)
|
||||
|
||||
The IP address(es) is(are) the client's IPoIB address for an InfiniBand
|
||||
HCA or the client's iWARP address(es) for an RNIC.
|
||||
|
||||
.. note::
|
||||
The "insecure" option must be used because the NFS/RDMA client does
|
||||
not use a reserved port.
|
||||
|
||||
Each time a machine boots:
|
||||
--------------------------
|
||||
|
||||
- Load and configure the RDMA drivers
|
||||
|
||||
For InfiniBand using a Mellanox adapter:
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
$ modprobe ib_mthca
|
||||
$ modprobe ib_ipoib
|
||||
$ ip li set dev ib0 up
|
||||
$ ip addr add dev ib0 a.b.c.d
|
||||
|
||||
.. note::
|
||||
Please use unique addresses for the client and server!
|
||||
|
||||
- Start the NFS server
|
||||
|
||||
If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
|
||||
kernel config), load the RDMA transport module:
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
$ modprobe svcrdma
|
||||
|
||||
Regardless of how the server was built (module or built-in), start the
|
||||
server:
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
$ /etc/init.d/nfs start
|
||||
|
||||
or
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
$ service nfs start
|
||||
|
||||
Instruct the server to listen on the RDMA transport:
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
$ echo rdma 20049 > /proc/fs/nfsd/portlist
|
||||
|
||||
- On the client system
|
||||
|
||||
If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
|
||||
kernel config), load the RDMA client module:
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
$ modprobe xprtrdma.ko
|
||||
|
||||
Regardless of how the client was built (module or built-in), use this
|
||||
command to mount the NFS/RDMA server:
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
$ mount -o rdma,port=20049 <IPoIB-server-name-or-address>:/<export> /mnt
|
||||
|
||||
To verify that the mount is using RDMA, run "cat /proc/mounts" and check
|
||||
the "proto" field for the given mount.
|
||||
|
||||
Congratulations! You're using NFS/RDMA!
|
@@ -1,5 +1,6 @@
|
||||
==================================
|
||||
Administrative interfaces for nfsd
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
==================================
|
||||
|
||||
Note that normally these interfaces are used only by the utilities in
|
||||
nfs-utils.
|
||||
@@ -13,18 +14,16 @@ nfsd/threads.
|
||||
Before doing that, NFSD can be told which sockets to listen on by
|
||||
writing to nfsd/portlist; that write may be:
|
||||
|
||||
- an ascii-encoded file descriptor, which should refer to a
|
||||
bound (and listening, for tcp) socket, or
|
||||
- "transportname port", where transportname is currently either
|
||||
"udp", "tcp", or "rdma".
|
||||
- an ascii-encoded file descriptor, which should refer to a
|
||||
bound (and listening, for tcp) socket, or
|
||||
- "transportname port", where transportname is currently either
|
||||
"udp", "tcp", or "rdma".
|
||||
|
||||
If nfsd is started without doing any of these, then it will create one
|
||||
udp and one tcp listener at port 2049 (see nfsd_init_socks).
|
||||
|
||||
On startup, nfsd and lockd grace periods start.
|
||||
|
||||
nfsd is shut down by a write of 0 to nfsd/threads. All locks and state
|
||||
are thrown away at that point.
|
||||
On startup, nfsd and lockd grace periods start. nfsd is shut down by a write of
|
||||
0 to nfsd/threads. All locks and state are thrown away at that point.
|
||||
|
||||
Between startup and shutdown, the number of threads may be adjusted up
|
||||
or down by additional writes to nfsd/threads or by writes to
|
||||
@@ -34,7 +33,7 @@ For more detail about files under nfsd/ and what they control, see
|
||||
fs/nfsd/nfsctl.c; most of them have detailed comments.
|
||||
|
||||
Implementation notes
|
||||
^^^^^^^^^^^^^^^^^^^^
|
||||
====================
|
||||
|
||||
Note that the rpc server requires the caller to serialize addition and
|
||||
removal of listening sockets, and startup and shutdown of the server.
|
@@ -1,27 +1,34 @@
|
||||
===============================================
|
||||
Mounting the root filesystem via NFS (nfsroot)
|
||||
===============================================
|
||||
|
||||
Written 1996 by Gero Kuhlmann <gero@gkminix.han.de>
|
||||
Updated 1997 by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
|
||||
Updated 2006 by Nico Schottelius <nico-kernel-nfsroot@schottelius.org>
|
||||
Updated 2006 by Horms <horms@verge.net.au>
|
||||
Updated 2018 by Chris Novakovic <chris@chrisn.me.uk>
|
||||
:Authors:
|
||||
Written 1996 by Gero Kuhlmann <gero@gkminix.han.de>
|
||||
|
||||
Updated 1997 by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
|
||||
|
||||
Updated 2006 by Nico Schottelius <nico-kernel-nfsroot@schottelius.org>
|
||||
|
||||
Updated 2006 by Horms <horms@verge.net.au>
|
||||
|
||||
Updated 2018 by Chris Novakovic <chris@chrisn.me.uk>
|
||||
|
||||
|
||||
|
||||
In order to use a diskless system, such as an X-terminal or printer server
|
||||
for example, it is necessary for the root filesystem to be present on a
|
||||
non-disk device. This may be an initramfs (see Documentation/filesystems/
|
||||
ramfs-rootfs-initramfs.txt), a ramdisk (see Documentation/admin-guide/initrd.rst) or a
|
||||
filesystem mounted via NFS. The following text describes on how to use NFS
|
||||
for the root filesystem. For the rest of this text 'client' means the
|
||||
diskless system, and 'server' means the NFS server.
|
||||
In order to use a diskless system, such as an X-terminal or printer server for
|
||||
example, it is necessary for the root filesystem to be present on a non-disk
|
||||
device. This may be an initramfs (see
|
||||
Documentation/filesystems/ramfs-rootfs-initramfs.txt), a ramdisk (see
|
||||
Documentation/admin-guide/initrd.rst) or a filesystem mounted via NFS. The
|
||||
following text describes on how to use NFS for the root filesystem. For the rest
|
||||
of this text 'client' means the diskless system, and 'server' means the NFS
|
||||
server.
|
||||
|
||||
|
||||
|
||||
|
||||
1.) Enabling nfsroot capabilities
|
||||
-----------------------------
|
||||
Enabling nfsroot capabilities
|
||||
=============================
|
||||
|
||||
In order to use nfsroot, NFS client support needs to be selected as
|
||||
built-in during configuration. Once this has been selected, the nfsroot
|
||||
@@ -34,8 +41,8 @@ DHCP, BOOTP and RARP is safe.
|
||||
|
||||
|
||||
|
||||
2.) Kernel command line
|
||||
-------------------
|
||||
Kernel command line
|
||||
===================
|
||||
|
||||
When the kernel has been loaded by a boot loader (see below) it needs to be
|
||||
told what root fs device to use. And in the case of nfsroot, where to find
|
||||
@@ -44,19 +51,17 @@ This can be established using the following kernel command line parameters:
|
||||
|
||||
|
||||
root=/dev/nfs
|
||||
|
||||
This is necessary to enable the pseudo-NFS-device. Note that it's not a
|
||||
real device but just a synonym to tell the kernel to use NFS instead of
|
||||
a real device.
|
||||
|
||||
|
||||
nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
|
||||
|
||||
If the `nfsroot' parameter is NOT given on the command line,
|
||||
the default "/tftpboot/%s" will be used.
|
||||
the default ``"/tftpboot/%s"`` will be used.
|
||||
|
||||
<server-ip> Specifies the IP address of the NFS server.
|
||||
The default address is determined by the `ip' parameter
|
||||
The default address is determined by the ip parameter
|
||||
(see below). This parameter allows the use of different
|
||||
servers for IP autoconfiguration and NFS.
|
||||
|
||||
@@ -66,7 +71,8 @@ nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
|
||||
IP address.
|
||||
|
||||
<nfs-options> Standard NFS options. All options are separated by commas.
|
||||
The following defaults are used:
|
||||
The following defaults are used::
|
||||
|
||||
port = as given by server portmap daemon
|
||||
rsize = 4096
|
||||
wsize = 4096
|
||||
@@ -79,13 +85,11 @@ nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
|
||||
flags = hard, nointr, noposix, cto, ac
|
||||
|
||||
|
||||
ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
|
||||
<dns0-ip>:<dns1-ip>:<ntp0-ip>
|
||||
|
||||
ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:<dns0-ip>:<dns1-ip>:<ntp0-ip>
|
||||
This parameter tells the kernel how to configure IP addresses of devices
|
||||
and also how to set up the IP routing table. It was originally called
|
||||
`nfsaddrs', but now the boot-time IP configuration works independently of
|
||||
NFS, so it was renamed to `ip' and the old name remained as an alias for
|
||||
nfsaddrs, but now the boot-time IP configuration works independently of
|
||||
NFS, so it was renamed to ip and the old name remained as an alias for
|
||||
compatibility reasons.
|
||||
|
||||
If this parameter is missing from the kernel command line, all fields are
|
||||
@@ -93,17 +97,17 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
|
||||
this means that the kernel tries to configure everything using
|
||||
autoconfiguration.
|
||||
|
||||
The <autoconf> parameter can appear alone as the value to the `ip'
|
||||
The <autoconf> parameter can appear alone as the value to the ip
|
||||
parameter (without all the ':' characters before). If the value is
|
||||
"ip=off" or "ip=none", no autoconfiguration will take place, otherwise
|
||||
autoconfiguration will take place. The most common way to use this
|
||||
is "ip=dhcp".
|
||||
|
||||
<client-ip> IP address of the client.
|
||||
|
||||
Default: Determined using autoconfiguration.
|
||||
|
||||
<server-ip> IP address of the NFS server. If RARP is used to determine
|
||||
<server-ip> IP address of the NFS server.
|
||||
If RARP is used to determine
|
||||
the client address and this parameter is NOT empty only
|
||||
replies from the specified server are accepted.
|
||||
|
||||
@@ -115,19 +119,19 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
|
||||
(see below).
|
||||
|
||||
Default: Determined using autoconfiguration.
|
||||
The address of the autoconfiguration server is used.
|
||||
The address of the autoconfiguration server is used.
|
||||
|
||||
<gw-ip> IP address of a gateway if the server is on a different subnet.
|
||||
|
||||
Default: Determined using autoconfiguration.
|
||||
|
||||
<netmask> Netmask for local network interface. If unspecified
|
||||
the netmask is derived from the client IP address assuming
|
||||
classful addressing.
|
||||
<netmask> Netmask for local network interface.
|
||||
If unspecified the netmask is derived from the client IP address
|
||||
assuming classful addressing.
|
||||
|
||||
Default: Determined using autoconfiguration.
|
||||
|
||||
<hostname> Name of the client. If a '.' character is present, anything
|
||||
<hostname> Name of the client.
|
||||
If a '.' character is present, anything
|
||||
before the first '.' is used as the client's hostname, and anything
|
||||
after it is used as its NIS domain name. May be supplied by
|
||||
autoconfiguration, but its absence will not trigger autoconfiguration.
|
||||
@@ -138,21 +142,21 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
|
||||
Default: Client IP address is used in ASCII notation.
|
||||
|
||||
<device> Name of network device to use.
|
||||
|
||||
Default: If the host only has one device, it is used.
|
||||
Otherwise the device is determined using
|
||||
autoconfiguration. This is done by sending
|
||||
autoconfiguration requests out of all devices,
|
||||
and using the device that received the first reply.
|
||||
Otherwise the device is determined using
|
||||
autoconfiguration. This is done by sending
|
||||
autoconfiguration requests out of all devices,
|
||||
and using the device that received the first reply.
|
||||
|
||||
<autoconf> Method to use for autoconfiguration. In the case of options
|
||||
which specify multiple autoconfiguration protocols,
|
||||
<autoconf> Method to use for autoconfiguration.
|
||||
In the case of options
|
||||
which specify multiple autoconfiguration protocols,
|
||||
requests are sent using all protocols, and the first one
|
||||
to reply is used.
|
||||
|
||||
Only autoconfiguration protocols that have been compiled
|
||||
into the kernel will be used, regardless of the value of
|
||||
this option.
|
||||
this option::
|
||||
|
||||
off or none: don't use autoconfiguration
|
||||
(do static IP assignment instead)
|
||||
@@ -221,7 +225,6 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
|
||||
|
||||
|
||||
nfsrootdebug
|
||||
|
||||
This parameter enables debugging messages to appear in the kernel
|
||||
log at boot time so that administrators can verify that the correct
|
||||
NFS mount options, server address, and root path are passed to the
|
||||
@@ -229,36 +232,32 @@ nfsrootdebug
|
||||
|
||||
|
||||
rdinit=<executable file>
|
||||
|
||||
To specify which file contains the program that starts system
|
||||
initialization, administrators can use this command line parameter.
|
||||
The default value of this parameter is "/init". If the specified
|
||||
file exists and the kernel can execute it, root filesystem related
|
||||
kernel command line parameters, including `nfsroot=', are ignored.
|
||||
kernel command line parameters, including 'nfsroot=', are ignored.
|
||||
|
||||
A description of the process of mounting the root file system can be
|
||||
found in:
|
||||
|
||||
Documentation/driver-api/early-userspace/early_userspace_support.rst
|
||||
found in Documentation/driver-api/early-userspace/early_userspace_support.rst
|
||||
|
||||
|
||||
|
||||
|
||||
3.) Boot Loader
|
||||
----------
|
||||
Boot Loader
|
||||
===========
|
||||
|
||||
To get the kernel into memory different approaches can be used.
|
||||
They depend on various facilities being available:
|
||||
|
||||
|
||||
3.1) Booting from a floppy using syslinux
|
||||
- Booting from a floppy using syslinux
|
||||
|
||||
When building kernels, an easy way to create a boot floppy that uses
|
||||
syslinux is to use the zdisk or bzdisk make targets which use zimage
|
||||
and bzimage images respectively. Both targets accept the
|
||||
FDARGS parameter which can be used to set the kernel command line.
|
||||
|
||||
e.g.
|
||||
e.g::
|
||||
|
||||
make bzdisk FDARGS="root=/dev/nfs"
|
||||
|
||||
Note that the user running this command will need to have
|
||||
@@ -267,32 +266,36 @@ They depend on various facilities being available:
|
||||
For more information on syslinux, including how to create bootdisks
|
||||
for prebuilt kernels, see http://syslinux.zytor.com/
|
||||
|
||||
N.B: Previously it was possible to write a kernel directly to
|
||||
a floppy using dd, configure the boot device using rdev, and
|
||||
boot using the resulting floppy. Linux no longer supports this
|
||||
method of booting.
|
||||
.. note::
|
||||
Previously it was possible to write a kernel directly to
|
||||
a floppy using dd, configure the boot device using rdev, and
|
||||
boot using the resulting floppy. Linux no longer supports this
|
||||
method of booting.
|
||||
|
||||
3.2) Booting from a cdrom using isolinux
|
||||
- Booting from a cdrom using isolinux
|
||||
|
||||
When building kernels, an easy way to create a bootable cdrom that
|
||||
uses isolinux is to use the isoimage target which uses a bzimage
|
||||
image. Like zdisk and bzdisk, this target accepts the FDARGS
|
||||
parameter which can be used to set the kernel command line.
|
||||
|
||||
e.g.
|
||||
e.g::
|
||||
|
||||
make isoimage FDARGS="root=/dev/nfs"
|
||||
|
||||
The resulting iso image will be arch/<ARCH>/boot/image.iso
|
||||
This can be written to a cdrom using a variety of tools including
|
||||
cdrecord.
|
||||
|
||||
e.g.
|
||||
e.g::
|
||||
|
||||
cdrecord dev=ATAPI:1,0,0 arch/x86/boot/image.iso
|
||||
|
||||
For more information on isolinux, including how to create bootdisks
|
||||
for prebuilt kernels, see http://syslinux.zytor.com/
|
||||
|
||||
3.2) Using LILO
|
||||
- Using LILO
|
||||
|
||||
When using LILO all the necessary command line parameters may be
|
||||
specified using the 'append=' directive in the LILO configuration
|
||||
file.
|
||||
@@ -300,15 +303,19 @@ They depend on various facilities being available:
|
||||
However, to use the 'root=' directive you also need to create
|
||||
a dummy root device, which may be removed after LILO is run.
|
||||
|
||||
mknod /dev/boot255 c 0 255
|
||||
e.g::
|
||||
|
||||
mknod /dev/boot255 c 0 255
|
||||
|
||||
For information on configuring LILO, please refer to its documentation.
|
||||
|
||||
3.3) Using GRUB
|
||||
- Using GRUB
|
||||
|
||||
When using GRUB, kernel parameter are simply appended after the kernel
|
||||
specification: kernel <kernel> <parameters>
|
||||
|
||||
3.4) Using loadlin
|
||||
- Using loadlin
|
||||
|
||||
loadlin may be used to boot Linux from a DOS command prompt without
|
||||
requiring a local hard disk to mount as root. This has not been
|
||||
thoroughly tested by the authors of this document, but in general
|
||||
@@ -317,7 +324,8 @@ They depend on various facilities being available:
|
||||
|
||||
Please refer to the loadlin documentation for further information.
|
||||
|
||||
3.5) Using a boot ROM
|
||||
- Using a boot ROM
|
||||
|
||||
This is probably the most elegant way of booting a diskless client.
|
||||
With a boot ROM the kernel is loaded using the TFTP protocol. The
|
||||
authors of this document are not aware of any no commercial boot
|
||||
@@ -326,7 +334,8 @@ They depend on various facilities being available:
|
||||
etherboot, both of which are available on sunsite.unc.edu, and both
|
||||
of which contain everything you need to boot a diskless Linux client.
|
||||
|
||||
3.6) Using pxelinux
|
||||
- Using pxelinux
|
||||
|
||||
Pxelinux may be used to boot linux using the PXE boot loader
|
||||
which is present on many modern network cards.
|
||||
|
||||
@@ -342,8 +351,8 @@ They depend on various facilities being available:
|
||||
|
||||
|
||||
|
||||
4.) Credits
|
||||
-------
|
||||
Credits
|
||||
=======
|
||||
|
||||
The nfsroot code in the kernel and the RARP support have been written
|
||||
by Gero Kuhlmann <gero@gkminix.han.de>.
|
@@ -1,4 +1,6 @@
|
||||
===================================
|
||||
pNFS block layout server user guide
|
||||
===================================
|
||||
|
||||
The Linux NFS server now supports the pNFS block layout extension. In this
|
||||
case the NFS server acts as Metadata Server (MDS) for pNFS, which in addition
|
||||
@@ -22,16 +24,19 @@ If the nfsd server needs to fence a non-responding client it calls
|
||||
/sbin/nfsd-recall-failed with the first argument set to the IP address of
|
||||
the client, and the second argument set to the device node without the /dev
|
||||
prefix for the file system to be fenced. Below is an example file that shows
|
||||
how to translate the device into a serial number from SCSI EVPD 0x80:
|
||||
how to translate the device into a serial number from SCSI EVPD 0x80::
|
||||
|
||||
cat > /sbin/nfsd-recall-failed << EOF
|
||||
#!/bin/sh
|
||||
cat > /sbin/nfsd-recall-failed << EOF
|
||||
|
||||
CLIENT="$1"
|
||||
DEV="/dev/$2"
|
||||
EVPD=`sg_inq --page=0x80 ${DEV} | \
|
||||
grep "Unit serial number:" | \
|
||||
awk -F ': ' '{print $2}'`
|
||||
.. code-block:: sh
|
||||
|
||||
echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log
|
||||
EOF
|
||||
#!/bin/sh
|
||||
|
||||
CLIENT="$1"
|
||||
DEV="/dev/$2"
|
||||
EVPD=`sg_inq --page=0x80 ${DEV} | \
|
||||
grep "Unit serial number:" | \
|
||||
awk -F ': ' '{print $2}'`
|
||||
|
||||
echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log
|
||||
EOF
|
@@ -1,4 +1,5 @@
|
||||
|
||||
==================================
|
||||
pNFS SCSI layout server user guide
|
||||
==================================
|
||||
|
@@ -506,6 +506,9 @@ object corresponding to it, as follows:
|
||||
``disable``
|
||||
Whether or not this idle state is disabled.
|
||||
|
||||
``default_status``
|
||||
The default status of this state, "enabled" or "disabled".
|
||||
|
||||
``latency``
|
||||
Exit latency of the idle state in microseconds.
|
||||
|
||||
@@ -629,16 +632,16 @@ class priority list and destroyed. If that happens, the priority list mechanism
|
||||
will be used, again, to determine the new effective value for the whole list
|
||||
and that value will become the new real constraint.
|
||||
|
||||
In turn, for each CPU there is only one resume latency PM QoS request
|
||||
associated with the :file:`power/pm_qos_resume_latency_us` file under
|
||||
In turn, for each CPU there is one resume latency PM QoS request associated with
|
||||
the :file:`power/pm_qos_resume_latency_us` file under
|
||||
:file:`/sys/devices/system/cpu/cpu<N>/` in ``sysfs`` and writing to it causes
|
||||
this single PM QoS request to be updated regardless of which user space
|
||||
process does that. In other words, this PM QoS request is shared by the entire
|
||||
user space, so access to the file associated with it needs to be arbitrated
|
||||
to avoid confusion. [Arguably, the only legitimate use of this mechanism in
|
||||
practice is to pin a process to the CPU in question and let it use the
|
||||
``sysfs`` interface to control the resume latency constraint for it.] It
|
||||
still only is a request, however. It is a member of a priority list used to
|
||||
``sysfs`` interface to control the resume latency constraint for it.] It is
|
||||
still only a request, however. It is an entry in a priority list used to
|
||||
determine the effective value to be set as the resume latency constraint for the
|
||||
CPU in question every time the list of requests is updated this way or another
|
||||
(there may be other requests coming from kernel code in that list).
|
||||
|
268
Documentation/admin-guide/pm/intel_idle.rst
Normal file
268
Documentation/admin-guide/pm/intel_idle.rst
Normal file
@@ -0,0 +1,268 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
.. include:: <isonum.txt>
|
||||
|
||||
==============================================
|
||||
``intel_idle`` CPU Idle Time Management Driver
|
||||
==============================================
|
||||
|
||||
:Copyright: |copy| 2020 Intel Corporation
|
||||
|
||||
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
|
||||
|
||||
General Information
|
||||
===================
|
||||
|
||||
``intel_idle`` is a part of the
|
||||
:doc:`CPU idle time management subsystem <cpuidle>` in the Linux kernel
|
||||
(``CPUIdle``). It is the default CPU idle time management driver for the
|
||||
Nehalem and later generations of Intel processors, but the level of support for
|
||||
a particular processor model in it depends on whether or not it recognizes that
|
||||
processor model and may also depend on information coming from the platform
|
||||
firmware. [To understand ``intel_idle`` it is necessary to know how ``CPUIdle``
|
||||
works in general, so this is the time to get familiar with :doc:`cpuidle` if you
|
||||
have not done that yet.]
|
||||
|
||||
``intel_idle`` uses the ``MWAIT`` instruction to inform the processor that the
|
||||
logical CPU executing it is idle and so it may be possible to put some of the
|
||||
processor's functional blocks into low-power states. That instruction takes two
|
||||
arguments (passed in the ``EAX`` and ``ECX`` registers of the target CPU), the
|
||||
first of which, referred to as a *hint*, can be used by the processor to
|
||||
determine what can be done (for details refer to Intel Software Developer’s
|
||||
Manual [1]_). Accordingly, ``intel_idle`` refuses to work with processors in
|
||||
which the support for the ``MWAIT`` instruction has been disabled (for example,
|
||||
via the platform firmware configuration menu) or which do not support that
|
||||
instruction at all.
|
||||
|
||||
``intel_idle`` is not modular, so it cannot be unloaded, which means that the
|
||||
only way to pass early-configuration-time parameters to it is via the kernel
|
||||
command line.
|
||||
|
||||
|
||||
.. _intel-idle-enumeration-of-states:
|
||||
|
||||
Enumeration of Idle States
|
||||
==========================
|
||||
|
||||
Each ``MWAIT`` hint value is interpreted by the processor as a license to
|
||||
reconfigure itself in a certain way in order to save energy. The processor
|
||||
configurations (with reduced power draw) resulting from that are referred to
|
||||
as C-states (in the ACPI terminology) or idle states. The list of meaningful
|
||||
``MWAIT`` hint values and idle states (i.e. low-power configurations of the
|
||||
processor) corresponding to them depends on the processor model and it may also
|
||||
depend on the configuration of the platform.
|
||||
|
||||
In order to create a list of available idle states required by the ``CPUIdle``
|
||||
subsystem (see :ref:`idle-states-representation` in :doc:`cpuidle`),
|
||||
``intel_idle`` can use two sources of information: static tables of idle states
|
||||
for different processor models included in the driver itself and the ACPI tables
|
||||
of the system. The former are always used if the processor model at hand is
|
||||
recognized by ``intel_idle`` and the latter are used if that is required for
|
||||
the given processor model (which is the case for all server processor models
|
||||
recognized by ``intel_idle``) or if the processor model is not recognized.
|
||||
[There is a module parameter that can be used to make the driver use the ACPI
|
||||
tables with any processor model recognized by it; see
|
||||
`below <intel-idle-parameters_>`_.]
|
||||
|
||||
If the ACPI tables are going to be used for building the list of available idle
|
||||
states, ``intel_idle`` first looks for a ``_CST`` object under one of the ACPI
|
||||
objects corresponding to the CPUs in the system (refer to the ACPI specification
|
||||
[2]_ for the description of ``_CST`` and its output package). Because the
|
||||
``CPUIdle`` subsystem expects that the list of idle states supplied by the
|
||||
driver will be suitable for all of the CPUs handled by it and ``intel_idle`` is
|
||||
registered as the ``CPUIdle`` driver for all of the CPUs in the system, the
|
||||
driver looks for the first ``_CST`` object returning at least one valid idle
|
||||
state description and such that all of the idle states included in its return
|
||||
package are of the FFH (Functional Fixed Hardware) type, which means that the
|
||||
``MWAIT`` instruction is expected to be used to tell the processor that it can
|
||||
enter one of them. The return package of that ``_CST`` is then assumed to be
|
||||
applicable to all of the other CPUs in the system and the idle state
|
||||
descriptions extracted from it are stored in a preliminary list of idle states
|
||||
coming from the ACPI tables. [This step is skipped if ``intel_idle`` is
|
||||
configured to ignore the ACPI tables; see `below <intel-idle-parameters_>`_.]
|
||||
|
||||
Next, the first (index 0) entry in the list of available idle states is
|
||||
initialized to represent a "polling idle state" (a pseudo-idle state in which
|
||||
the target CPU continuously fetches and executes instructions), and the
|
||||
subsequent (real) idle state entries are populated as follows.
|
||||
|
||||
If the processor model at hand is recognized by ``intel_idle``, there is a
|
||||
(static) table of idle state descriptions for it in the driver. In that case,
|
||||
the "internal" table is the primary source of information on idle states and the
|
||||
information from it is copied to the final list of available idle states. If
|
||||
using the ACPI tables for the enumeration of idle states is not required
|
||||
(depending on the processor model), all of the listed idle state are enabled by
|
||||
default (so all of them will be taken into consideration by ``CPUIdle``
|
||||
governors during CPU idle state selection). Otherwise, some of the listed idle
|
||||
states may not be enabled by default if there are no matching entries in the
|
||||
preliminary list of idle states coming from the ACPI tables. In that case user
|
||||
space still can enable them later (on a per-CPU basis) with the help of
|
||||
the ``disable`` idle state attribute in ``sysfs`` (see
|
||||
:ref:`idle-states-representation` in :doc:`cpuidle`). This basically means that
|
||||
the idle states "known" to the driver may not be enabled by default if they have
|
||||
not been exposed by the platform firmware (through the ACPI tables).
|
||||
|
||||
If the given processor model is not recognized by ``intel_idle``, but it
|
||||
supports ``MWAIT``, the preliminary list of idle states coming from the ACPI
|
||||
tables is used for building the final list that will be supplied to the
|
||||
``CPUIdle`` core during driver registration. For each idle state in that list,
|
||||
the description, ``MWAIT`` hint and exit latency are copied to the corresponding
|
||||
entry in the final list of idle states. The name of the idle state represented
|
||||
by it (to be returned by the ``name`` idle state attribute in ``sysfs``) is
|
||||
"CX_ACPI", where X is the index of that idle state in the final list (note that
|
||||
the minimum value of X is 1, because 0 is reserved for the "polling" state), and
|
||||
its target residency is based on the exit latency value. Specifically, for
|
||||
C1-type idle states the exit latency value is also used as the target residency
|
||||
(for compatibility with the majority of the "internal" tables of idle states for
|
||||
various processor models recognized by ``intel_idle``) and for the other idle
|
||||
state types (C2 and C3) the target residency value is 3 times the exit latency
|
||||
(again, that is because it reflects the target residency to exit latency ratio
|
||||
in the majority of cases for the processor models recognized by ``intel_idle``).
|
||||
All of the idle states in the final list are enabled by default in this case.
|
||||
|
||||
|
||||
.. _intel-idle-initialization:
|
||||
|
||||
Initialization
|
||||
==============
|
||||
|
||||
The initialization of ``intel_idle`` starts with checking if the kernel command
|
||||
line options forbid the use of the ``MWAIT`` instruction. If that is the case,
|
||||
an error code is returned right away.
|
||||
|
||||
The next step is to check whether or not the processor model is known to the
|
||||
driver, which determines the idle states enumeration method (see
|
||||
`above <intel-idle-enumeration-of-states_>`_), and whether or not the processor
|
||||
supports ``MWAIT`` (the initialization fails if that is not the case). Then,
|
||||
the ``MWAIT`` support in the processor is enumerated through ``CPUID`` and the
|
||||
driver initialization fails if the level of support is not as expected (for
|
||||
example, if the total number of ``MWAIT`` substates returned is 0).
|
||||
|
||||
Next, if the driver is not configured to ignore the ACPI tables (see
|
||||
`below <intel-idle-parameters_>`_), the idle states information provided by the
|
||||
platform firmware is extracted from them.
|
||||
|
||||
Then, ``CPUIdle`` device objects are allocated for all CPUs and the list of
|
||||
available idle states is created as explained
|
||||
`above <intel-idle-enumeration-of-states_>`_.
|
||||
|
||||
Finally, ``intel_idle`` is registered with the help of cpuidle_register_driver()
|
||||
as the ``CPUIdle`` driver for all CPUs in the system and a CPU online callback
|
||||
for configuring individual CPUs is registered via cpuhp_setup_state(), which
|
||||
(among other things) causes the callback routine to be invoked for all of the
|
||||
CPUs present in the system at that time (each CPU executes its own instance of
|
||||
the callback routine). That routine registers a ``CPUIdle`` device for the CPU
|
||||
running it (which enables the ``CPUIdle`` subsystem to operate that CPU) and
|
||||
optionally performs some CPU-specific initialization actions that may be
|
||||
required for the given processor model.
|
||||
|
||||
|
||||
.. _intel-idle-parameters:
|
||||
|
||||
Kernel Command Line Options and Module Parameters
|
||||
=================================================
|
||||
|
||||
The *x86* architecture support code recognizes three kernel command line
|
||||
options related to CPU idle time management: ``idle=poll``, ``idle=halt``,
|
||||
and ``idle=nomwait``. If any of them is present in the kernel command line, the
|
||||
``MWAIT`` instruction is not allowed to be used, so the initialization of
|
||||
``intel_idle`` will fail.
|
||||
|
||||
Apart from that there are four module parameters recognized by ``intel_idle``
|
||||
itself that can be set via the kernel command line (they cannot be updated via
|
||||
sysfs, so that is the only way to change their values).
|
||||
|
||||
The ``max_cstate`` parameter value is the maximum idle state index in the list
|
||||
of idle states supplied to the ``CPUIdle`` core during the registration of the
|
||||
driver. It is also the maximum number of regular (non-polling) idle states that
|
||||
can be used by ``intel_idle``, so the enumeration of idle states is terminated
|
||||
after finding that number of usable idle states (the other idle states that
|
||||
potentially might have been used if ``max_cstate`` had been greater are not
|
||||
taken into consideration at all). Setting ``max_cstate`` can prevent
|
||||
``intel_idle`` from exposing idle states that are regarded as "too deep" for
|
||||
some reason to the ``CPUIdle`` core, but it does so by making them effectively
|
||||
invisible until the system is shut down and started again which may not always
|
||||
be desirable. In practice, it is only really necessary to do that if the idle
|
||||
states in question cannot be enabled during system startup, because in the
|
||||
working state of the system the CPU power management quality of service (PM
|
||||
QoS) feature can be used to prevent ``CPUIdle`` from touching those idle states
|
||||
even if they have been enumerated (see :ref:`cpu-pm-qos` in :doc:`cpuidle`).
|
||||
Setting ``max_cstate`` to 0 causes the ``intel_idle`` initialization to fail.
|
||||
|
||||
The ``no_acpi`` and ``use_acpi`` module parameters (recognized by ``intel_idle``
|
||||
if the kernel has been configured with ACPI support) can be set to make the
|
||||
driver ignore the system's ACPI tables entirely or use them for all of the
|
||||
recognized processor models, respectively (they both are unset by default and
|
||||
``use_acpi`` has no effect if ``no_acpi`` is set).
|
||||
|
||||
The value of the ``states_off`` module parameter (0 by default) represents a
|
||||
list of idle states to be disabled by default in the form of a bitmask.
|
||||
|
||||
Namely, the positions of the bits that are set in the ``states_off`` value are
|
||||
the indices of idle states to be disabled by default (as reflected by the names
|
||||
of the corresponding idle state directories in ``sysfs``, :file:`state0`,
|
||||
:file:`state1` ... :file:`state<i>` ..., where ``<i>`` is the index of the given
|
||||
idle state; see :ref:`idle-states-representation` in :doc:`cpuidle`).
|
||||
|
||||
For example, if ``states_off`` is equal to 3, the driver will disable idle
|
||||
states 0 and 1 by default, and if it is equal to 8, idle state 3 will be
|
||||
disabled by default and so on (bit positions beyond the maximum idle state index
|
||||
are ignored).
|
||||
|
||||
The idle states disabled this way can be enabled (on a per-CPU basis) from user
|
||||
space via ``sysfs``.
|
||||
|
||||
|
||||
.. _intel-idle-core-and-package-idle-states:
|
||||
|
||||
Core and Package Levels of Idle States
|
||||
======================================
|
||||
|
||||
Typically, in a processor supporting the ``MWAIT`` instruction there are (at
|
||||
least) two levels of idle states (or C-states). One level, referred to as
|
||||
"core C-states", covers individual cores in the processor, whereas the other
|
||||
level, referred to as "package C-states", covers the entire processor package
|
||||
and it may also involve other components of the system (GPUs, memory
|
||||
controllers, I/O hubs etc.).
|
||||
|
||||
Some of the ``MWAIT`` hint values allow the processor to use core C-states only
|
||||
(most importantly, that is the case for the ``MWAIT`` hint value corresponding
|
||||
to the ``C1`` idle state), but the majority of them give it a license to put
|
||||
the target core (i.e. the core containing the logical CPU executing ``MWAIT``
|
||||
with the given hint value) into a specific core C-state and then (if possible)
|
||||
to enter a specific package C-state at the deeper level. For example, the
|
||||
``MWAIT`` hint value representing the ``C3`` idle state allows the processor to
|
||||
put the target core into the low-power state referred to as "core ``C3``" (or
|
||||
``CC3``), which happens if all of the logical CPUs (SMT siblings) in that core
|
||||
have executed ``MWAIT`` with the ``C3`` hint value (or with a hint value
|
||||
representing a deeper idle state), and in addition to that (in the majority of
|
||||
cases) it gives the processor a license to put the entire package (possibly
|
||||
including some non-CPU components such as a GPU or a memory controller) into the
|
||||
low-power state referred to as "package ``C3``" (or ``PC3``), which happens if
|
||||
all of the cores have gone into the ``CC3`` state and (possibly) some additional
|
||||
conditions are satisfied (for instance, if the GPU is covered by ``PC3``, it may
|
||||
be required to be in a certain GPU-specific low-power state for ``PC3`` to be
|
||||
reachable).
|
||||
|
||||
As a rule, there is no simple way to make the processor use core C-states only
|
||||
if the conditions for entering the corresponding package C-states are met, so
|
||||
the logical CPU executing ``MWAIT`` with a hint value that is not core-level
|
||||
only (like for ``C1``) must always assume that this may cause the processor to
|
||||
enter a package C-state. [That is why the exit latency and target residency
|
||||
values corresponding to the majority of ``MWAIT`` hint values in the "internal"
|
||||
tables of idle states in ``intel_idle`` reflect the properties of package
|
||||
C-states.] If using package C-states is not desirable at all, either
|
||||
:ref:`PM QoS <cpu-pm-qos>` or the ``max_cstate`` module parameter of
|
||||
``intel_idle`` described `above <intel-idle-parameters_>`_ must be used to
|
||||
restrict the range of permissible idle states to the ones with core-level only
|
||||
``MWAIT`` hint values (like ``C1``).
|
||||
|
||||
|
||||
References
|
||||
==========
|
||||
|
||||
.. [1] *Intel® 64 and IA-32 Architectures Software Developer’s Manual Volume 2B*,
|
||||
https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-2b-manual.html
|
||||
|
||||
.. [2] *Advanced Configuration and Power Interface (ACPI) Specification*,
|
||||
https://uefi.org/specifications
|
@@ -153,8 +153,11 @@ for the given CPU architecture includes the low-level code for system resume.
|
||||
Basic ``sysfs`` Interfaces for System Suspend and Hibernation
|
||||
=============================================================
|
||||
|
||||
The following files located in the :file:`/sys/power/` directory can be used by
|
||||
user space for sleep states control.
|
||||
The power management subsystem provides userspace with a unified ``sysfs``
|
||||
interface for system sleep regardless of the underlying system architecture or
|
||||
platform. That interface is located in the :file:`/sys/power/` directory
|
||||
(assuming that ``sysfs`` is mounted at :file:`/sys`) and it consists of the
|
||||
following attributes (files):
|
||||
|
||||
``state``
|
||||
This file contains a list of strings representing sleep states supported
|
||||
@@ -162,9 +165,9 @@ user space for sleep states control.
|
||||
to start a transition of the system into the sleep state represented by
|
||||
that string.
|
||||
|
||||
In particular, the strings "disk", "freeze" and "standby" represent the
|
||||
In particular, the "disk", "freeze" and "standby" strings represent the
|
||||
:ref:`hibernation <hibernation>`, :ref:`suspend-to-idle <s2idle>` and
|
||||
:ref:`standby <standby>` sleep states, respectively. The string "mem"
|
||||
:ref:`standby <standby>` sleep states, respectively. The "mem" string
|
||||
is interpreted in accordance with the contents of the ``mem_sleep`` file
|
||||
described below.
|
||||
|
||||
@@ -177,7 +180,7 @@ user space for sleep states control.
|
||||
associated with the "mem" string in the ``state`` file described above.
|
||||
|
||||
The strings that may be present in this file are "s2idle", "shallow"
|
||||
and "deep". The string "s2idle" always represents :ref:`suspend-to-idle
|
||||
and "deep". The "s2idle" string always represents :ref:`suspend-to-idle
|
||||
<s2idle>` and, by convention, "shallow" and "deep" represent
|
||||
:ref:`standby <standby>` and :ref:`suspend-to-RAM <s2ram>`,
|
||||
respectively.
|
||||
@@ -185,15 +188,17 @@ user space for sleep states control.
|
||||
Writing one of the listed strings into this file causes the system
|
||||
suspend variant represented by it to be associated with the "mem" string
|
||||
in the ``state`` file. The string representing the suspend variant
|
||||
currently associated with the "mem" string in the ``state`` file
|
||||
is listed in square brackets.
|
||||
currently associated with the "mem" string in the ``state`` file is
|
||||
shown in square brackets.
|
||||
|
||||
If the kernel does not support system suspend, this file is not present.
|
||||
|
||||
``disk``
|
||||
This file contains a list of strings representing different operations
|
||||
that can be carried out after the hibernation image has been saved. The
|
||||
possible options are as follows:
|
||||
This file controls the operating mode of hibernation (Suspend-to-Disk).
|
||||
Specifically, it tells the kernel what to do after creating a
|
||||
hibernation image.
|
||||
|
||||
Reading from it returns a list of supported options encoded as:
|
||||
|
||||
``platform``
|
||||
Put the system into a special low-power state (e.g. ACPI S4) to
|
||||
@@ -201,6 +206,11 @@ user space for sleep states control.
|
||||
platform firmware to take a simplified initialization path after
|
||||
wakeup.
|
||||
|
||||
It is only available if the platform provides a special
|
||||
mechanism to put the system to sleep after creating a
|
||||
hibernation image (platforms with ACPI do that as a rule, for
|
||||
example).
|
||||
|
||||
``shutdown``
|
||||
Power off the system.
|
||||
|
||||
@@ -214,22 +224,53 @@ user space for sleep states control.
|
||||
the hibernation image and continue. Otherwise, use the image
|
||||
to restore the previous state of the system.
|
||||
|
||||
It is available if system suspend is supported.
|
||||
|
||||
``test_resume``
|
||||
Diagnostic operation. Load the image as though the system had
|
||||
just woken up from hibernation and the currently running kernel
|
||||
instance was a restore kernel and follow up with full system
|
||||
resume.
|
||||
|
||||
Writing one of the listed strings into this file causes the option
|
||||
Writing one of the strings listed above into this file causes the option
|
||||
represented by it to be selected.
|
||||
|
||||
The currently selected option is shown in square brackets which means
|
||||
The currently selected option is shown in square brackets, which means
|
||||
that the operation represented by it will be carried out after creating
|
||||
and saving the image next time hibernation is triggered by writing
|
||||
``disk`` to :file:`/sys/power/state`.
|
||||
and saving the image when hibernation is triggered by writing ``disk``
|
||||
to :file:`/sys/power/state`.
|
||||
|
||||
If the kernel does not support hibernation, this file is not present.
|
||||
|
||||
``image_size``
|
||||
This file controls the size of hibernation images.
|
||||
|
||||
It can be written a string representing a non-negative integer that will
|
||||
be used as a best-effort upper limit of the image size, in bytes. The
|
||||
hibernation core will do its best to ensure that the image size will not
|
||||
exceed that number, but if that turns out to be impossible to achieve, a
|
||||
hibernation image will still be created and its size will be as small as
|
||||
possible. In particular, writing '0' to this file causes the size of
|
||||
hibernation images to be minimum.
|
||||
|
||||
Reading from it returns the current image size limit, which is set to
|
||||
around 2/5 of the available RAM size by default.
|
||||
|
||||
``pm_trace``
|
||||
This file controls the "PM trace" mechanism saving the last suspend
|
||||
or resume event point in the RTC memory across reboots. It helps to
|
||||
debug hard lockups or reboots due to device driver failures that occur
|
||||
during system suspend or resume (which is more common) more effectively.
|
||||
|
||||
If it contains "1", the fingerprint of each suspend/resume event point
|
||||
in turn will be stored in the RTC memory (overwriting the actual RTC
|
||||
information), so it will survive a system crash if one occurs right
|
||||
after storing it and it can be used later to identify the driver that
|
||||
caused the crash to happen.
|
||||
|
||||
It contains "0" by default, which may be changed to "1" by writing a
|
||||
string representing a nonzero integer into it.
|
||||
|
||||
According to the above, there are two ways to make the system go into the
|
||||
:ref:`suspend-to-idle <s2idle>` state. The first one is to write "freeze"
|
||||
directly to :file:`/sys/power/state`. The second one is to write "s2idle" to
|
||||
@@ -244,6 +285,7 @@ system go into the :ref:`suspend-to-RAM <s2ram>` state (write "deep" into
|
||||
The default suspend variant (ie. the one to be used without writing anything
|
||||
into :file:`/sys/power/mem_sleep`) is either "deep" (on the majority of systems
|
||||
supporting :ref:`suspend-to-RAM <s2ram>`) or "s2idle", but it can be overridden
|
||||
by the value of the "mem_sleep_default" parameter in the kernel command line.
|
||||
On some ACPI-based systems, depending on the information in the ACPI tables, the
|
||||
default may be "s2idle" even if :ref:`suspend-to-RAM <s2ram>` is supported.
|
||||
by the value of the ``mem_sleep_default`` parameter in the kernel command line.
|
||||
On some systems with ACPI, depending on the information in the ACPI tables, the
|
||||
default may be "s2idle" even if :ref:`suspend-to-RAM <s2ram>` is supported in
|
||||
principle.
|
||||
|
@@ -8,6 +8,7 @@ Working-State Power Management
|
||||
:maxdepth: 2
|
||||
|
||||
cpuidle
|
||||
intel_idle
|
||||
cpufreq
|
||||
intel_pstate
|
||||
intel_epb
|
||||
|
@@ -1,6 +1,28 @@
|
||||
=============
|
||||
Thunderbolt
|
||||
=============
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
======================
|
||||
USB4 and Thunderbolt
|
||||
======================
|
||||
USB4 is the public specification based on Thunderbolt 3 protocol with
|
||||
some differences at the register level among other things. Connection
|
||||
manager is an entity running on the host router (host controller)
|
||||
responsible for enumerating routers and establishing tunnels. A
|
||||
connection manager can be implemented either in firmware or software.
|
||||
Typically PCs come with a firmware connection manager for Thunderbolt 3
|
||||
and early USB4 capable systems. Apple systems on the other hand use
|
||||
software connection manager and the later USB4 compliant devices follow
|
||||
the suit.
|
||||
|
||||
The Linux Thunderbolt driver supports both and can detect at runtime which
|
||||
connection manager implementation is to be used. To be on the safe side the
|
||||
software connection manager in Linux also advertises security level
|
||||
``user`` which means PCIe tunneling is disabled by default. The
|
||||
documentation below applies to both implementations with the exception that
|
||||
the software connection manager only supports ``user`` security level and
|
||||
is expected to be accompanied with an IOMMU based DMA protection.
|
||||
|
||||
Security levels and how to use them
|
||||
-----------------------------------
|
||||
The interface presented here is not meant for end users. Instead there
|
||||
should be a userspace tool that handles all the low-level details, keeps
|
||||
a database of the authorized devices and prompts users for new connections.
|
||||
@@ -18,8 +40,6 @@ This will authorize all devices automatically when they appear. However,
|
||||
keep in mind that this bypasses the security levels and makes the system
|
||||
vulnerable to DMA attacks.
|
||||
|
||||
Security levels and how to use them
|
||||
-----------------------------------
|
||||
Starting with Intel Falcon Ridge Thunderbolt controller there are 4
|
||||
security levels available. Intel Titan Ridge added one more security level
|
||||
(usbonly). The reason for these is the fact that the connected devices can
|
||||
|
@@ -92,6 +92,12 @@ the Microchip website: http://www.microchip.com.
|
||||
|
||||
http://ww1.microchip.com/downloads/en/DeviceDoc/DS60001517A.pdf
|
||||
|
||||
- sam9x60
|
||||
|
||||
* Datasheet
|
||||
|
||||
http://ww1.microchip.com/downloads/en/DeviceDoc/SAM9X60-Data-Sheet-DS60001579A.pdf
|
||||
|
||||
* ARM Cortex-A5 based SoCs
|
||||
- sama5d3 family
|
||||
|
||||
|
@@ -129,7 +129,7 @@ this logic.
|
||||
|
||||
As a single binary will need to support both 48-bit and 52-bit VA
|
||||
spaces, the VMEMMAP must be sized large enough for 52-bit VAs and
|
||||
also must be sized large enought to accommodate a fixed PAGE_OFFSET.
|
||||
also must be sized large enough to accommodate a fixed PAGE_OFFSET.
|
||||
|
||||
Most code in the kernel should not need to consider the VA_BITS, for
|
||||
code that does need to know the VA size the variables are
|
||||
|
@@ -44,8 +44,15 @@ The AArch64 Tagged Address ABI has two stages of relaxation depending
|
||||
how the user addresses are used by the kernel:
|
||||
|
||||
1. User addresses not accessed by the kernel but used for address space
|
||||
management (e.g. ``mmap()``, ``mprotect()``, ``madvise()``). The use
|
||||
of valid tagged pointers in this context is always allowed.
|
||||
management (e.g. ``mprotect()``, ``madvise()``). The use of valid
|
||||
tagged pointers in this context is allowed with the exception of
|
||||
``brk()``, ``mmap()`` and the ``new_address`` argument to
|
||||
``mremap()`` as these have the potential to alias with existing
|
||||
user addresses.
|
||||
|
||||
NOTE: This behaviour changed in v5.6 and so some earlier kernels may
|
||||
incorrectly accept valid tagged pointers for the ``brk()``,
|
||||
``mmap()`` and ``mremap()`` system calls.
|
||||
|
||||
2. User addresses accessed by the kernel (e.g. ``write()``). This ABI
|
||||
relaxation is disabled by default and the application thread needs to
|
||||
|
@@ -73,10 +73,11 @@ The new macros are prefixed with the ``SYM_`` prefix and can be divided into
|
||||
three main groups:
|
||||
|
||||
1. ``SYM_FUNC_*`` -- to annotate C-like functions. This means functions with
|
||||
standard C calling conventions, i.e. the stack contains a return address at
|
||||
the predefined place and a return from the function can happen in a
|
||||
standard way. When frame pointers are enabled, save/restore of frame
|
||||
pointer shall happen at the start/end of a function, respectively, too.
|
||||
standard C calling conventions. For example, on x86, this means that the
|
||||
stack contains a return address at the predefined place and a return from
|
||||
the function can happen in a standard way. When frame pointers are enabled,
|
||||
save/restore of frame pointer shall happen at the start/end of a function,
|
||||
respectively, too.
|
||||
|
||||
Checking tools like ``objtool`` should ensure such marked functions conform
|
||||
to these rules. The tools can also easily annotate these functions with
|
||||
|
@@ -47,7 +47,7 @@ Having a real iterator, and making biovecs immutable, has a number of
|
||||
advantages:
|
||||
|
||||
* Before, iterating over bios was very awkward when you weren't processing
|
||||
exactly one bvec at a time - for example, bio_copy_data() in fs/bio.c,
|
||||
exactly one bvec at a time - for example, bio_copy_data() in block/bio.c,
|
||||
which copies the contents of one bio into another. Because the biovecs
|
||||
wouldn't necessarily be the same size, the old code was tricky convoluted -
|
||||
it had to walk two different bios at the same time, keeping both bi_idx and
|
||||
|
@@ -31,6 +31,7 @@ Core utilities
|
||||
generic-radix-tree
|
||||
memory-allocation
|
||||
mm-api
|
||||
pin_user_pages
|
||||
gfp_mask-from-fs-io
|
||||
timekeeping
|
||||
boot-time-mm
|
||||
@@ -39,6 +40,8 @@ Core utilities
|
||||
../RCU/index
|
||||
gcc-plugins
|
||||
symbol-namespaces
|
||||
padata
|
||||
ioctl
|
||||
|
||||
|
||||
Interfaces for kernel debugging
|
||||
|
253
Documentation/core-api/ioctl.rst
Normal file
253
Documentation/core-api/ioctl.rst
Normal file
@@ -0,0 +1,253 @@
|
||||
======================
|
||||
ioctl based interfaces
|
||||
======================
|
||||
|
||||
ioctl() is the most common way for applications to interface
|
||||
with device drivers. It is flexible and easily extended by adding new
|
||||
commands and can be passed through character devices, block devices as
|
||||
well as sockets and other special file descriptors.
|
||||
|
||||
However, it is also very easy to get ioctl command definitions wrong,
|
||||
and hard to fix them later without breaking existing applications,
|
||||
so this documentation tries to help developers get it right.
|
||||
|
||||
Command number definitions
|
||||
==========================
|
||||
|
||||
The command number, or request number, is the second argument passed to
|
||||
the ioctl system call. While this can be any 32-bit number that uniquely
|
||||
identifies an action for a particular driver, there are a number of
|
||||
conventions around defining them.
|
||||
|
||||
``include/uapi/asm-generic/ioctl.h`` provides four macros for defining
|
||||
ioctl commands that follow modern conventions: ``_IO``, ``_IOR``,
|
||||
``_IOW``, and ``_IOWR``. These should be used for all new commands,
|
||||
with the correct parameters:
|
||||
|
||||
_IO/_IOR/_IOW/_IOWR
|
||||
The macro name specifies how the argument will be used. It may be a
|
||||
pointer to data to be passed into the kernel (_IOW), out of the kernel
|
||||
(_IOR), or both (_IOWR). _IO can indicate either commands with no
|
||||
argument or those passing an integer value instead of a pointer.
|
||||
It is recommended to only use _IO for commands without arguments,
|
||||
and use pointers for passing data.
|
||||
|
||||
type
|
||||
An 8-bit number, often a character literal, specific to a subsystem
|
||||
or driver, and listed in :doc:`../userspace-api/ioctl/ioctl-number`
|
||||
|
||||
nr
|
||||
An 8-bit number identifying the specific command, unique for a give
|
||||
value of 'type'
|
||||
|
||||
data_type
|
||||
The name of the data type pointed to by the argument, the command number
|
||||
encodes the ``sizeof(data_type)`` value in a 13-bit or 14-bit integer,
|
||||
leading to a limit of 8191 bytes for the maximum size of the argument.
|
||||
Note: do not pass sizeof(data_type) type into _IOR/_IOW/IOWR, as that
|
||||
will lead to encoding sizeof(sizeof(data_type)), i.e. sizeof(size_t).
|
||||
_IO does not have a data_type parameter.
|
||||
|
||||
|
||||
Interface versions
|
||||
==================
|
||||
|
||||
Some subsystems use version numbers in data structures to overload
|
||||
commands with different interpretations of the argument.
|
||||
|
||||
This is generally a bad idea, since changes to existing commands tend
|
||||
to break existing applications.
|
||||
|
||||
A better approach is to add a new ioctl command with a new number. The
|
||||
old command still needs to be implemented in the kernel for compatibility,
|
||||
but this can be a wrapper around the new implementation.
|
||||
|
||||
Return code
|
||||
===========
|
||||
|
||||
ioctl commands can return negative error codes as documented in errno(3);
|
||||
these get turned into errno values in user space. On success, the return
|
||||
code should be zero. It is also possible but not recommended to return
|
||||
a positive 'long' value.
|
||||
|
||||
When the ioctl callback is called with an unknown command number, the
|
||||
handler returns either -ENOTTY or -ENOIOCTLCMD, which also results in
|
||||
-ENOTTY being returned from the system call. Some subsystems return
|
||||
-ENOSYS or -EINVAL here for historic reasons, but this is wrong.
|
||||
|
||||
Prior to Linux 5.5, compat_ioctl handlers were required to return
|
||||
-ENOIOCTLCMD in order to use the fallback conversion into native
|
||||
commands. As all subsystems are now responsible for handling compat
|
||||
mode themselves, this is no longer needed, but it may be important to
|
||||
consider when backporting bug fixes to older kernels.
|
||||
|
||||
Timestamps
|
||||
==========
|
||||
|
||||
Traditionally, timestamps and timeout values are passed as ``struct
|
||||
timespec`` or ``struct timeval``, but these are problematic because of
|
||||
incompatible definitions of these structures in user space after the
|
||||
move to 64-bit time_t.
|
||||
|
||||
The ``struct __kernel_timespec`` type can be used instead to be embedded
|
||||
in other data structures when separate second/nanosecond values are
|
||||
desired, or passed to user space directly. This is still not ideal though,
|
||||
as the structure matches neither the kernel's timespec64 nor the user
|
||||
space timespec exactly. The get_timespec64() and put_timespec64() helper
|
||||
functions can be used to ensure that the layout remains compatible with
|
||||
user space and the padding is treated correctly.
|
||||
|
||||
As it is cheap to convert seconds to nanoseconds, but the opposite
|
||||
requires an expensive 64-bit division, a simple __u64 nanosecond value
|
||||
can be simpler and more efficient.
|
||||
|
||||
Timeout values and timestamps should ideally use CLOCK_MONOTONIC time,
|
||||
as returned by ktime_get_ns() or ktime_get_ts64(). Unlike
|
||||
CLOCK_REALTIME, this makes the timestamps immune from jumping backwards
|
||||
or forwards due to leap second adjustments and clock_settime() calls.
|
||||
|
||||
ktime_get_real_ns() can be used for CLOCK_REALTIME timestamps that
|
||||
need to be persistent across a reboot or between multiple machines.
|
||||
|
||||
32-bit compat mode
|
||||
==================
|
||||
|
||||
In order to support 32-bit user space running on a 64-bit machine, each
|
||||
subsystem or driver that implements an ioctl callback handler must also
|
||||
implement the corresponding compat_ioctl handler.
|
||||
|
||||
As long as all the rules for data structures are followed, this is as
|
||||
easy as setting the .compat_ioctl pointer to a helper function such as
|
||||
compat_ptr_ioctl() or blkdev_compat_ptr_ioctl().
|
||||
|
||||
compat_ptr()
|
||||
------------
|
||||
|
||||
On the s390 architecture, 31-bit user space has ambiguous representations
|
||||
for data pointers, with the upper bit being ignored. When running such
|
||||
a process in compat mode, the compat_ptr() helper must be used to
|
||||
clear the upper bit of a compat_uptr_t and turn it into a valid 64-bit
|
||||
pointer. On other architectures, this macro only performs a cast to a
|
||||
``void __user *`` pointer.
|
||||
|
||||
In an compat_ioctl() callback, the last argument is an unsigned long,
|
||||
which can be interpreted as either a pointer or a scalar depending on
|
||||
the command. If it is a scalar, then compat_ptr() must not be used, to
|
||||
ensure that the 64-bit kernel behaves the same way as a 32-bit kernel
|
||||
for arguments with the upper bit set.
|
||||
|
||||
The compat_ptr_ioctl() helper can be used in place of a custom
|
||||
compat_ioctl file operation for drivers that only take arguments that
|
||||
are pointers to compatible data structures.
|
||||
|
||||
Structure layout
|
||||
----------------
|
||||
|
||||
Compatible data structures have the same layout on all architectures,
|
||||
avoiding all problematic members:
|
||||
|
||||
* ``long`` and ``unsigned long`` are the size of a register, so
|
||||
they can be either 32-bit or 64-bit wide and cannot be used in portable
|
||||
data structures. Fixed-length replacements are ``__s32``, ``__u32``,
|
||||
``__s64`` and ``__u64``.
|
||||
|
||||
* Pointers have the same problem, in addition to requiring the
|
||||
use of compat_ptr(). The best workaround is to use ``__u64``
|
||||
in place of pointers, which requires a cast to ``uintptr_t`` in user
|
||||
space, and the use of u64_to_user_ptr() in the kernel to convert
|
||||
it back into a user pointer.
|
||||
|
||||
* On the x86-32 (i386) architecture, the alignment of 64-bit variables
|
||||
is only 32-bit, but they are naturally aligned on most other
|
||||
architectures including x86-64. This means a structure like::
|
||||
|
||||
struct foo {
|
||||
__u32 a;
|
||||
__u64 b;
|
||||
__u32 c;
|
||||
};
|
||||
|
||||
has four bytes of padding between a and b on x86-64, plus another four
|
||||
bytes of padding at the end, but no padding on i386, and it needs a
|
||||
compat_ioctl conversion handler to translate between the two formats.
|
||||
|
||||
To avoid this problem, all structures should have their members
|
||||
naturally aligned, or explicit reserved fields added in place of the
|
||||
implicit padding. The ``pahole`` tool can be used for checking the
|
||||
alignment.
|
||||
|
||||
* On ARM OABI user space, structures are padded to multiples of 32-bit,
|
||||
making some structs incompatible with modern EABI kernels if they
|
||||
do not end on a 32-bit boundary.
|
||||
|
||||
* On the m68k architecture, struct members are not guaranteed to have an
|
||||
alignment greater than 16-bit, which is a problem when relying on
|
||||
implicit padding.
|
||||
|
||||
* Bitfields and enums generally work as one would expect them to,
|
||||
but some properties of them are implementation-defined, so it is better
|
||||
to avoid them completely in ioctl interfaces.
|
||||
|
||||
* ``char`` members can be either signed or unsigned, depending on
|
||||
the architecture, so the __u8 and __s8 types should be used for 8-bit
|
||||
integer values, though char arrays are clearer for fixed-length strings.
|
||||
|
||||
Information leaks
|
||||
=================
|
||||
|
||||
Uninitialized data must not be copied back to user space, as this can
|
||||
cause an information leak, which can be used to defeat kernel address
|
||||
space layout randomization (KASLR), helping in an attack.
|
||||
|
||||
For this reason (and for compat support) it is best to avoid any
|
||||
implicit padding in data structures. Where there is implicit padding
|
||||
in an existing structure, kernel drivers must be careful to fully
|
||||
initialize an instance of the structure before copying it to user
|
||||
space. This is usually done by calling memset() before assigning to
|
||||
individual members.
|
||||
|
||||
Subsystem abstractions
|
||||
======================
|
||||
|
||||
While some device drivers implement their own ioctl function, most
|
||||
subsystems implement the same command for multiple drivers. Ideally the
|
||||
subsystem has an .ioctl() handler that copies the arguments from and
|
||||
to user space, passing them into subsystem specific callback functions
|
||||
through normal kernel pointers.
|
||||
|
||||
This helps in various ways:
|
||||
|
||||
* Applications written for one driver are more likely to work for
|
||||
another one in the same subsystem if there are no subtle differences
|
||||
in the user space ABI.
|
||||
|
||||
* The complexity of user space access and data structure layout is done
|
||||
in one place, reducing the potential for implementation bugs.
|
||||
|
||||
* It is more likely to be reviewed by experienced developers
|
||||
that can spot problems in the interface when the ioctl is shared
|
||||
between multiple drivers than when it is only used in a single driver.
|
||||
|
||||
Alternatives to ioctl
|
||||
=====================
|
||||
|
||||
There are many cases in which ioctl is not the best solution for a
|
||||
problem. Alternatives include:
|
||||
|
||||
* System calls are a better choice for a system-wide feature that
|
||||
is not tied to a physical device or constrained by the file system
|
||||
permissions of a character device node
|
||||
|
||||
* netlink is the preferred way of configuring any network related
|
||||
objects through sockets.
|
||||
|
||||
* debugfs is used for ad-hoc interfaces for debugging functionality
|
||||
that does not need to be exposed as a stable interface to applications.
|
||||
|
||||
* sysfs is a good way to expose the state of an in-kernel object
|
||||
that is not tied to a file descriptor.
|
||||
|
||||
* configfs can be used for more complex configuration than sysfs
|
||||
|
||||
* A custom file system can provide extra flexibility with a simple
|
||||
user interface but adds a lot of complexity to the implementation.
|
169
Documentation/core-api/padata.rst
Normal file
169
Documentation/core-api/padata.rst
Normal file
@@ -0,0 +1,169 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=======================================
|
||||
The padata parallel execution mechanism
|
||||
=======================================
|
||||
|
||||
:Date: December 2019
|
||||
|
||||
Padata is a mechanism by which the kernel can farm jobs out to be done in
|
||||
parallel on multiple CPUs while retaining their ordering. It was developed for
|
||||
use with the IPsec code, which needs to be able to perform encryption and
|
||||
decryption on large numbers of packets without reordering those packets. The
|
||||
crypto developers made a point of writing padata in a sufficiently general
|
||||
fashion that it could be put to other uses as well.
|
||||
|
||||
Usage
|
||||
=====
|
||||
|
||||
Initializing
|
||||
------------
|
||||
|
||||
The first step in using padata is to set up a padata_instance structure for
|
||||
overall control of how jobs are to be run::
|
||||
|
||||
#include <linux/padata.h>
|
||||
|
||||
struct padata_instance *padata_alloc_possible(const char *name);
|
||||
|
||||
'name' simply identifies the instance.
|
||||
|
||||
There are functions for enabling and disabling the instance::
|
||||
|
||||
int padata_start(struct padata_instance *pinst);
|
||||
void padata_stop(struct padata_instance *pinst);
|
||||
|
||||
These functions are setting or clearing the "PADATA_INIT" flag; if that flag is
|
||||
not set, other functions will refuse to work. padata_start() returns zero on
|
||||
success (flag set) or -EINVAL if the padata cpumask contains no active CPU
|
||||
(flag not set). padata_stop() clears the flag and blocks until the padata
|
||||
instance is unused.
|
||||
|
||||
Finally, complete padata initialization by allocating a padata_shell::
|
||||
|
||||
struct padata_shell *padata_alloc_shell(struct padata_instance *pinst);
|
||||
|
||||
A padata_shell is used to submit a job to padata and allows a series of such
|
||||
jobs to be serialized independently. A padata_instance may have one or more
|
||||
padata_shells associated with it, each allowing a separate series of jobs.
|
||||
|
||||
Modifying cpumasks
|
||||
------------------
|
||||
|
||||
The CPUs used to run jobs can be changed in two ways, programatically with
|
||||
padata_set_cpumask() or via sysfs. The former is defined::
|
||||
|
||||
int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
|
||||
cpumask_var_t cpumask);
|
||||
|
||||
Here cpumask_type is one of PADATA_CPU_PARALLEL or PADATA_CPU_SERIAL, where a
|
||||
parallel cpumask describes which processors will be used to execute jobs
|
||||
submitted to this instance in parallel and a serial cpumask defines which
|
||||
processors are allowed to be used as the serialization callback processor.
|
||||
cpumask specifies the new cpumask to use.
|
||||
|
||||
There may be sysfs files for an instance's cpumasks. For example, pcrypt's
|
||||
live in /sys/kernel/pcrypt/<instance-name>. Within an instance's directory
|
||||
there are two files, parallel_cpumask and serial_cpumask, and either cpumask
|
||||
may be changed by echoing a bitmask into the file, for example::
|
||||
|
||||
echo f > /sys/kernel/pcrypt/pencrypt/parallel_cpumask
|
||||
|
||||
Reading one of these files shows the user-supplied cpumask, which may be
|
||||
different from the 'usable' cpumask.
|
||||
|
||||
Padata maintains two pairs of cpumasks internally, the user-supplied cpumasks
|
||||
and the 'usable' cpumasks. (Each pair consists of a parallel and a serial
|
||||
cpumask.) The user-supplied cpumasks default to all possible CPUs on instance
|
||||
allocation and may be changed as above. The usable cpumasks are always a
|
||||
subset of the user-supplied cpumasks and contain only the online CPUs in the
|
||||
user-supplied masks; these are the cpumasks padata actually uses. So it is
|
||||
legal to supply a cpumask to padata that contains offline CPUs. Once an
|
||||
offline CPU in the user-supplied cpumask comes online, padata is going to use
|
||||
it.
|
||||
|
||||
Changing the CPU masks are expensive operations, so it should not be done with
|
||||
great frequency.
|
||||
|
||||
Running A Job
|
||||
-------------
|
||||
|
||||
Actually submitting work to the padata instance requires the creation of a
|
||||
padata_priv structure, which represents one job::
|
||||
|
||||
struct padata_priv {
|
||||
/* Other stuff here... */
|
||||
void (*parallel)(struct padata_priv *padata);
|
||||
void (*serial)(struct padata_priv *padata);
|
||||
};
|
||||
|
||||
This structure will almost certainly be embedded within some larger
|
||||
structure specific to the work to be done. Most of its fields are private to
|
||||
padata, but the structure should be zeroed at initialisation time, and the
|
||||
parallel() and serial() functions should be provided. Those functions will
|
||||
be called in the process of getting the work done as we will see
|
||||
momentarily.
|
||||
|
||||
The submission of the job is done with::
|
||||
|
||||
int padata_do_parallel(struct padata_shell *ps,
|
||||
struct padata_priv *padata, int *cb_cpu);
|
||||
|
||||
The ps and padata structures must be set up as described above; cb_cpu
|
||||
points to the preferred CPU to be used for the final callback when the job is
|
||||
done; it must be in the current instance's CPU mask (if not the cb_cpu pointer
|
||||
is updated to point to the CPU actually chosen). The return value from
|
||||
padata_do_parallel() is zero on success, indicating that the job is in
|
||||
progress. -EBUSY means that somebody, somewhere else is messing with the
|
||||
instance's CPU mask, while -EINVAL is a complaint about cb_cpu not being in the
|
||||
serial cpumask, no online CPUs in the parallel or serial cpumasks, or a stopped
|
||||
instance.
|
||||
|
||||
Each job submitted to padata_do_parallel() will, in turn, be passed to
|
||||
exactly one call to the above-mentioned parallel() function, on one CPU, so
|
||||
true parallelism is achieved by submitting multiple jobs. parallel() runs with
|
||||
software interrupts disabled and thus cannot sleep. The parallel()
|
||||
function gets the padata_priv structure pointer as its lone parameter;
|
||||
information about the actual work to be done is probably obtained by using
|
||||
container_of() to find the enclosing structure.
|
||||
|
||||
Note that parallel() has no return value; the padata subsystem assumes that
|
||||
parallel() will take responsibility for the job from this point. The job
|
||||
need not be completed during this call, but, if parallel() leaves work
|
||||
outstanding, it should be prepared to be called again with a new job before
|
||||
the previous one completes.
|
||||
|
||||
Serializing Jobs
|
||||
----------------
|
||||
|
||||
When a job does complete, parallel() (or whatever function actually finishes
|
||||
the work) should inform padata of the fact with a call to::
|
||||
|
||||
void padata_do_serial(struct padata_priv *padata);
|
||||
|
||||
At some point in the future, padata_do_serial() will trigger a call to the
|
||||
serial() function in the padata_priv structure. That call will happen on
|
||||
the CPU requested in the initial call to padata_do_parallel(); it, too, is
|
||||
run with local software interrupts disabled.
|
||||
Note that this call may be deferred for a while since the padata code takes
|
||||
pains to ensure that jobs are completed in the order in which they were
|
||||
submitted.
|
||||
|
||||
Destroying
|
||||
----------
|
||||
|
||||
Cleaning up a padata instance predictably involves calling the three free
|
||||
functions that correspond to the allocation in reverse::
|
||||
|
||||
void padata_free_shell(struct padata_shell *ps);
|
||||
void padata_stop(struct padata_instance *pinst);
|
||||
void padata_free(struct padata_instance *pinst);
|
||||
|
||||
It is the user's responsibility to ensure all outstanding jobs are complete
|
||||
before any of the above are called.
|
||||
|
||||
Interface
|
||||
=========
|
||||
|
||||
.. kernel-doc:: include/linux/padata.h
|
||||
.. kernel-doc:: kernel/padata.c
|
232
Documentation/core-api/pin_user_pages.rst
Normal file
232
Documentation/core-api/pin_user_pages.rst
Normal file
@@ -0,0 +1,232 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
====================================================
|
||||
pin_user_pages() and related calls
|
||||
====================================================
|
||||
|
||||
.. contents:: :local:
|
||||
|
||||
Overview
|
||||
========
|
||||
|
||||
This document describes the following functions::
|
||||
|
||||
pin_user_pages()
|
||||
pin_user_pages_fast()
|
||||
pin_user_pages_remote()
|
||||
|
||||
Basic description of FOLL_PIN
|
||||
=============================
|
||||
|
||||
FOLL_PIN and FOLL_LONGTERM are flags that can be passed to the get_user_pages*()
|
||||
("gup") family of functions. FOLL_PIN has significant interactions and
|
||||
interdependencies with FOLL_LONGTERM, so both are covered here.
|
||||
|
||||
FOLL_PIN is internal to gup, meaning that it should not appear at the gup call
|
||||
sites. This allows the associated wrapper functions (pin_user_pages*() and
|
||||
others) to set the correct combination of these flags, and to check for problems
|
||||
as well.
|
||||
|
||||
FOLL_LONGTERM, on the other hand, *is* allowed to be set at the gup call sites.
|
||||
This is in order to avoid creating a large number of wrapper functions to cover
|
||||
all combinations of get*(), pin*(), FOLL_LONGTERM, and more. Also, the
|
||||
pin_user_pages*() APIs are clearly distinct from the get_user_pages*() APIs, so
|
||||
that's a natural dividing line, and a good point to make separate wrapper calls.
|
||||
In other words, use pin_user_pages*() for DMA-pinned pages, and
|
||||
get_user_pages*() for other cases. There are four cases described later on in
|
||||
this document, to further clarify that concept.
|
||||
|
||||
FOLL_PIN and FOLL_GET are mutually exclusive for a given gup call. However,
|
||||
multiple threads and call sites are free to pin the same struct pages, via both
|
||||
FOLL_PIN and FOLL_GET. It's just the call site that needs to choose one or the
|
||||
other, not the struct page(s).
|
||||
|
||||
The FOLL_PIN implementation is nearly the same as FOLL_GET, except that FOLL_PIN
|
||||
uses a different reference counting technique.
|
||||
|
||||
FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying that is,
|
||||
FOLL_LONGTERM is a specific case, more restrictive case of FOLL_PIN.
|
||||
|
||||
Which flags are set by each wrapper
|
||||
===================================
|
||||
|
||||
For these pin_user_pages*() functions, FOLL_PIN is OR'd in with whatever gup
|
||||
flags the caller provides. The caller is required to pass in a non-null struct
|
||||
pages* array, and the function then pin pages by incrementing each by a special
|
||||
value. For now, that value is +1, just like get_user_pages*().::
|
||||
|
||||
Function
|
||||
--------
|
||||
pin_user_pages FOLL_PIN is always set internally by this function.
|
||||
pin_user_pages_fast FOLL_PIN is always set internally by this function.
|
||||
pin_user_pages_remote FOLL_PIN is always set internally by this function.
|
||||
|
||||
For these get_user_pages*() functions, FOLL_GET might not even be specified.
|
||||
Behavior is a little more complex than above. If FOLL_GET was *not* specified,
|
||||
but the caller passed in a non-null struct pages* array, then the function
|
||||
sets FOLL_GET for you, and proceeds to pin pages by incrementing the refcount
|
||||
of each page by +1.::
|
||||
|
||||
Function
|
||||
--------
|
||||
get_user_pages FOLL_GET is sometimes set internally by this function.
|
||||
get_user_pages_fast FOLL_GET is sometimes set internally by this function.
|
||||
get_user_pages_remote FOLL_GET is sometimes set internally by this function.
|
||||
|
||||
Tracking dma-pinned pages
|
||||
=========================
|
||||
|
||||
Some of the key design constraints, and solutions, for tracking dma-pinned
|
||||
pages:
|
||||
|
||||
* An actual reference count, per struct page, is required. This is because
|
||||
multiple processes may pin and unpin a page.
|
||||
|
||||
* False positives (reporting that a page is dma-pinned, when in fact it is not)
|
||||
are acceptable, but false negatives are not.
|
||||
|
||||
* struct page may not be increased in size for this, and all fields are already
|
||||
used.
|
||||
|
||||
* Given the above, we can overload the page->_refcount field by using, sort of,
|
||||
the upper bits in that field for a dma-pinned count. "Sort of", means that,
|
||||
rather than dividing page->_refcount into bit fields, we simple add a medium-
|
||||
large value (GUP_PIN_COUNTING_BIAS, initially chosen to be 1024: 10 bits) to
|
||||
page->_refcount. This provides fuzzy behavior: if a page has get_page() called
|
||||
on it 1024 times, then it will appear to have a single dma-pinned count.
|
||||
And again, that's acceptable.
|
||||
|
||||
This also leads to limitations: there are only 31-10==21 bits available for a
|
||||
counter that increments 10 bits at a time.
|
||||
|
||||
TODO: for 1GB and larger huge pages, this is cutting it close. That's because
|
||||
when pin_user_pages() follows such pages, it increments the head page by "1"
|
||||
(where "1" used to mean "+1" for get_user_pages(), but now means "+1024" for
|
||||
pin_user_pages()) for each tail page. So if you have a 1GB huge page:
|
||||
|
||||
* There are 256K (18 bits) worth of 4 KB tail pages.
|
||||
* There are 21 bits available to count up via GUP_PIN_COUNTING_BIAS (that is,
|
||||
10 bits at a time)
|
||||
* There are 21 - 18 == 3 bits available to count. Except that there aren't,
|
||||
because you need to allow for a few normal get_page() calls on the head page,
|
||||
as well. Fortunately, the approach of using addition, rather than "hard"
|
||||
bitfields, within page->_refcount, allows for sharing these bits gracefully.
|
||||
But we're still looking at about 8 references.
|
||||
|
||||
This, however, is a missing feature more than anything else, because it's easily
|
||||
solved by addressing an obvious inefficiency in the original get_user_pages()
|
||||
approach of retrieving pages: stop treating all the pages as if they were
|
||||
PAGE_SIZE. Retrieve huge pages as huge pages. The callers need to be aware of
|
||||
this, so some work is required. Once that's in place, this limitation mostly
|
||||
disappears from view, because there will be ample refcounting range available.
|
||||
|
||||
* Callers must specifically request "dma-pinned tracking of pages". In other
|
||||
words, just calling get_user_pages() will not suffice; a new set of functions,
|
||||
pin_user_page() and related, must be used.
|
||||
|
||||
FOLL_PIN, FOLL_GET, FOLL_LONGTERM: when to use which flags
|
||||
==========================================================
|
||||
|
||||
Thanks to Jan Kara, Vlastimil Babka and several other -mm people, for describing
|
||||
these categories:
|
||||
|
||||
CASE 1: Direct IO (DIO)
|
||||
-----------------------
|
||||
There are GUP references to pages that are serving
|
||||
as DIO buffers. These buffers are needed for a relatively short time (so they
|
||||
are not "long term"). No special synchronization with page_mkclean() or
|
||||
munmap() is provided. Therefore, flags to set at the call site are: ::
|
||||
|
||||
FOLL_PIN
|
||||
|
||||
...but rather than setting FOLL_PIN directly, call sites should use one of
|
||||
the pin_user_pages*() routines that set FOLL_PIN.
|
||||
|
||||
CASE 2: RDMA
|
||||
------------
|
||||
There are GUP references to pages that are serving as DMA
|
||||
buffers. These buffers are needed for a long time ("long term"). No special
|
||||
synchronization with page_mkclean() or munmap() is provided. Therefore, flags
|
||||
to set at the call site are: ::
|
||||
|
||||
FOLL_PIN | FOLL_LONGTERM
|
||||
|
||||
NOTE: Some pages, such as DAX pages, cannot be pinned with longterm pins. That's
|
||||
because DAX pages do not have a separate page cache, and so "pinning" implies
|
||||
locking down file system blocks, which is not (yet) supported in that way.
|
||||
|
||||
CASE 3: Hardware with page faulting support
|
||||
-------------------------------------------
|
||||
Here, a well-written driver doesn't normally need to pin pages at all. However,
|
||||
if the driver does choose to do so, it can register MMU notifiers for the range,
|
||||
and will be called back upon invalidation. Either way (avoiding page pinning, or
|
||||
using MMU notifiers to unpin upon request), there is proper synchronization with
|
||||
both filesystem and mm (page_mkclean(), munmap(), etc).
|
||||
|
||||
Therefore, neither flag needs to be set.
|
||||
|
||||
In this case, ideally, neither get_user_pages() nor pin_user_pages() should be
|
||||
called. Instead, the software should be written so that it does not pin pages.
|
||||
This allows mm and filesystems to operate more efficiently and reliably.
|
||||
|
||||
CASE 4: Pinning for struct page manipulation only
|
||||
-------------------------------------------------
|
||||
Here, normal GUP calls are sufficient, so neither flag needs to be set.
|
||||
|
||||
page_dma_pinned(): the whole point of pinning
|
||||
=============================================
|
||||
|
||||
The whole point of marking pages as "DMA-pinned" or "gup-pinned" is to be able
|
||||
to query, "is this page DMA-pinned?" That allows code such as page_mkclean()
|
||||
(and file system writeback code in general) to make informed decisions about
|
||||
what to do when a page cannot be unmapped due to such pins.
|
||||
|
||||
What to do in those cases is the subject of a years-long series of discussions
|
||||
and debates (see the References at the end of this document). It's a TODO item
|
||||
here: fill in the details once that's worked out. Meanwhile, it's safe to say
|
||||
that having this available: ::
|
||||
|
||||
static inline bool page_dma_pinned(struct page *page)
|
||||
|
||||
...is a prerequisite to solving the long-running gup+DMA problem.
|
||||
|
||||
Another way of thinking about FOLL_GET, FOLL_PIN, and FOLL_LONGTERM
|
||||
===================================================================
|
||||
|
||||
Another way of thinking about these flags is as a progression of restrictions:
|
||||
FOLL_GET is for struct page manipulation, without affecting the data that the
|
||||
struct page refers to. FOLL_PIN is a *replacement* for FOLL_GET, and is for
|
||||
short term pins on pages whose data *will* get accessed. As such, FOLL_PIN is
|
||||
a "more severe" form of pinning. And finally, FOLL_LONGTERM is an even more
|
||||
restrictive case that has FOLL_PIN as a prerequisite: this is for pages that
|
||||
will be pinned longterm, and whose data will be accessed.
|
||||
|
||||
Unit testing
|
||||
============
|
||||
This file::
|
||||
|
||||
tools/testing/selftests/vm/gup_benchmark.c
|
||||
|
||||
has the following new calls to exercise the new pin*() wrapper functions:
|
||||
|
||||
* PIN_FAST_BENCHMARK (./gup_benchmark -a)
|
||||
* PIN_BENCHMARK (./gup_benchmark -b)
|
||||
|
||||
You can monitor how many total dma-pinned pages have been acquired and released
|
||||
since the system was booted, via two new /proc/vmstat entries: ::
|
||||
|
||||
/proc/vmstat/nr_foll_pin_requested
|
||||
/proc/vmstat/nr_foll_pin_requested
|
||||
|
||||
Those are both going to show zero, unless CONFIG_DEBUG_VM is set. This is
|
||||
because there is a noticeable performance drop in unpin_user_page(), when they
|
||||
are activated.
|
||||
|
||||
References
|
||||
==========
|
||||
|
||||
* `Some slow progress on get_user_pages() (Apr 2, 2019) <https://lwn.net/Articles/784574/>`_
|
||||
* `DMA and get_user_pages() (LPC: Dec 12, 2018) <https://lwn.net/Articles/774411/>`_
|
||||
* `The trouble with get_user_pages() (Apr 30, 2018) <https://lwn.net/Articles/753027/>`_
|
||||
|
||||
John Hubbard, October, 2019
|
@@ -31,33 +31,23 @@ The counterparts to those functions are listed below.
|
||||
|
||||
::
|
||||
|
||||
int crypto_unregister_alg(struct crypto_alg *alg);
|
||||
int crypto_unregister_algs(struct crypto_alg *algs, int count);
|
||||
void crypto_unregister_alg(struct crypto_alg *alg);
|
||||
void crypto_unregister_algs(struct crypto_alg *algs, int count);
|
||||
|
||||
|
||||
Notice that both registration and unregistration functions do return a
|
||||
value, so make sure to handle errors. A return code of zero implies
|
||||
success. Any return code < 0 implies an error.
|
||||
The registration functions return 0 on success, or a negative errno
|
||||
value on failure. crypto_register_algs() succeeds only if it
|
||||
successfully registered all the given algorithms; if it fails partway
|
||||
through, then any changes are rolled back.
|
||||
|
||||
The bulk registration/unregistration functions register/unregister each
|
||||
transformation in the given array of length count. They handle errors as
|
||||
follows:
|
||||
|
||||
- crypto_register_algs() succeeds if and only if it successfully
|
||||
registers all the given transformations. If an error occurs partway
|
||||
through, then it rolls back successful registrations before returning
|
||||
the error code. Note that if a driver needs to handle registration
|
||||
errors for individual transformations, then it will need to use the
|
||||
non-bulk function crypto_register_alg() instead.
|
||||
|
||||
- crypto_unregister_algs() tries to unregister all the given
|
||||
transformations, continuing on error. It logs errors and always
|
||||
returns zero.
|
||||
The unregistration functions always succeed, so they don't have a
|
||||
return value. Don't try to unregister algorithms that aren't
|
||||
currently registered.
|
||||
|
||||
Single-Block Symmetric Ciphers [CIPHER]
|
||||
---------------------------------------
|
||||
|
||||
Example of transformations: aes, arc4, ...
|
||||
Example of transformations: aes, serpent, ...
|
||||
|
||||
This section describes the simplest of all transformation
|
||||
implementations, that being the CIPHER type used for symmetric ciphers.
|
||||
@@ -108,7 +98,7 @@ is also valid:
|
||||
Multi-Block Ciphers
|
||||
-------------------
|
||||
|
||||
Example of transformations: cbc(aes), ecb(arc4), ...
|
||||
Example of transformations: cbc(aes), chacha20, ...
|
||||
|
||||
This section describes the multi-block cipher transformation
|
||||
implementations. The multi-block ciphers are used for transformations
|
||||
@@ -169,10 +159,10 @@ are as follows:
|
||||
|
||||
::
|
||||
|
||||
int crypto_unregister_ahash(struct ahash_alg *alg);
|
||||
void crypto_unregister_ahash(struct ahash_alg *alg);
|
||||
|
||||
int crypto_unregister_shash(struct shash_alg *alg);
|
||||
int crypto_unregister_shashes(struct shash_alg *algs, int count);
|
||||
void crypto_unregister_shash(struct shash_alg *alg);
|
||||
void crypto_unregister_shashes(struct shash_alg *algs, int count);
|
||||
|
||||
|
||||
Cipher Definition With struct shash_alg and ahash_alg
|
||||
|
@@ -21,8 +21,8 @@ global variables yet.
|
||||
|
||||
Tag-based KASAN is only supported in Clang and requires version 7.0.0 or later.
|
||||
|
||||
Currently generic KASAN is supported for the x86_64, arm64, xtensa and s390
|
||||
architectures, and tag-based KASAN is supported only for arm64.
|
||||
Currently generic KASAN is supported for the x86_64, arm64, xtensa, s390 and
|
||||
riscv architectures, and tag-based KASAN is supported only for arm64.
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
@@ -29,7 +29,8 @@ Yes, well, mostly.
|
||||
|
||||
For the most part, the KUnit core framework (what you use to write the tests)
|
||||
can compile to any architecture; it compiles like just another part of the
|
||||
kernel and runs when the kernel boots. However, there is some infrastructure,
|
||||
kernel and runs when the kernel boots, or when built as a module, when the
|
||||
module is loaded. However, there is some infrastructure,
|
||||
like the KUnit Wrapper (``tools/testing/kunit/kunit.py``) that does not support
|
||||
other architectures.
|
||||
|
||||
|
@@ -49,6 +49,9 @@ to a standalone program that can be run like any other program directly inside
|
||||
of a host operating system; to be clear, it does not require any virtualization
|
||||
support; it is just a regular program.
|
||||
|
||||
Alternatively, kunit and kunit tests can be built as modules and tests will
|
||||
run when the test module is loaded.
|
||||
|
||||
KUnit is fast. Excluding build time, from invocation to completion KUnit can run
|
||||
several dozen tests in only 10 to 20 seconds; this might not sound like a big
|
||||
deal to some people, but having such fast and easy to run tests fundamentally
|
||||
|
@@ -539,6 +539,23 @@ Interspersed in the kernel logs you might see the following:
|
||||
|
||||
Congratulations, you just ran a KUnit test on the x86 architecture!
|
||||
|
||||
In a similar manner, kunit and kunit tests can also be built as modules,
|
||||
so if you wanted to run tests in this way you might add the following config
|
||||
options to your ``.config``:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
CONFIG_KUNIT=m
|
||||
CONFIG_KUNIT_EXAMPLE_TEST=m
|
||||
|
||||
Once the kernel is built and installed, a simple
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
modprobe example-test
|
||||
|
||||
...will run the tests.
|
||||
|
||||
Writing new tests for other architectures
|
||||
-----------------------------------------
|
||||
|
||||
|
@@ -59,6 +59,7 @@ properties:
|
||||
- friendlyarm,nanopi-k2
|
||||
- hardkernel,odroid-c2
|
||||
- nexbox,a95x
|
||||
- videostrong,kii-pro
|
||||
- wetek,hub
|
||||
- wetek,play2
|
||||
- const: amlogic,meson-gxbb
|
||||
@@ -104,6 +105,7 @@ properties:
|
||||
- enum:
|
||||
- amlogic,p230
|
||||
- amlogic,p231
|
||||
- libretech,aml-s905d-pc
|
||||
- phicomm,n1
|
||||
- const: amlogic,s905d
|
||||
- const: amlogic,meson-gxl
|
||||
@@ -115,6 +117,7 @@ properties:
|
||||
- amlogic,q201
|
||||
- khadas,vim2
|
||||
- kingnovel,r-box-pro
|
||||
- libretech,aml-s912-pc
|
||||
- nexbox,a1
|
||||
- tronsmart,vega-s96
|
||||
- const: amlogic,s912
|
||||
|
@@ -121,7 +121,7 @@ Required properties (in root node):
|
||||
Required nodes:
|
||||
|
||||
- soc: some node of the RealView platforms must be the SoC
|
||||
node that contain the SoC-specific devices, withe the compatible
|
||||
node that contain the SoC-specific devices, with the compatible
|
||||
string set to one of these tuples:
|
||||
"arm,realview-eb-soc", "simple-bus"
|
||||
"arm,realview-pb1176-soc", "simple-bus"
|
||||
|
@@ -35,6 +35,16 @@ properties:
|
||||
- atmel,at91sam9x60
|
||||
- const: atmel,at91sam9
|
||||
|
||||
- items:
|
||||
- enum:
|
||||
- overkiz,kizboxmini-base # Overkiz kizbox Mini Base Board
|
||||
- overkiz,kizboxmini-mb # Overkiz kizbox Mini Mother Board
|
||||
- overkiz,kizboxmini-rd # Overkiz kizbox Mini RailDIN
|
||||
- overkiz,smartkiz # Overkiz SmartKiz Board
|
||||
- const: atmel,at91sam9g25
|
||||
- const: atmel,at91sam9x5
|
||||
- const: atmel,at91sam9
|
||||
|
||||
- items:
|
||||
- enum:
|
||||
- atmel,at91sam9g15
|
||||
@@ -52,11 +62,32 @@ properties:
|
||||
- const: atmel,sama5d2
|
||||
- const: atmel,sama5
|
||||
|
||||
- description: Microchip SAMA5D27 WLSOM1
|
||||
items:
|
||||
- const: microchip,sama5d27-wlsom1
|
||||
- const: atmel,sama5d27
|
||||
- const: atmel,sama5d2
|
||||
- const: atmel,sama5
|
||||
|
||||
- description: Microchip SAMA5D27 WLSOM1 Evaluation Kit
|
||||
items:
|
||||
- const: microchip,sama5d27-wlsom1-ek
|
||||
- const: microchip,sama5d27-wlsom1
|
||||
- const: atmel,sama5d27
|
||||
- const: atmel,sama5d2
|
||||
- const: atmel,sama5
|
||||
|
||||
- items:
|
||||
- const: atmel,sama5d27
|
||||
- const: atmel,sama5d2
|
||||
- const: atmel,sama5
|
||||
|
||||
- description: SAM9X60-EK board
|
||||
items:
|
||||
- const: microchip,sam9x60ek
|
||||
- const: microchip,sam9x60
|
||||
- const: atmel,at91sam9
|
||||
|
||||
- description: Nattis v2 board with Natte v2 power board
|
||||
items:
|
||||
- const: axentia,nattis-2
|
||||
|
@@ -10,6 +10,12 @@ PIT Timer required properties:
|
||||
- interrupts: Should contain interrupt for the PIT which is the IRQ line
|
||||
shared across all System Controller members.
|
||||
|
||||
PIT64B Timer required properties:
|
||||
- compatible: Should be "microchip,sam9x60-pit64b"
|
||||
- reg: Should contain registers location and length
|
||||
- interrupts: Should contain interrupt for PIT64B timer
|
||||
- clocks: Should contain the available clock sources for PIT64B timer.
|
||||
|
||||
System Timer (ST) required properties:
|
||||
- compatible: Should be "atmel,at91rm9200-st", "syscon", "simple-mfd"
|
||||
- reg: Should contain registers location and length
|
||||
@@ -39,6 +45,7 @@ RAMC SDRAM/DDR Controller required properties:
|
||||
"atmel,at91sam9260-sdramc",
|
||||
"atmel,at91sam9g45-ddramc",
|
||||
"atmel,sama5d3-ddramc",
|
||||
"microchip,sam9x60-ddramc"
|
||||
- reg: Should contain registers location and length
|
||||
|
||||
Examples:
|
||||
|
@@ -242,6 +242,21 @@ properties:
|
||||
|
||||
where voltage is in V, frequency is in MHz.
|
||||
|
||||
power-domains:
|
||||
$ref: '/schemas/types.yaml#/definitions/phandle-array'
|
||||
description:
|
||||
List of phandles and PM domain specifiers, as defined by bindings of the
|
||||
PM domain provider (see also ../power_domain.txt).
|
||||
|
||||
power-domain-names:
|
||||
$ref: '/schemas/types.yaml#/definitions/string-array'
|
||||
description:
|
||||
A list of power domain name strings sorted in the same order as the
|
||||
power-domains property.
|
||||
|
||||
For PSCI based platforms, the name corresponding to the index of the PSCI
|
||||
PM domain provider, must be "psci".
|
||||
|
||||
qcom,saw:
|
||||
$ref: '/schemas/types.yaml#/definitions/phandle'
|
||||
description: |
|
||||
|
@@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/bindings/arm/fsl.yaml#
|
||||
$id: http://devicetree.org/schemas/arm/fsl.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: Freescale i.MX Platforms Device Tree Bindings
|
||||
@@ -128,6 +128,27 @@ properties:
|
||||
- variscite,dt6customboard
|
||||
- const: fsl,imx6q
|
||||
|
||||
- description: i.MX6Q Gateworks Ventana Boards
|
||||
items:
|
||||
- enum:
|
||||
- gw,imx6q-gw51xx
|
||||
- gw,imx6q-gw52xx
|
||||
- gw,imx6q-gw53xx
|
||||
- gw,imx6q-gw5400-a
|
||||
- gw,imx6q-gw54xx
|
||||
- gw,imx6q-gw551x
|
||||
- gw,imx6q-gw552x
|
||||
- gw,imx6q-gw553x
|
||||
- gw,imx6q-gw560x
|
||||
- gw,imx6q-gw5903
|
||||
- gw,imx6q-gw5904
|
||||
- gw,imx6q-gw5907
|
||||
- gw,imx6q-gw5910
|
||||
- gw,imx6q-gw5912
|
||||
- gw,imx6q-gw5913
|
||||
- const: gw,ventana
|
||||
- const: fsl,imx6q
|
||||
|
||||
- description: i.MX6QP based Boards
|
||||
items:
|
||||
- enum:
|
||||
@@ -154,10 +175,31 @@ properties:
|
||||
- ysoft,imx6dl-yapp4-ursa # i.MX6 Solo Y Soft IOTA Ursa board
|
||||
- const: fsl,imx6dl
|
||||
|
||||
- description: i.MX6DL Gateworks Ventana Boards
|
||||
items:
|
||||
- enum:
|
||||
- gw,imx6dl-gw51xx
|
||||
- gw,imx6dl-gw52xx
|
||||
- gw,imx6dl-gw53xx
|
||||
- gw,imx6dl-gw54xx
|
||||
- gw,imx6dl-gw551x
|
||||
- gw,imx6dl-gw552x
|
||||
- gw,imx6dl-gw553x
|
||||
- gw,imx6dl-gw560x
|
||||
- gw,imx6dl-gw5903
|
||||
- gw,imx6dl-gw5904
|
||||
- gw,imx6dl-gw5907
|
||||
- gw,imx6dl-gw5910
|
||||
- gw,imx6dl-gw5912
|
||||
- gw,imx6dl-gw5913
|
||||
- const: gw,ventana
|
||||
- const: fsl,imx6dl
|
||||
|
||||
- description: i.MX6SL based Boards
|
||||
items:
|
||||
- enum:
|
||||
- fsl,imx6sl-evk # i.MX6 SoloLite EVK Board
|
||||
- kobo,tolino-shine3
|
||||
- const: fsl,imx6sl
|
||||
|
||||
- description: i.MX6SLL based Boards
|
||||
@@ -172,6 +214,7 @@ properties:
|
||||
- enum:
|
||||
- fsl,imx6sx-sabreauto # i.MX6 SoloX Sabre Auto Board
|
||||
- fsl,imx6sx-sdb # i.MX6 SoloX SDB Board
|
||||
- fsl,imx6sx-sdb-reva # i.MX6 SoloX SDB Rev-A Board
|
||||
- const: fsl,imx6sx
|
||||
|
||||
- description: i.MX6UL based Boards
|
||||
@@ -239,6 +282,7 @@ properties:
|
||||
items:
|
||||
- enum:
|
||||
- fsl,imx7d-sdb # i.MX7 SabreSD Board
|
||||
- fsl,imx7d-sdb-reva # i.MX7 SabreSD Rev-A Board
|
||||
- novtech,imx7d-meerkat96 # i.MX7 Meerkat96 Board
|
||||
- toradex,colibri-imx7d # Colibri iMX7 Dual Module
|
||||
- toradex,colibri-imx7d-emmc # Colibri iMX7 Dual 1GB (eMMC) Module
|
||||
@@ -263,6 +307,7 @@ properties:
|
||||
- description: i.MX7ULP based Boards
|
||||
items:
|
||||
- enum:
|
||||
- ea,imx7ulp-com # i.MX7ULP Embedded Artists COM Board
|
||||
- fsl,imx7ulp-evk # i.MX7ULP Evaluation Kit
|
||||
- const: fsl,imx7ulp
|
||||
|
||||
@@ -283,7 +328,9 @@ properties:
|
||||
items:
|
||||
- enum:
|
||||
- boundary,imx8mq-nitrogen8m # i.MX8MQ NITROGEN Board
|
||||
- einfochips,imx8mq-thor96 # i.MX8MQ Thor96 Board
|
||||
- fsl,imx8mq-evk # i.MX8MQ EVK Board
|
||||
- google,imx8mq-phanbell # Google Coral Edge TPU
|
||||
- purism,librem5-devkit # Purism Librem5 devkit
|
||||
- solidrun,hummingboard-pulse # SolidRun Hummingboard Pulse
|
||||
- technexion,pico-pi-imx8m # TechNexion PICO-PI-8M evk
|
||||
@@ -385,6 +432,13 @@ properties:
|
||||
- fsl,ls2088a-rdb
|
||||
- const: fsl,ls2088a
|
||||
|
||||
- description: LX2160A based Boards
|
||||
items:
|
||||
- enum:
|
||||
- fsl,lx2160a-qds
|
||||
- fsl,lx2160a-rdb
|
||||
- const: fsl,lx2160a
|
||||
|
||||
- description: S32V234 based Boards
|
||||
items:
|
||||
- enum:
|
||||
|
@@ -1,706 +0,0 @@
|
||||
==========================================
|
||||
ARM idle states binding description
|
||||
==========================================
|
||||
|
||||
==========================================
|
||||
1 - Introduction
|
||||
==========================================
|
||||
|
||||
ARM systems contain HW capable of managing power consumption dynamically,
|
||||
where cores can be put in different low-power states (ranging from simple
|
||||
wfi to power gating) according to OS PM policies. The CPU states representing
|
||||
the range of dynamic idle states that a processor can enter at run-time, can be
|
||||
specified through device tree bindings representing the parameters required
|
||||
to enter/exit specific idle states on a given processor.
|
||||
|
||||
According to the Server Base System Architecture document (SBSA, [3]), the
|
||||
power states an ARM CPU can be put into are identified by the following list:
|
||||
|
||||
- Running
|
||||
- Idle_standby
|
||||
- Idle_retention
|
||||
- Sleep
|
||||
- Off
|
||||
|
||||
The power states described in the SBSA document define the basic CPU states on
|
||||
top of which ARM platforms implement power management schemes that allow an OS
|
||||
PM implementation to put the processor in different idle states (which include
|
||||
states listed above; "off" state is not an idle state since it does not have
|
||||
wake-up capabilities, hence it is not considered in this document).
|
||||
|
||||
Idle state parameters (e.g. entry latency) are platform specific and need to be
|
||||
characterized with bindings that provide the required information to OS PM
|
||||
code so that it can build the required tables and use them at runtime.
|
||||
|
||||
The device tree binding definition for ARM idle states is the subject of this
|
||||
document.
|
||||
|
||||
===========================================
|
||||
2 - idle-states definitions
|
||||
===========================================
|
||||
|
||||
Idle states are characterized for a specific system through a set of
|
||||
timing and energy related properties, that underline the HW behaviour
|
||||
triggered upon idle states entry and exit.
|
||||
|
||||
The following diagram depicts the CPU execution phases and related timing
|
||||
properties required to enter and exit an idle state:
|
||||
|
||||
..__[EXEC]__|__[PREP]__|__[ENTRY]__|__[IDLE]__|__[EXIT]__|__[EXEC]__..
|
||||
| | | | |
|
||||
|
||||
|<------ entry ------->|
|
||||
| latency |
|
||||
|<- exit ->|
|
||||
| latency |
|
||||
|<-------- min-residency -------->|
|
||||
|<------- wakeup-latency ------->|
|
||||
|
||||
Diagram 1: CPU idle state execution phases
|
||||
|
||||
EXEC: Normal CPU execution.
|
||||
|
||||
PREP: Preparation phase before committing the hardware to idle mode
|
||||
like cache flushing. This is abortable on pending wake-up
|
||||
event conditions. The abort latency is assumed to be negligible
|
||||
(i.e. less than the ENTRY + EXIT duration). If aborted, CPU
|
||||
goes back to EXEC. This phase is optional. If not abortable,
|
||||
this should be included in the ENTRY phase instead.
|
||||
|
||||
ENTRY: The hardware is committed to idle mode. This period must run
|
||||
to completion up to IDLE before anything else can happen.
|
||||
|
||||
IDLE: This is the actual energy-saving idle period. This may last
|
||||
between 0 and infinite time, until a wake-up event occurs.
|
||||
|
||||
EXIT: Period during which the CPU is brought back to operational
|
||||
mode (EXEC).
|
||||
|
||||
entry-latency: Worst case latency required to enter the idle state. The
|
||||
exit-latency may be guaranteed only after entry-latency has passed.
|
||||
|
||||
min-residency: Minimum period, including preparation and entry, for a given
|
||||
idle state to be worthwhile energywise.
|
||||
|
||||
wakeup-latency: Maximum delay between the signaling of a wake-up event and the
|
||||
CPU being able to execute normal code again. If not specified, this is assumed
|
||||
to be entry-latency + exit-latency.
|
||||
|
||||
These timing parameters can be used by an OS in different circumstances.
|
||||
|
||||
An idle CPU requires the expected min-residency time to select the most
|
||||
appropriate idle state based on the expected expiry time of the next IRQ
|
||||
(i.e. wake-up) that causes the CPU to return to the EXEC phase.
|
||||
|
||||
An operating system scheduler may need to compute the shortest wake-up delay
|
||||
for CPUs in the system by detecting how long will it take to get a CPU out
|
||||
of an idle state, e.g.:
|
||||
|
||||
wakeup-delay = exit-latency + max(entry-latency - (now - entry-timestamp), 0)
|
||||
|
||||
In other words, the scheduler can make its scheduling decision by selecting
|
||||
(e.g. waking-up) the CPU with the shortest wake-up delay.
|
||||
The wake-up delay must take into account the entry latency if that period
|
||||
has not expired. The abortable nature of the PREP period can be ignored
|
||||
if it cannot be relied upon (e.g. the PREP deadline may occur much sooner than
|
||||
the worst case since it depends on the CPU operating conditions, i.e. caches
|
||||
state).
|
||||
|
||||
An OS has to reliably probe the wakeup-latency since some devices can enforce
|
||||
latency constraint guarantees to work properly, so the OS has to detect the
|
||||
worst case wake-up latency it can incur if a CPU is allowed to enter an
|
||||
idle state, and possibly to prevent that to guarantee reliable device
|
||||
functioning.
|
||||
|
||||
The min-residency time parameter deserves further explanation since it is
|
||||
expressed in time units but must factor in energy consumption coefficients.
|
||||
|
||||
The energy consumption of a cpu when it enters a power state can be roughly
|
||||
characterised by the following graph:
|
||||
|
||||
|
|
||||
|
|
||||
|
|
||||
e |
|
||||
n | /---
|
||||
e | /------
|
||||
r | /------
|
||||
g | /-----
|
||||
y | /------
|
||||
| ----
|
||||
| /|
|
||||
| / |
|
||||
| / |
|
||||
| / |
|
||||
| / |
|
||||
| / |
|
||||
|/ |
|
||||
-----|-------+----------------------------------
|
||||
0| 1 time(ms)
|
||||
|
||||
Graph 1: Energy vs time example
|
||||
|
||||
The graph is split in two parts delimited by time 1ms on the X-axis.
|
||||
The graph curve with X-axis values = { x | 0 < x < 1ms } has a steep slope
|
||||
and denotes the energy costs incurred while entering and leaving the idle
|
||||
state.
|
||||
The graph curve in the area delimited by X-axis values = {x | x > 1ms } has
|
||||
shallower slope and essentially represents the energy consumption of the idle
|
||||
state.
|
||||
|
||||
min-residency is defined for a given idle state as the minimum expected
|
||||
residency time for a state (inclusive of preparation and entry) after
|
||||
which choosing that state become the most energy efficient option. A good
|
||||
way to visualise this, is by taking the same graph above and comparing some
|
||||
states energy consumptions plots.
|
||||
|
||||
For sake of simplicity, let's consider a system with two idle states IDLE1,
|
||||
and IDLE2:
|
||||
|
||||
|
|
||||
|
|
||||
|
|
||||
| /-- IDLE1
|
||||
e | /---
|
||||
n | /----
|
||||
e | /---
|
||||
r | /-----/--------- IDLE2
|
||||
g | /-------/---------
|
||||
y | ------------ /---|
|
||||
| / /---- |
|
||||
| / /--- |
|
||||
| / /---- |
|
||||
| / /--- |
|
||||
| --- |
|
||||
| / |
|
||||
| / |
|
||||
|/ | time
|
||||
---/----------------------------+------------------------
|
||||
|IDLE1-energy < IDLE2-energy | IDLE2-energy < IDLE1-energy
|
||||
|
|
||||
IDLE2-min-residency
|
||||
|
||||
Graph 2: idle states min-residency example
|
||||
|
||||
In graph 2 above, that takes into account idle states entry/exit energy
|
||||
costs, it is clear that if the idle state residency time (i.e. time till next
|
||||
wake-up IRQ) is less than IDLE2-min-residency, IDLE1 is the better idle state
|
||||
choice energywise.
|
||||
|
||||
This is mainly down to the fact that IDLE1 entry/exit energy costs are lower
|
||||
than IDLE2.
|
||||
|
||||
However, the lower power consumption (i.e. shallower energy curve slope) of
|
||||
idle state IDLE2 implies that after a suitable time, IDLE2 becomes more energy
|
||||
efficient.
|
||||
|
||||
The time at which IDLE2 becomes more energy efficient than IDLE1 (and other
|
||||
shallower states in a system with multiple idle states) is defined
|
||||
IDLE2-min-residency and corresponds to the time when energy consumption of
|
||||
IDLE1 and IDLE2 states breaks even.
|
||||
|
||||
The definitions provided in this section underpin the idle states
|
||||
properties specification that is the subject of the following sections.
|
||||
|
||||
===========================================
|
||||
3 - idle-states node
|
||||
===========================================
|
||||
|
||||
ARM processor idle states are defined within the idle-states node, which is
|
||||
a direct child of the cpus node [1] and provides a container where the
|
||||
processor idle states, defined as device tree nodes, are listed.
|
||||
|
||||
- idle-states node
|
||||
|
||||
Usage: Optional - On ARM systems, it is a container of processor idle
|
||||
states nodes. If the system does not provide CPU
|
||||
power management capabilities, or the processor just
|
||||
supports idle_standby, an idle-states node is not
|
||||
required.
|
||||
|
||||
Description: idle-states node is a container node, where its
|
||||
subnodes describe the CPU idle states.
|
||||
|
||||
Node name must be "idle-states".
|
||||
|
||||
The idle-states node's parent node must be the cpus node.
|
||||
|
||||
The idle-states node's child nodes can be:
|
||||
|
||||
- one or more state nodes
|
||||
|
||||
Any other configuration is considered invalid.
|
||||
|
||||
An idle-states node defines the following properties:
|
||||
|
||||
- entry-method
|
||||
Value type: <stringlist>
|
||||
Usage and definition depend on ARM architecture version.
|
||||
# On ARM v8 64-bit this property is required and must
|
||||
be:
|
||||
- "psci"
|
||||
# On ARM 32-bit systems this property is optional
|
||||
|
||||
This assumes that the "enable-method" property is set to "psci" in the cpu
|
||||
node[6] that is responsible for setting up CPU idle management in the OS
|
||||
implementation.
|
||||
|
||||
The nodes describing the idle states (state) can only be defined
|
||||
within the idle-states node, any other configuration is considered invalid
|
||||
and therefore must be ignored.
|
||||
|
||||
===========================================
|
||||
4 - state node
|
||||
===========================================
|
||||
|
||||
A state node represents an idle state description and must be defined as
|
||||
follows:
|
||||
|
||||
- state node
|
||||
|
||||
Description: must be child of the idle-states node
|
||||
|
||||
The state node name shall follow standard device tree naming
|
||||
rules ([5], 2.2.1 "Node names"), in particular state nodes which
|
||||
are siblings within a single common parent must be given a unique name.
|
||||
|
||||
The idle state entered by executing the wfi instruction (idle_standby
|
||||
SBSA,[3][4]) is considered standard on all ARM platforms and therefore
|
||||
must not be listed.
|
||||
|
||||
With the definitions provided above, the following list represents
|
||||
the valid properties for a state node:
|
||||
|
||||
- compatible
|
||||
Usage: Required
|
||||
Value type: <stringlist>
|
||||
Definition: Must be "arm,idle-state".
|
||||
|
||||
- local-timer-stop
|
||||
Usage: See definition
|
||||
Value type: <none>
|
||||
Definition: if present the CPU local timer control logic is
|
||||
lost on state entry, otherwise it is retained.
|
||||
|
||||
- entry-latency-us
|
||||
Usage: Required
|
||||
Value type: <prop-encoded-array>
|
||||
Definition: u32 value representing worst case latency in
|
||||
microseconds required to enter the idle state.
|
||||
|
||||
- exit-latency-us
|
||||
Usage: Required
|
||||
Value type: <prop-encoded-array>
|
||||
Definition: u32 value representing worst case latency
|
||||
in microseconds required to exit the idle state.
|
||||
The exit-latency-us duration may be guaranteed
|
||||
only after entry-latency-us has passed.
|
||||
|
||||
- min-residency-us
|
||||
Usage: Required
|
||||
Value type: <prop-encoded-array>
|
||||
Definition: u32 value representing minimum residency duration
|
||||
in microseconds, inclusive of preparation and
|
||||
entry, for this idle state to be considered
|
||||
worthwhile energy wise (refer to section 2 of
|
||||
this document for a complete description).
|
||||
|
||||
- wakeup-latency-us:
|
||||
Usage: Optional
|
||||
Value type: <prop-encoded-array>
|
||||
Definition: u32 value representing maximum delay between the
|
||||
signaling of a wake-up event and the CPU being
|
||||
able to execute normal code again. If omitted,
|
||||
this is assumed to be equal to:
|
||||
|
||||
entry-latency-us + exit-latency-us
|
||||
|
||||
It is important to supply this value on systems
|
||||
where the duration of PREP phase (see diagram 1,
|
||||
section 2) is non-neglibigle.
|
||||
In such systems entry-latency-us + exit-latency-us
|
||||
will exceed wakeup-latency-us by this duration.
|
||||
|
||||
- status:
|
||||
Usage: Optional
|
||||
Value type: <string>
|
||||
Definition: A standard device tree property [5] that indicates
|
||||
the operational status of an idle-state.
|
||||
If present, it shall be:
|
||||
"okay": to indicate that the idle state is
|
||||
operational.
|
||||
"disabled": to indicate that the idle state has
|
||||
been disabled in firmware so it is not
|
||||
operational.
|
||||
If the property is not present the idle-state must
|
||||
be considered operational.
|
||||
|
||||
- idle-state-name:
|
||||
Usage: Optional
|
||||
Value type: <string>
|
||||
Definition: A string used as a descriptive name for the idle
|
||||
state.
|
||||
|
||||
In addition to the properties listed above, a state node may require
|
||||
additional properties specific to the entry-method defined in the
|
||||
idle-states node. Please refer to the entry-method bindings
|
||||
documentation for properties definitions.
|
||||
|
||||
===========================================
|
||||
4 - Examples
|
||||
===========================================
|
||||
|
||||
Example 1 (ARM 64-bit, 16-cpu system, PSCI enable-method):
|
||||
|
||||
cpus {
|
||||
#size-cells = <0>;
|
||||
#address-cells = <2>;
|
||||
|
||||
CPU0: cpu@0 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a57";
|
||||
reg = <0x0 0x0>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||
};
|
||||
|
||||
CPU1: cpu@1 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a57";
|
||||
reg = <0x0 0x1>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||
};
|
||||
|
||||
CPU2: cpu@100 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a57";
|
||||
reg = <0x0 0x100>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||
};
|
||||
|
||||
CPU3: cpu@101 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a57";
|
||||
reg = <0x0 0x101>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||
};
|
||||
|
||||
CPU4: cpu@10000 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a57";
|
||||
reg = <0x0 0x10000>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||
};
|
||||
|
||||
CPU5: cpu@10001 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a57";
|
||||
reg = <0x0 0x10001>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||
};
|
||||
|
||||
CPU6: cpu@10100 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a57";
|
||||
reg = <0x0 0x10100>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||
};
|
||||
|
||||
CPU7: cpu@10101 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a57";
|
||||
reg = <0x0 0x10101>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||
};
|
||||
|
||||
CPU8: cpu@100000000 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a53";
|
||||
reg = <0x1 0x0>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||
};
|
||||
|
||||
CPU9: cpu@100000001 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a53";
|
||||
reg = <0x1 0x1>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||
};
|
||||
|
||||
CPU10: cpu@100000100 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a53";
|
||||
reg = <0x1 0x100>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||
};
|
||||
|
||||
CPU11: cpu@100000101 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a53";
|
||||
reg = <0x1 0x101>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||
};
|
||||
|
||||
CPU12: cpu@100010000 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a53";
|
||||
reg = <0x1 0x10000>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||
};
|
||||
|
||||
CPU13: cpu@100010001 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a53";
|
||||
reg = <0x1 0x10001>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||
};
|
||||
|
||||
CPU14: cpu@100010100 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a53";
|
||||
reg = <0x1 0x10100>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||
};
|
||||
|
||||
CPU15: cpu@100010101 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a53";
|
||||
reg = <0x1 0x10101>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||
};
|
||||
|
||||
idle-states {
|
||||
entry-method = "psci";
|
||||
|
||||
CPU_RETENTION_0_0: cpu-retention-0-0 {
|
||||
compatible = "arm,idle-state";
|
||||
arm,psci-suspend-param = <0x0010000>;
|
||||
entry-latency-us = <20>;
|
||||
exit-latency-us = <40>;
|
||||
min-residency-us = <80>;
|
||||
};
|
||||
|
||||
CLUSTER_RETENTION_0: cluster-retention-0 {
|
||||
compatible = "arm,idle-state";
|
||||
local-timer-stop;
|
||||
arm,psci-suspend-param = <0x1010000>;
|
||||
entry-latency-us = <50>;
|
||||
exit-latency-us = <100>;
|
||||
min-residency-us = <250>;
|
||||
wakeup-latency-us = <130>;
|
||||
};
|
||||
|
||||
CPU_SLEEP_0_0: cpu-sleep-0-0 {
|
||||
compatible = "arm,idle-state";
|
||||
local-timer-stop;
|
||||
arm,psci-suspend-param = <0x0010000>;
|
||||
entry-latency-us = <250>;
|
||||
exit-latency-us = <500>;
|
||||
min-residency-us = <950>;
|
||||
};
|
||||
|
||||
CLUSTER_SLEEP_0: cluster-sleep-0 {
|
||||
compatible = "arm,idle-state";
|
||||
local-timer-stop;
|
||||
arm,psci-suspend-param = <0x1010000>;
|
||||
entry-latency-us = <600>;
|
||||
exit-latency-us = <1100>;
|
||||
min-residency-us = <2700>;
|
||||
wakeup-latency-us = <1500>;
|
||||
};
|
||||
|
||||
CPU_RETENTION_1_0: cpu-retention-1-0 {
|
||||
compatible = "arm,idle-state";
|
||||
arm,psci-suspend-param = <0x0010000>;
|
||||
entry-latency-us = <20>;
|
||||
exit-latency-us = <40>;
|
||||
min-residency-us = <90>;
|
||||
};
|
||||
|
||||
CLUSTER_RETENTION_1: cluster-retention-1 {
|
||||
compatible = "arm,idle-state";
|
||||
local-timer-stop;
|
||||
arm,psci-suspend-param = <0x1010000>;
|
||||
entry-latency-us = <50>;
|
||||
exit-latency-us = <100>;
|
||||
min-residency-us = <270>;
|
||||
wakeup-latency-us = <100>;
|
||||
};
|
||||
|
||||
CPU_SLEEP_1_0: cpu-sleep-1-0 {
|
||||
compatible = "arm,idle-state";
|
||||
local-timer-stop;
|
||||
arm,psci-suspend-param = <0x0010000>;
|
||||
entry-latency-us = <70>;
|
||||
exit-latency-us = <100>;
|
||||
min-residency-us = <300>;
|
||||
wakeup-latency-us = <150>;
|
||||
};
|
||||
|
||||
CLUSTER_SLEEP_1: cluster-sleep-1 {
|
||||
compatible = "arm,idle-state";
|
||||
local-timer-stop;
|
||||
arm,psci-suspend-param = <0x1010000>;
|
||||
entry-latency-us = <500>;
|
||||
exit-latency-us = <1200>;
|
||||
min-residency-us = <3500>;
|
||||
wakeup-latency-us = <1300>;
|
||||
};
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
Example 2 (ARM 32-bit, 8-cpu system, two clusters):
|
||||
|
||||
cpus {
|
||||
#size-cells = <0>;
|
||||
#address-cells = <1>;
|
||||
|
||||
CPU0: cpu@0 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a15";
|
||||
reg = <0x0>;
|
||||
cpu-idle-states = <&CPU_SLEEP_0_0 &CLUSTER_SLEEP_0>;
|
||||
};
|
||||
|
||||
CPU1: cpu@1 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a15";
|
||||
reg = <0x1>;
|
||||
cpu-idle-states = <&CPU_SLEEP_0_0 &CLUSTER_SLEEP_0>;
|
||||
};
|
||||
|
||||
CPU2: cpu@2 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a15";
|
||||
reg = <0x2>;
|
||||
cpu-idle-states = <&CPU_SLEEP_0_0 &CLUSTER_SLEEP_0>;
|
||||
};
|
||||
|
||||
CPU3: cpu@3 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a15";
|
||||
reg = <0x3>;
|
||||
cpu-idle-states = <&CPU_SLEEP_0_0 &CLUSTER_SLEEP_0>;
|
||||
};
|
||||
|
||||
CPU4: cpu@100 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a7";
|
||||
reg = <0x100>;
|
||||
cpu-idle-states = <&CPU_SLEEP_1_0 &CLUSTER_SLEEP_1>;
|
||||
};
|
||||
|
||||
CPU5: cpu@101 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a7";
|
||||
reg = <0x101>;
|
||||
cpu-idle-states = <&CPU_SLEEP_1_0 &CLUSTER_SLEEP_1>;
|
||||
};
|
||||
|
||||
CPU6: cpu@102 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a7";
|
||||
reg = <0x102>;
|
||||
cpu-idle-states = <&CPU_SLEEP_1_0 &CLUSTER_SLEEP_1>;
|
||||
};
|
||||
|
||||
CPU7: cpu@103 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a7";
|
||||
reg = <0x103>;
|
||||
cpu-idle-states = <&CPU_SLEEP_1_0 &CLUSTER_SLEEP_1>;
|
||||
};
|
||||
|
||||
idle-states {
|
||||
CPU_SLEEP_0_0: cpu-sleep-0-0 {
|
||||
compatible = "arm,idle-state";
|
||||
local-timer-stop;
|
||||
entry-latency-us = <200>;
|
||||
exit-latency-us = <100>;
|
||||
min-residency-us = <400>;
|
||||
wakeup-latency-us = <250>;
|
||||
};
|
||||
|
||||
CLUSTER_SLEEP_0: cluster-sleep-0 {
|
||||
compatible = "arm,idle-state";
|
||||
local-timer-stop;
|
||||
entry-latency-us = <500>;
|
||||
exit-latency-us = <1500>;
|
||||
min-residency-us = <2500>;
|
||||
wakeup-latency-us = <1700>;
|
||||
};
|
||||
|
||||
CPU_SLEEP_1_0: cpu-sleep-1-0 {
|
||||
compatible = "arm,idle-state";
|
||||
local-timer-stop;
|
||||
entry-latency-us = <300>;
|
||||
exit-latency-us = <500>;
|
||||
min-residency-us = <900>;
|
||||
wakeup-latency-us = <600>;
|
||||
};
|
||||
|
||||
CLUSTER_SLEEP_1: cluster-sleep-1 {
|
||||
compatible = "arm,idle-state";
|
||||
local-timer-stop;
|
||||
entry-latency-us = <800>;
|
||||
exit-latency-us = <2000>;
|
||||
min-residency-us = <6500>;
|
||||
wakeup-latency-us = <2300>;
|
||||
};
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
===========================================
|
||||
5 - References
|
||||
===========================================
|
||||
|
||||
[1] ARM Linux Kernel documentation - CPUs bindings
|
||||
Documentation/devicetree/bindings/arm/cpus.yaml
|
||||
|
||||
[2] ARM Linux Kernel documentation - PSCI bindings
|
||||
Documentation/devicetree/bindings/arm/psci.yaml
|
||||
|
||||
[3] ARM Server Base System Architecture (SBSA)
|
||||
http://infocenter.arm.com/help/index.jsp
|
||||
|
||||
[4] ARM Architecture Reference Manuals
|
||||
http://infocenter.arm.com/help/index.jsp
|
||||
|
||||
[5] Devicetree Specification
|
||||
https://www.devicetree.org/specifications/
|
||||
|
||||
[6] ARM Linux Kernel documentation - Booting AArch64 Linux
|
||||
Documentation/arm64/booting.rst
|
661
Documentation/devicetree/bindings/arm/idle-states.yaml
Normal file
661
Documentation/devicetree/bindings/arm/idle-states.yaml
Normal file
@@ -0,0 +1,661 @@
|
||||
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/arm/idle-states.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: ARM idle states binding description
|
||||
|
||||
maintainers:
|
||||
- Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
|
||||
|
||||
description: |+
|
||||
==========================================
|
||||
1 - Introduction
|
||||
==========================================
|
||||
|
||||
ARM systems contain HW capable of managing power consumption dynamically,
|
||||
where cores can be put in different low-power states (ranging from simple wfi
|
||||
to power gating) according to OS PM policies. The CPU states representing the
|
||||
range of dynamic idle states that a processor can enter at run-time, can be
|
||||
specified through device tree bindings representing the parameters required to
|
||||
enter/exit specific idle states on a given processor.
|
||||
|
||||
According to the Server Base System Architecture document (SBSA, [3]), the
|
||||
power states an ARM CPU can be put into are identified by the following list:
|
||||
|
||||
- Running
|
||||
- Idle_standby
|
||||
- Idle_retention
|
||||
- Sleep
|
||||
- Off
|
||||
|
||||
The power states described in the SBSA document define the basic CPU states on
|
||||
top of which ARM platforms implement power management schemes that allow an OS
|
||||
PM implementation to put the processor in different idle states (which include
|
||||
states listed above; "off" state is not an idle state since it does not have
|
||||
wake-up capabilities, hence it is not considered in this document).
|
||||
|
||||
Idle state parameters (e.g. entry latency) are platform specific and need to
|
||||
be characterized with bindings that provide the required information to OS PM
|
||||
code so that it can build the required tables and use them at runtime.
|
||||
|
||||
The device tree binding definition for ARM idle states is the subject of this
|
||||
document.
|
||||
|
||||
===========================================
|
||||
2 - idle-states definitions
|
||||
===========================================
|
||||
|
||||
Idle states are characterized for a specific system through a set of
|
||||
timing and energy related properties, that underline the HW behaviour
|
||||
triggered upon idle states entry and exit.
|
||||
|
||||
The following diagram depicts the CPU execution phases and related timing
|
||||
properties required to enter and exit an idle state:
|
||||
|
||||
..__[EXEC]__|__[PREP]__|__[ENTRY]__|__[IDLE]__|__[EXIT]__|__[EXEC]__..
|
||||
| | | | |
|
||||
|
||||
|<------ entry ------->|
|
||||
| latency |
|
||||
|<- exit ->|
|
||||
| latency |
|
||||
|<-------- min-residency -------->|
|
||||
|<------- wakeup-latency ------->|
|
||||
|
||||
Diagram 1: CPU idle state execution phases
|
||||
|
||||
EXEC: Normal CPU execution.
|
||||
|
||||
PREP: Preparation phase before committing the hardware to idle mode
|
||||
like cache flushing. This is abortable on pending wake-up
|
||||
event conditions. The abort latency is assumed to be negligible
|
||||
(i.e. less than the ENTRY + EXIT duration). If aborted, CPU
|
||||
goes back to EXEC. This phase is optional. If not abortable,
|
||||
this should be included in the ENTRY phase instead.
|
||||
|
||||
ENTRY: The hardware is committed to idle mode. This period must run
|
||||
to completion up to IDLE before anything else can happen.
|
||||
|
||||
IDLE: This is the actual energy-saving idle period. This may last
|
||||
between 0 and infinite time, until a wake-up event occurs.
|
||||
|
||||
EXIT: Period during which the CPU is brought back to operational
|
||||
mode (EXEC).
|
||||
|
||||
entry-latency: Worst case latency required to enter the idle state. The
|
||||
exit-latency may be guaranteed only after entry-latency has passed.
|
||||
|
||||
min-residency: Minimum period, including preparation and entry, for a given
|
||||
idle state to be worthwhile energywise.
|
||||
|
||||
wakeup-latency: Maximum delay between the signaling of a wake-up event and the
|
||||
CPU being able to execute normal code again. If not specified, this is assumed
|
||||
to be entry-latency + exit-latency.
|
||||
|
||||
These timing parameters can be used by an OS in different circumstances.
|
||||
|
||||
An idle CPU requires the expected min-residency time to select the most
|
||||
appropriate idle state based on the expected expiry time of the next IRQ
|
||||
(i.e. wake-up) that causes the CPU to return to the EXEC phase.
|
||||
|
||||
An operating system scheduler may need to compute the shortest wake-up delay
|
||||
for CPUs in the system by detecting how long will it take to get a CPU out
|
||||
of an idle state, e.g.:
|
||||
|
||||
wakeup-delay = exit-latency + max(entry-latency - (now - entry-timestamp), 0)
|
||||
|
||||
In other words, the scheduler can make its scheduling decision by selecting
|
||||
(e.g. waking-up) the CPU with the shortest wake-up delay.
|
||||
The wake-up delay must take into account the entry latency if that period
|
||||
has not expired. The abortable nature of the PREP period can be ignored
|
||||
if it cannot be relied upon (e.g. the PREP deadline may occur much sooner than
|
||||
the worst case since it depends on the CPU operating conditions, i.e. caches
|
||||
state).
|
||||
|
||||
An OS has to reliably probe the wakeup-latency since some devices can enforce
|
||||
latency constraint guarantees to work properly, so the OS has to detect the
|
||||
worst case wake-up latency it can incur if a CPU is allowed to enter an
|
||||
idle state, and possibly to prevent that to guarantee reliable device
|
||||
functioning.
|
||||
|
||||
The min-residency time parameter deserves further explanation since it is
|
||||
expressed in time units but must factor in energy consumption coefficients.
|
||||
|
||||
The energy consumption of a cpu when it enters a power state can be roughly
|
||||
characterised by the following graph:
|
||||
|
||||
|
|
||||
|
|
||||
|
|
||||
e |
|
||||
n | /---
|
||||
e | /------
|
||||
r | /------
|
||||
g | /-----
|
||||
y | /------
|
||||
| ----
|
||||
| /|
|
||||
| / |
|
||||
| / |
|
||||
| / |
|
||||
| / |
|
||||
| / |
|
||||
|/ |
|
||||
-----|-------+----------------------------------
|
||||
0| 1 time(ms)
|
||||
|
||||
Graph 1: Energy vs time example
|
||||
|
||||
The graph is split in two parts delimited by time 1ms on the X-axis.
|
||||
The graph curve with X-axis values = { x | 0 < x < 1ms } has a steep slope
|
||||
and denotes the energy costs incurred while entering and leaving the idle
|
||||
state.
|
||||
The graph curve in the area delimited by X-axis values = {x | x > 1ms } has
|
||||
shallower slope and essentially represents the energy consumption of the idle
|
||||
state.
|
||||
|
||||
min-residency is defined for a given idle state as the minimum expected
|
||||
residency time for a state (inclusive of preparation and entry) after
|
||||
which choosing that state become the most energy efficient option. A good
|
||||
way to visualise this, is by taking the same graph above and comparing some
|
||||
states energy consumptions plots.
|
||||
|
||||
For sake of simplicity, let's consider a system with two idle states IDLE1,
|
||||
and IDLE2:
|
||||
|
||||
|
|
||||
|
|
||||
|
|
||||
| /-- IDLE1
|
||||
e | /---
|
||||
n | /----
|
||||
e | /---
|
||||
r | /-----/--------- IDLE2
|
||||
g | /-------/---------
|
||||
y | ------------ /---|
|
||||
| / /---- |
|
||||
| / /--- |
|
||||
| / /---- |
|
||||
| / /--- |
|
||||
| --- |
|
||||
| / |
|
||||
| / |
|
||||
|/ | time
|
||||
---/----------------------------+------------------------
|
||||
|IDLE1-energy < IDLE2-energy | IDLE2-energy < IDLE1-energy
|
||||
|
|
||||
IDLE2-min-residency
|
||||
|
||||
Graph 2: idle states min-residency example
|
||||
|
||||
In graph 2 above, that takes into account idle states entry/exit energy
|
||||
costs, it is clear that if the idle state residency time (i.e. time till next
|
||||
wake-up IRQ) is less than IDLE2-min-residency, IDLE1 is the better idle state
|
||||
choice energywise.
|
||||
|
||||
This is mainly down to the fact that IDLE1 entry/exit energy costs are lower
|
||||
than IDLE2.
|
||||
|
||||
However, the lower power consumption (i.e. shallower energy curve slope) of
|
||||
idle state IDLE2 implies that after a suitable time, IDLE2 becomes more energy
|
||||
efficient.
|
||||
|
||||
The time at which IDLE2 becomes more energy efficient than IDLE1 (and other
|
||||
shallower states in a system with multiple idle states) is defined
|
||||
IDLE2-min-residency and corresponds to the time when energy consumption of
|
||||
IDLE1 and IDLE2 states breaks even.
|
||||
|
||||
The definitions provided in this section underpin the idle states
|
||||
properties specification that is the subject of the following sections.
|
||||
|
||||
===========================================
|
||||
3 - idle-states node
|
||||
===========================================
|
||||
|
||||
ARM processor idle states are defined within the idle-states node, which is
|
||||
a direct child of the cpus node [1] and provides a container where the
|
||||
processor idle states, defined as device tree nodes, are listed.
|
||||
|
||||
On ARM systems, it is a container of processor idle states nodes. If the
|
||||
system does not provide CPU power management capabilities, or the processor
|
||||
just supports idle_standby, an idle-states node is not required.
|
||||
|
||||
===========================================
|
||||
4 - References
|
||||
===========================================
|
||||
|
||||
[1] ARM Linux Kernel documentation - CPUs bindings
|
||||
Documentation/devicetree/bindings/arm/cpus.yaml
|
||||
|
||||
[2] ARM Linux Kernel documentation - PSCI bindings
|
||||
Documentation/devicetree/bindings/arm/psci.yaml
|
||||
|
||||
[3] ARM Server Base System Architecture (SBSA)
|
||||
http://infocenter.arm.com/help/index.jsp
|
||||
|
||||
[4] ARM Architecture Reference Manuals
|
||||
http://infocenter.arm.com/help/index.jsp
|
||||
|
||||
[6] ARM Linux Kernel documentation - Booting AArch64 Linux
|
||||
Documentation/arm64/booting.rst
|
||||
|
||||
properties:
|
||||
$nodename:
|
||||
const: idle-states
|
||||
|
||||
entry-method:
|
||||
description: |
|
||||
Usage and definition depend on ARM architecture version.
|
||||
|
||||
On ARM v8 64-bit this property is required.
|
||||
On ARM 32-bit systems this property is optional
|
||||
|
||||
This assumes that the "enable-method" property is set to "psci" in the cpu
|
||||
node[6] that is responsible for setting up CPU idle management in the OS
|
||||
implementation.
|
||||
const: psci
|
||||
|
||||
patternProperties:
|
||||
"^(cpu|cluster)-":
|
||||
type: object
|
||||
description: |
|
||||
Each state node represents an idle state description and must be defined
|
||||
as follows.
|
||||
|
||||
The idle state entered by executing the wfi instruction (idle_standby
|
||||
SBSA,[3][4]) is considered standard on all ARM platforms and therefore
|
||||
must not be listed.
|
||||
|
||||
In addition to the properties listed above, a state node may require
|
||||
additional properties specific to the entry-method defined in the
|
||||
idle-states node. Please refer to the entry-method bindings
|
||||
documentation for properties definitions.
|
||||
|
||||
properties:
|
||||
compatible:
|
||||
const: arm,idle-state
|
||||
|
||||
local-timer-stop:
|
||||
description:
|
||||
If present the CPU local timer control logic is
|
||||
lost on state entry, otherwise it is retained.
|
||||
type: boolean
|
||||
|
||||
entry-latency-us:
|
||||
description:
|
||||
Worst case latency in microseconds required to enter the idle state.
|
||||
|
||||
exit-latency-us:
|
||||
description:
|
||||
Worst case latency in microseconds required to exit the idle state.
|
||||
The exit-latency-us duration may be guaranteed only after
|
||||
entry-latency-us has passed.
|
||||
|
||||
min-residency-us:
|
||||
description:
|
||||
Minimum residency duration in microseconds, inclusive of preparation
|
||||
and entry, for this idle state to be considered worthwhile energy wise
|
||||
(refer to section 2 of this document for a complete description).
|
||||
|
||||
wakeup-latency-us:
|
||||
description: |
|
||||
Maximum delay between the signaling of a wake-up event and the CPU
|
||||
being able to execute normal code again. If omitted, this is assumed
|
||||
to be equal to:
|
||||
|
||||
entry-latency-us + exit-latency-us
|
||||
|
||||
It is important to supply this value on systems where the duration of
|
||||
PREP phase (see diagram 1, section 2) is non-neglibigle. In such
|
||||
systems entry-latency-us + exit-latency-us will exceed
|
||||
wakeup-latency-us by this duration.
|
||||
|
||||
idle-state-name:
|
||||
$ref: /schemas/types.yaml#definitions/string
|
||||
description:
|
||||
A string used as a descriptive name for the idle state.
|
||||
|
||||
required:
|
||||
- compatible
|
||||
- entry-latency-us
|
||||
- exit-latency-us
|
||||
- min-residency-us
|
||||
|
||||
additionalProperties: false
|
||||
|
||||
examples:
|
||||
- |
|
||||
|
||||
cpus {
|
||||
#size-cells = <0>;
|
||||
#address-cells = <2>;
|
||||
|
||||
cpu@0 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a57";
|
||||
reg = <0x0 0x0>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||
};
|
||||
|
||||
cpu@1 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a57";
|
||||
reg = <0x0 0x1>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||
};
|
||||
|
||||
cpu@100 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a57";
|
||||
reg = <0x0 0x100>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||
};
|
||||
|
||||
cpu@101 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a57";
|
||||
reg = <0x0 0x101>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||
};
|
||||
|
||||
cpu@10000 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a57";
|
||||
reg = <0x0 0x10000>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||
};
|
||||
|
||||
cpu@10001 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a57";
|
||||
reg = <0x0 0x10001>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||
};
|
||||
|
||||
cpu@10100 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a57";
|
||||
reg = <0x0 0x10100>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||
};
|
||||
|
||||
cpu@10101 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a57";
|
||||
reg = <0x0 0x10101>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
|
||||
&CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
|
||||
};
|
||||
|
||||
cpu@100000000 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a53";
|
||||
reg = <0x1 0x0>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||
};
|
||||
|
||||
cpu@100000001 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a53";
|
||||
reg = <0x1 0x1>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||
};
|
||||
|
||||
cpu@100000100 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a53";
|
||||
reg = <0x1 0x100>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||
};
|
||||
|
||||
cpu@100000101 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a53";
|
||||
reg = <0x1 0x101>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||
};
|
||||
|
||||
cpu@100010000 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a53";
|
||||
reg = <0x1 0x10000>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||
};
|
||||
|
||||
cpu@100010001 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a53";
|
||||
reg = <0x1 0x10001>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||
};
|
||||
|
||||
cpu@100010100 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a53";
|
||||
reg = <0x1 0x10100>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||
};
|
||||
|
||||
cpu@100010101 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a53";
|
||||
reg = <0x1 0x10101>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
|
||||
&CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
|
||||
};
|
||||
|
||||
idle-states {
|
||||
entry-method = "psci";
|
||||
|
||||
CPU_RETENTION_0_0: cpu-retention-0-0 {
|
||||
compatible = "arm,idle-state";
|
||||
arm,psci-suspend-param = <0x0010000>;
|
||||
entry-latency-us = <20>;
|
||||
exit-latency-us = <40>;
|
||||
min-residency-us = <80>;
|
||||
};
|
||||
|
||||
CLUSTER_RETENTION_0: cluster-retention-0 {
|
||||
compatible = "arm,idle-state";
|
||||
local-timer-stop;
|
||||
arm,psci-suspend-param = <0x1010000>;
|
||||
entry-latency-us = <50>;
|
||||
exit-latency-us = <100>;
|
||||
min-residency-us = <250>;
|
||||
wakeup-latency-us = <130>;
|
||||
};
|
||||
|
||||
CPU_SLEEP_0_0: cpu-sleep-0-0 {
|
||||
compatible = "arm,idle-state";
|
||||
local-timer-stop;
|
||||
arm,psci-suspend-param = <0x0010000>;
|
||||
entry-latency-us = <250>;
|
||||
exit-latency-us = <500>;
|
||||
min-residency-us = <950>;
|
||||
};
|
||||
|
||||
CLUSTER_SLEEP_0: cluster-sleep-0 {
|
||||
compatible = "arm,idle-state";
|
||||
local-timer-stop;
|
||||
arm,psci-suspend-param = <0x1010000>;
|
||||
entry-latency-us = <600>;
|
||||
exit-latency-us = <1100>;
|
||||
min-residency-us = <2700>;
|
||||
wakeup-latency-us = <1500>;
|
||||
};
|
||||
|
||||
CPU_RETENTION_1_0: cpu-retention-1-0 {
|
||||
compatible = "arm,idle-state";
|
||||
arm,psci-suspend-param = <0x0010000>;
|
||||
entry-latency-us = <20>;
|
||||
exit-latency-us = <40>;
|
||||
min-residency-us = <90>;
|
||||
};
|
||||
|
||||
CLUSTER_RETENTION_1: cluster-retention-1 {
|
||||
compatible = "arm,idle-state";
|
||||
local-timer-stop;
|
||||
arm,psci-suspend-param = <0x1010000>;
|
||||
entry-latency-us = <50>;
|
||||
exit-latency-us = <100>;
|
||||
min-residency-us = <270>;
|
||||
wakeup-latency-us = <100>;
|
||||
};
|
||||
|
||||
CPU_SLEEP_1_0: cpu-sleep-1-0 {
|
||||
compatible = "arm,idle-state";
|
||||
local-timer-stop;
|
||||
arm,psci-suspend-param = <0x0010000>;
|
||||
entry-latency-us = <70>;
|
||||
exit-latency-us = <100>;
|
||||
min-residency-us = <300>;
|
||||
wakeup-latency-us = <150>;
|
||||
};
|
||||
|
||||
CLUSTER_SLEEP_1: cluster-sleep-1 {
|
||||
compatible = "arm,idle-state";
|
||||
local-timer-stop;
|
||||
arm,psci-suspend-param = <0x1010000>;
|
||||
entry-latency-us = <500>;
|
||||
exit-latency-us = <1200>;
|
||||
min-residency-us = <3500>;
|
||||
wakeup-latency-us = <1300>;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
- |
|
||||
// Example 2 (ARM 32-bit, 8-cpu system, two clusters):
|
||||
|
||||
cpus {
|
||||
#size-cells = <0>;
|
||||
#address-cells = <1>;
|
||||
|
||||
cpu@0 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a15";
|
||||
reg = <0x0>;
|
||||
cpu-idle-states = <&cpu_sleep_0_0 &cluster_sleep_0>;
|
||||
};
|
||||
|
||||
cpu@1 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a15";
|
||||
reg = <0x1>;
|
||||
cpu-idle-states = <&cpu_sleep_0_0 &cluster_sleep_0>;
|
||||
};
|
||||
|
||||
cpu@2 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a15";
|
||||
reg = <0x2>;
|
||||
cpu-idle-states = <&cpu_sleep_0_0 &cluster_sleep_0>;
|
||||
};
|
||||
|
||||
cpu@3 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a15";
|
||||
reg = <0x3>;
|
||||
cpu-idle-states = <&cpu_sleep_0_0 &cluster_sleep_0>;
|
||||
};
|
||||
|
||||
cpu@100 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a7";
|
||||
reg = <0x100>;
|
||||
cpu-idle-states = <&cpu_sleep_1_0 &cluster_sleep_1>;
|
||||
};
|
||||
|
||||
cpu@101 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a7";
|
||||
reg = <0x101>;
|
||||
cpu-idle-states = <&cpu_sleep_1_0 &cluster_sleep_1>;
|
||||
};
|
||||
|
||||
cpu@102 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a7";
|
||||
reg = <0x102>;
|
||||
cpu-idle-states = <&cpu_sleep_1_0 &cluster_sleep_1>;
|
||||
};
|
||||
|
||||
cpu@103 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a7";
|
||||
reg = <0x103>;
|
||||
cpu-idle-states = <&cpu_sleep_1_0 &cluster_sleep_1>;
|
||||
};
|
||||
|
||||
idle-states {
|
||||
cpu_sleep_0_0: cpu-sleep-0-0 {
|
||||
compatible = "arm,idle-state";
|
||||
local-timer-stop;
|
||||
entry-latency-us = <200>;
|
||||
exit-latency-us = <100>;
|
||||
min-residency-us = <400>;
|
||||
wakeup-latency-us = <250>;
|
||||
};
|
||||
|
||||
cluster_sleep_0: cluster-sleep-0 {
|
||||
compatible = "arm,idle-state";
|
||||
local-timer-stop;
|
||||
entry-latency-us = <500>;
|
||||
exit-latency-us = <1500>;
|
||||
min-residency-us = <2500>;
|
||||
wakeup-latency-us = <1700>;
|
||||
};
|
||||
|
||||
cpu_sleep_1_0: cpu-sleep-1-0 {
|
||||
compatible = "arm,idle-state";
|
||||
local-timer-stop;
|
||||
entry-latency-us = <300>;
|
||||
exit-latency-us = <500>;
|
||||
min-residency-us = <900>;
|
||||
wakeup-latency-us = <600>;
|
||||
};
|
||||
|
||||
cluster_sleep_1: cluster-sleep-1 {
|
||||
compatible = "arm,idle-state";
|
||||
local-timer-stop;
|
||||
entry-latency-us = <800>;
|
||||
exit-latency-us = <2000>;
|
||||
min-residency-us = <6500>;
|
||||
wakeup-latency-us = <2300>;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
...
|
@@ -47,7 +47,7 @@ examples:
|
||||
- |
|
||||
#include <dt-bindings/interrupt-controller/arm-gic.h>
|
||||
|
||||
cache-controller@1100000 {
|
||||
system-cache-controller@1100000 {
|
||||
compatible = "qcom,sdm845-llcc";
|
||||
reg = <0x1100000 0x200000>, <0x1300000 0x50000> ;
|
||||
reg-names = "llcc_base", "llcc_broadcast_base";
|
||||
|
@@ -102,6 +102,34 @@ properties:
|
||||
[1] Kernel documentation - ARM idle states bindings
|
||||
Documentation/devicetree/bindings/arm/idle-states.txt
|
||||
|
||||
"#power-domain-cells":
|
||||
description:
|
||||
The number of cells in a PM domain specifier as per binding in [3].
|
||||
Must be 0 as to represent a single PM domain.
|
||||
|
||||
ARM systems can have multiple cores, sometimes in an hierarchical
|
||||
arrangement. This often, but not always, maps directly to the processor
|
||||
power topology of the system. Individual nodes in a topology have their
|
||||
own specific power states and can be better represented hierarchically.
|
||||
|
||||
For these cases, the definitions of the idle states for the CPUs and the
|
||||
CPU topology, must conform to the binding in [3]. The idle states
|
||||
themselves must conform to the binding in [4] and must specify the
|
||||
arm,psci-suspend-param property.
|
||||
|
||||
It should also be noted that, in PSCI firmware v1.0 the OS-Initiated
|
||||
(OSI) CPU suspend mode is introduced. Using a hierarchical representation
|
||||
helps to implement support for OSI mode and OS implementations may choose
|
||||
to mandate it.
|
||||
|
||||
[3] Documentation/devicetree/bindings/power/power_domain.txt
|
||||
[4] Documentation/devicetree/bindings/power/domain-idle-state.txt
|
||||
|
||||
power-domains:
|
||||
$ref: '/schemas/types.yaml#/definitions/phandle-array'
|
||||
description:
|
||||
List of phandles and PM domain specifiers, as defined by bindings of the
|
||||
PM domain provider.
|
||||
|
||||
required:
|
||||
- compatible
|
||||
@@ -160,4 +188,80 @@ examples:
|
||||
cpu_on = <0x95c10002>;
|
||||
cpu_off = <0x95c10001>;
|
||||
};
|
||||
|
||||
- |+
|
||||
|
||||
// Case 4: CPUs and CPU idle states described using the hierarchical model.
|
||||
|
||||
cpus {
|
||||
#size-cells = <0>;
|
||||
#address-cells = <1>;
|
||||
|
||||
CPU0: cpu@0 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a53", "arm,armv8";
|
||||
reg = <0x0>;
|
||||
enable-method = "psci";
|
||||
power-domains = <&CPU_PD0>;
|
||||
power-domain-names = "psci";
|
||||
};
|
||||
|
||||
CPU1: cpu@1 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a57", "arm,armv8";
|
||||
reg = <0x100>;
|
||||
enable-method = "psci";
|
||||
power-domains = <&CPU_PD1>;
|
||||
power-domain-names = "psci";
|
||||
};
|
||||
|
||||
idle-states {
|
||||
|
||||
CPU_PWRDN: cpu-power-down {
|
||||
compatible = "arm,idle-state";
|
||||
arm,psci-suspend-param = <0x0000001>;
|
||||
entry-latency-us = <10>;
|
||||
exit-latency-us = <10>;
|
||||
min-residency-us = <100>;
|
||||
};
|
||||
|
||||
CLUSTER_RET: cluster-retention {
|
||||
compatible = "domain-idle-state";
|
||||
arm,psci-suspend-param = <0x1000011>;
|
||||
entry-latency-us = <500>;
|
||||
exit-latency-us = <500>;
|
||||
min-residency-us = <2000>;
|
||||
};
|
||||
|
||||
CLUSTER_PWRDN: cluster-power-down {
|
||||
compatible = "domain-idle-state";
|
||||
arm,psci-suspend-param = <0x1000031>;
|
||||
entry-latency-us = <2000>;
|
||||
exit-latency-us = <2000>;
|
||||
min-residency-us = <6000>;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
psci {
|
||||
compatible = "arm,psci-1.0";
|
||||
method = "smc";
|
||||
|
||||
CPU_PD0: cpu-pd0 {
|
||||
#power-domain-cells = <0>;
|
||||
domain-idle-states = <&CPU_PWRDN>;
|
||||
power-domains = <&CLUSTER_PD>;
|
||||
};
|
||||
|
||||
CPU_PD1: cpu-pd1 {
|
||||
#power-domain-cells = <0>;
|
||||
domain-idle-states = <&CPU_PWRDN>;
|
||||
power-domains = <&CLUSTER_PD>;
|
||||
};
|
||||
|
||||
CLUSTER_PD: cluster-pd {
|
||||
#power-domain-cells = <0>;
|
||||
domain-idle-states = <&CLUSTER_RET>, <&CLUSTER_PWRDN>;
|
||||
};
|
||||
};
|
||||
...
|
||||
|
@@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/bindings/arm/qcom.yaml#
|
||||
$id: http://devicetree.org/schemas/arm/qcom.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: QCOM device tree bindings
|
||||
@@ -24,28 +24,30 @@ description: |
|
||||
|
||||
The 'SoC' element must be one of the following strings:
|
||||
|
||||
apq8016
|
||||
apq8074
|
||||
apq8084
|
||||
apq8096
|
||||
msm8916
|
||||
msm8974
|
||||
msm8992
|
||||
msm8994
|
||||
msm8996
|
||||
mdm9615
|
||||
ipq8074
|
||||
sdm845
|
||||
apq8016
|
||||
apq8074
|
||||
apq8084
|
||||
apq8096
|
||||
ipq8074
|
||||
mdm9615
|
||||
msm8916
|
||||
msm8974
|
||||
msm8992
|
||||
msm8994
|
||||
msm8996
|
||||
sc7180
|
||||
sdm845
|
||||
|
||||
The 'board' element must be one of the following strings:
|
||||
|
||||
cdp
|
||||
liquid
|
||||
dragonboard
|
||||
mtp
|
||||
sbc
|
||||
hk01
|
||||
qrd
|
||||
cdp
|
||||
dragonboard
|
||||
hk01
|
||||
idp
|
||||
liquid
|
||||
mtp
|
||||
qrd
|
||||
sbc
|
||||
|
||||
The 'soc_version' and 'board_version' elements take the form of v<Major>.<Minor>
|
||||
where the minor number may be omitted when it's zero, i.e. v1.0 is the same
|
||||
@@ -144,4 +146,8 @@ properties:
|
||||
- qcom,ipq8074-hk01
|
||||
- const: qcom,ipq8074
|
||||
|
||||
- items:
|
||||
- enum:
|
||||
- qcom,sc7180-idp
|
||||
- const: qcom,sc7180
|
||||
...
|
||||
|
@@ -409,6 +409,9 @@ properties:
|
||||
|
||||
- description: Pine64 RockPro64
|
||||
items:
|
||||
- enum:
|
||||
- pine64,rockpro64-v2.1
|
||||
- pine64,rockpro64-v2.0
|
||||
- const: pine64,rockpro64
|
||||
- const: rockchip,rk3399
|
||||
|
||||
@@ -422,6 +425,12 @@ properties:
|
||||
- const: radxa,rockpi4
|
||||
- const: rockchip,rk3399
|
||||
|
||||
- description: Radxa ROCK Pi N10
|
||||
items:
|
||||
- const: radxa,rockpi-n10
|
||||
- const: vamrs,rk3399pro-vmarc-som
|
||||
- const: rockchip,rk3399pro
|
||||
|
||||
- description: Radxa Rock2 Square
|
||||
items:
|
||||
- const: radxa,rock2-square
|
||||
|
@@ -2,7 +2,7 @@
|
||||
# Copyright 2019 Unisoc Inc.
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/arm/sprd.yaml#
|
||||
$id: http://devicetree.org/schemas/arm/sprd/sprd.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: Unisoc platforms device tree bindings
|
@@ -1,37 +0,0 @@
|
||||
ML-AHB interconnect bindings
|
||||
|
||||
These bindings describe the STM32 SoCs ML-AHB interconnect bus which connects
|
||||
a Cortex-M subsystem with dedicated memories.
|
||||
The MCU SRAM and RETRAM memory parts can be accessed through different addresses
|
||||
(see "RAM aliases" in [1]) using different buses (see [2]) : balancing the
|
||||
Cortex-M firmware accesses among those ports allows to tune the system
|
||||
performance.
|
||||
|
||||
[1]: https://www.st.com/resource/en/reference_manual/dm00327659.pdf
|
||||
[2]: https://wiki.st.com/stm32mpu/wiki/STM32MP15_RAM_mapping
|
||||
|
||||
Required properties:
|
||||
- compatible: should be "simple-bus"
|
||||
- dma-ranges: describes memory addresses translation between the local CPU and
|
||||
the remote Cortex-M processor. Each memory region, is declared with
|
||||
3 parameters:
|
||||
- param 1: device base address (Cortex-M processor address)
|
||||
- param 2: physical base address (local CPU address)
|
||||
- param 3: size of the memory region.
|
||||
|
||||
The Cortex-M remote processor accessed via the mlahb interconnect is described
|
||||
by a child node.
|
||||
|
||||
Example:
|
||||
mlahb {
|
||||
compatible = "simple-bus";
|
||||
#address-cells = <1>;
|
||||
#size-cells = <1>;
|
||||
dma-ranges = <0x00000000 0x38000000 0x10000>,
|
||||
<0x10000000 0x10000000 0x60000>,
|
||||
<0x30000000 0x30000000 0x60000>;
|
||||
|
||||
m4_rproc: m4@10000000 {
|
||||
...
|
||||
};
|
||||
};
|
70
Documentation/devicetree/bindings/arm/stm32/st,mlahb.yaml
Normal file
70
Documentation/devicetree/bindings/arm/stm32/st,mlahb.yaml
Normal file
@@ -0,0 +1,70 @@
|
||||
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: "http://devicetree.org/schemas/arm/stm32/st,mlahb.yaml#"
|
||||
$schema: "http://devicetree.org/meta-schemas/core.yaml#"
|
||||
|
||||
title: STMicroelectronics STM32 ML-AHB interconnect bindings
|
||||
|
||||
maintainers:
|
||||
- Fabien Dessenne <fabien.dessenne@st.com>
|
||||
- Arnaud Pouliquen <arnaud.pouliquen@st.com>
|
||||
|
||||
description: |
|
||||
These bindings describe the STM32 SoCs ML-AHB interconnect bus which connects
|
||||
a Cortex-M subsystem with dedicated memories. The MCU SRAM and RETRAM memory
|
||||
parts can be accessed through different addresses (see "RAM aliases" in [1])
|
||||
using different buses (see [2]): balancing the Cortex-M firmware accesses
|
||||
among those ports allows to tune the system performance.
|
||||
[1]: https://www.st.com/resource/en/reference_manual/dm00327659.pdf
|
||||
[2]: https://wiki.st.com/stm32mpu/wiki/STM32MP15_RAM_mapping
|
||||
|
||||
allOf:
|
||||
- $ref: /schemas/simple-bus.yaml#
|
||||
|
||||
properties:
|
||||
compatible:
|
||||
contains:
|
||||
enum:
|
||||
- st,mlahb
|
||||
|
||||
dma-ranges:
|
||||
description: |
|
||||
Describe memory addresses translation between the local CPU and the
|
||||
remote Cortex-M processor. Each memory region, is declared with
|
||||
3 parameters:
|
||||
- param 1: device base address (Cortex-M processor address)
|
||||
- param 2: physical base address (local CPU address)
|
||||
- param 3: size of the memory region.
|
||||
maxItems: 3
|
||||
|
||||
'#address-cells':
|
||||
const: 1
|
||||
|
||||
'#size-cells':
|
||||
const: 1
|
||||
|
||||
required:
|
||||
- compatible
|
||||
- '#address-cells'
|
||||
- '#size-cells'
|
||||
- dma-ranges
|
||||
|
||||
examples:
|
||||
- |
|
||||
mlahb: ahb {
|
||||
compatible = "st,mlahb", "simple-bus";
|
||||
#address-cells = <1>;
|
||||
#size-cells = <1>;
|
||||
reg = <0x10000000 0x40000>;
|
||||
ranges;
|
||||
dma-ranges = <0x00000000 0x38000000 0x10000>,
|
||||
<0x10000000 0x10000000 0x60000>,
|
||||
<0x30000000 0x30000000 0x60000>;
|
||||
|
||||
m4_rproc: m4@10000000 {
|
||||
reg = <0x10000000 0x40000>;
|
||||
};
|
||||
};
|
||||
|
||||
...
|
@@ -0,0 +1,41 @@
|
||||
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: "http://devicetree.org/schemas/arm/stm32/st,stm32-syscon.yaml#"
|
||||
$schema: "http://devicetree.org/meta-schemas/core.yaml#"
|
||||
|
||||
title: STMicroelectronics STM32 Platforms System Controller bindings
|
||||
|
||||
maintainers:
|
||||
- Alexandre Torgue <alexandre.torgue@st.com>
|
||||
- Christophe Roullier <christophe.roullier@st.com>
|
||||
|
||||
properties:
|
||||
compatible:
|
||||
oneOf:
|
||||
- items:
|
||||
- enum:
|
||||
- st,stm32mp157-syscfg
|
||||
- const: syscon
|
||||
|
||||
reg:
|
||||
maxItems: 1
|
||||
|
||||
clocks:
|
||||
maxItems: 1
|
||||
|
||||
required:
|
||||
- compatible
|
||||
- reg
|
||||
- clocks
|
||||
|
||||
examples:
|
||||
- |
|
||||
#include <dt-bindings/clock/stm32mp1-clks.h>
|
||||
syscfg: syscon@50020000 {
|
||||
compatible = "st,stm32mp157-syscfg", "syscon";
|
||||
reg = <0x50020000 0x400>;
|
||||
clocks = <&rcc SYSCFG>;
|
||||
};
|
||||
|
||||
...
|
@@ -1,16 +0,0 @@
|
||||
STMicroelectronics STM32 Platforms System Controller
|
||||
|
||||
Properties:
|
||||
- compatible : should contain two values. First value must be :
|
||||
- " st,stm32mp157-syscfg " - for stm32mp157 based SoCs,
|
||||
second value must be always "syscon".
|
||||
- reg : offset and length of the register set.
|
||||
- clocks: phandle to the syscfg clock
|
||||
|
||||
Example:
|
||||
syscfg: syscon@50020000 {
|
||||
compatible = "st,stm32mp157-syscfg", "syscon";
|
||||
reg = <0x50020000 0x400>;
|
||||
clocks = <&rcc SYSCFG>;
|
||||
};
|
||||
|
@@ -342,6 +342,16 @@ properties:
|
||||
- const: libretech,all-h3-cc-h5
|
||||
- const: allwinner,sun50i-h5
|
||||
|
||||
- description: Libre Computer Board ALL-H3-IT H5
|
||||
items:
|
||||
- const: libretech,all-h3-it-h5
|
||||
- const: allwinner,sun50i-h5
|
||||
|
||||
- description: Libre Computer Board ALL-H5-CC H5
|
||||
items:
|
||||
- const: libretech,all-h5-cc-h5
|
||||
- const: allwinner,sun50i-h5
|
||||
|
||||
- description: Lichee Pi One
|
||||
items:
|
||||
- const: licheepi,licheepi-one
|
||||
@@ -470,6 +480,12 @@ properties:
|
||||
- const: emlid,neutis-n5
|
||||
- const: allwinner,sun50i-h5
|
||||
|
||||
- description: Emlid Neutis N5H3 Developper Board
|
||||
items:
|
||||
- const: emlid,neutis-n5h3-devboard
|
||||
- const: emlid,neutis-n5h3
|
||||
- const: allwinner,sun8i-h3
|
||||
|
||||
- description: NextThing Co. CHIP
|
||||
items:
|
||||
- const: nextthing,chip
|
||||
@@ -599,11 +615,16 @@ properties:
|
||||
- const: pine64,pine64-plus
|
||||
- const: allwinner,sun50i-a64
|
||||
|
||||
- description: Pine64 PineH64
|
||||
- description: Pine64 PineH64 model A
|
||||
items:
|
||||
- const: pine64,pine-h64
|
||||
- const: allwinner,sun50i-h6
|
||||
|
||||
- description: Pine64 PineH64 model B
|
||||
items:
|
||||
- const: pine64,pine-h64-model-b
|
||||
- const: allwinner,sun50i-h6
|
||||
|
||||
- description: Pine64 LTS
|
||||
items:
|
||||
- const: pine64,pine64-lts
|
||||
|
@@ -0,0 +1,65 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/arm/sunxi/allwinner,sun4i-a10-mbus.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: Allwinner Memory Bus (MBUS) controller
|
||||
|
||||
maintainers:
|
||||
- Chen-Yu Tsai <wens@csie.org>
|
||||
- Maxime Ripard <mripard@kernel.org>
|
||||
|
||||
description: |
|
||||
The MBUS controller drives the MBUS that other devices in the SoC
|
||||
will use to perform DMA. It also has a register interface that
|
||||
allows to monitor and control the bandwidth and priorities for
|
||||
masters on that bus.
|
||||
|
||||
Each device having to perform their DMA through the MBUS must have
|
||||
the interconnects and interconnect-names properties set to the MBUS
|
||||
controller and with "dma-mem" as the interconnect name.
|
||||
|
||||
properties:
|
||||
"#interconnect-cells":
|
||||
const: 1
|
||||
description:
|
||||
The content of the cell is the MBUS ID.
|
||||
|
||||
compatible:
|
||||
enum:
|
||||
- allwinner,sun5i-a13-mbus
|
||||
- allwinner,sun8i-h3-mbus
|
||||
|
||||
reg:
|
||||
maxItems: 1
|
||||
|
||||
clocks:
|
||||
maxItems: 1
|
||||
|
||||
dma-ranges:
|
||||
description:
|
||||
See section 2.3.9 of the DeviceTree Specification.
|
||||
|
||||
required:
|
||||
- "#interconnect-cells"
|
||||
- compatible
|
||||
- reg
|
||||
- clocks
|
||||
- dma-ranges
|
||||
|
||||
additionalProperties: false
|
||||
|
||||
examples:
|
||||
- |
|
||||
#include <dt-bindings/clock/sun5i-ccu.h>
|
||||
|
||||
mbus: dram-controller@1c01000 {
|
||||
compatible = "allwinner,sun5i-a13-mbus";
|
||||
reg = <0x01c01000 0x1000>;
|
||||
clocks = <&ccu CLK_MBUS>;
|
||||
dma-ranges = <0x00000000 0x40000000 0x20000000>;
|
||||
#interconnect-cells = <1>;
|
||||
};
|
||||
|
||||
...
|
@@ -1,37 +0,0 @@
|
||||
Allwinner Memory Bus (MBUS) controller
|
||||
|
||||
The MBUS controller drives the MBUS that other devices in the SoC will
|
||||
use to perform DMA. It also has a register interface that allows to
|
||||
monitor and control the bandwidth and priorities for masters on that
|
||||
bus.
|
||||
|
||||
Required properties:
|
||||
- compatible: Must be one of:
|
||||
- allwinner,sun5i-a13-mbus
|
||||
- allwinner,sun8i-h3-mbus
|
||||
- reg: Offset and length of the register set for the controller
|
||||
- clocks: phandle to the clock driving the controller
|
||||
- dma-ranges: See section 2.3.9 of the DeviceTree Specification
|
||||
- #interconnect-cells: Must be one, with the argument being the MBUS
|
||||
port ID
|
||||
|
||||
Each device having to perform their DMA through the MBUS must have the
|
||||
interconnects and interconnect-names properties set to the MBUS
|
||||
controller and with "dma-mem" as the interconnect name.
|
||||
|
||||
Example:
|
||||
|
||||
mbus: dram-controller@1c01000 {
|
||||
compatible = "allwinner,sun5i-a13-mbus";
|
||||
reg = <0x01c01000 0x1000>;
|
||||
clocks = <&ccu CLK_MBUS>;
|
||||
dma-ranges = <0x00000000 0x40000000 0x20000000>;
|
||||
#interconnect-cells = <1>;
|
||||
};
|
||||
|
||||
fe0: display-frontend@1e00000 {
|
||||
compatible = "allwinner,sun5i-a13-display-frontend";
|
||||
...
|
||||
interconnects = <&mbus 19>;
|
||||
interconnect-names = "dma-mem";
|
||||
};
|
36
Documentation/devicetree/bindings/arm/ux500.yaml
Normal file
36
Documentation/devicetree/bindings/arm/ux500.yaml
Normal file
@@ -0,0 +1,36 @@
|
||||
# SPDX-License-Identifier: GPL-2.0-only
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/arm/ux500.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: Ux500 platforms device tree bindings
|
||||
|
||||
maintainers:
|
||||
- Linus Walleij <linus.walleij@linaro.org>
|
||||
|
||||
properties:
|
||||
$nodename:
|
||||
const: '/'
|
||||
compatible:
|
||||
oneOf:
|
||||
|
||||
- description: ST-Ericsson HREF (pre-v60)
|
||||
items:
|
||||
- const: st-ericsson,mop500
|
||||
- const: st-ericsson,u8500
|
||||
|
||||
- description: ST-Ericsson HREF (v60+)
|
||||
items:
|
||||
- const: st-ericsson,hrefv60+
|
||||
- const: st-ericsson,u8500
|
||||
|
||||
- description: Calao Systems Snowball
|
||||
items:
|
||||
- const: calaosystems,snowball-a9500
|
||||
- const: st-ericsson,u9500
|
||||
|
||||
- description: Samsung Galaxy S III mini (GT-I8190)
|
||||
items:
|
||||
- const: samsung,golden
|
||||
- const: st-ericsson,u8500
|
@@ -9,8 +9,6 @@ PHYs.
|
||||
|
||||
Required properties:
|
||||
- compatible : compatible string, one of:
|
||||
- "allwinner,sun4i-a10-ahci"
|
||||
- "allwinner,sun8i-r40-ahci"
|
||||
- "brcm,iproc-ahci"
|
||||
- "hisilicon,hisi-ahci"
|
||||
- "cavium,octeon-7130-ahci"
|
||||
@@ -45,8 +43,6 @@ Required properties when using sub-nodes:
|
||||
- #address-cells : number of cells to encode an address
|
||||
- #size-cells : number of cells representing the size of an address
|
||||
|
||||
For allwinner,sun8i-r40-ahci, the reset property must be present.
|
||||
|
||||
Sub-nodes required properties:
|
||||
- reg : the port number
|
||||
And at least one of the following properties:
|
||||
@@ -60,14 +56,6 @@ Examples:
|
||||
interrupts = <115>;
|
||||
};
|
||||
|
||||
ahci: sata@1c18000 {
|
||||
compatible = "allwinner,sun4i-a10-ahci";
|
||||
reg = <0x01c18000 0x1000>;
|
||||
interrupts = <56>;
|
||||
clocks = <&pll6 0>, <&ahb_gates 25>;
|
||||
target-supply = <®_ahci_5v>;
|
||||
};
|
||||
|
||||
With sub-nodes:
|
||||
sata@f7e90000 {
|
||||
compatible = "marvell,berlin2q-achi", "generic-ahci";
|
||||
|
@@ -0,0 +1,47 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/ata/allwinner,sun4i-a10-ahci.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: Allwinner A10 AHCI SATA Controller bindings
|
||||
|
||||
maintainers:
|
||||
- Chen-Yu Tsai <wens@csie.org>
|
||||
- Maxime Ripard <mripard@kernel.org>
|
||||
|
||||
properties:
|
||||
compatible:
|
||||
const: allwinner,sun4i-a10-ahci
|
||||
|
||||
reg:
|
||||
maxItems: 1
|
||||
|
||||
clocks:
|
||||
items:
|
||||
- description: AHCI Bus Clock
|
||||
- description: AHCI Module Clock
|
||||
|
||||
interrupts:
|
||||
maxItems: 1
|
||||
|
||||
target-supply:
|
||||
description: Regulator for SATA target power
|
||||
|
||||
required:
|
||||
- compatible
|
||||
- reg
|
||||
- clocks
|
||||
- interrupts
|
||||
|
||||
additionalProperties: false
|
||||
|
||||
examples:
|
||||
- |
|
||||
ahci: sata@1c18000 {
|
||||
compatible = "allwinner,sun4i-a10-ahci";
|
||||
reg = <0x01c18000 0x1000>;
|
||||
interrupts = <56>;
|
||||
clocks = <&pll6 0>, <&ahb_gates 25>;
|
||||
target-supply = <®_ahci_5v>;
|
||||
};
|
@@ -0,0 +1,67 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/ata/allwinner,sun8i-r40-ahci.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: Allwinner R40 AHCI SATA Controller bindings
|
||||
|
||||
maintainers:
|
||||
- Chen-Yu Tsai <wens@csie.org>
|
||||
- Maxime Ripard <mripard@kernel.org>
|
||||
|
||||
properties:
|
||||
compatible:
|
||||
const: allwinner,sun8i-r40-ahci
|
||||
|
||||
reg:
|
||||
maxItems: 1
|
||||
|
||||
clocks:
|
||||
items:
|
||||
- description: AHCI Bus Clock
|
||||
- description: AHCI Module Clock
|
||||
|
||||
interrupts:
|
||||
maxItems: 1
|
||||
|
||||
resets:
|
||||
maxItems: 1
|
||||
|
||||
reset-names:
|
||||
const: ahci
|
||||
|
||||
ahci-supply:
|
||||
description: Regulator for the AHCI controller
|
||||
|
||||
phy-supply:
|
||||
description: Regulator for the SATA PHY power
|
||||
|
||||
required:
|
||||
- compatible
|
||||
- reg
|
||||
- clocks
|
||||
- interrupts
|
||||
- resets
|
||||
- reset-names
|
||||
|
||||
additionalProperties: false
|
||||
|
||||
examples:
|
||||
- |
|
||||
#include <dt-bindings/interrupt-controller/arm-gic.h>
|
||||
#include <dt-bindings/clock/sun8i-r40-ccu.h>
|
||||
#include <dt-bindings/reset/sun8i-r40-ccu.h>
|
||||
|
||||
ahci: sata@1c18000 {
|
||||
compatible = "allwinner,sun8i-r40-ahci";
|
||||
reg = <0x01c18000 0x1000>;
|
||||
interrupts = <GIC_SPI 56 IRQ_TYPE_LEVEL_HIGH>;
|
||||
clocks = <&ccu CLK_BUS_SATA>, <&ccu CLK_SATA>;
|
||||
resets = <&ccu RST_BUS_SATA>;
|
||||
reset-names = "ahci";
|
||||
ahci-supply = <®_dldo4>;
|
||||
phy-supply = <®_eldo3>;
|
||||
};
|
||||
|
||||
...
|
@@ -5,6 +5,7 @@ Each SATA controller should have its own node.
|
||||
|
||||
Required properties:
|
||||
- compatible : should be one or more of
|
||||
"brcm,bcm7216-ahci"
|
||||
"brcm,bcm7425-ahci"
|
||||
"brcm,bcm7445-ahci"
|
||||
"brcm,bcm-nsp-ahci"
|
||||
@@ -14,6 +15,12 @@ Required properties:
|
||||
- reg-names : "ahci" and "top-ctrl"
|
||||
- interrupts : interrupt mapping for SATA IRQ
|
||||
|
||||
Optional properties:
|
||||
|
||||
- reset: for "brcm,bcm7216-ahci" must be a valid reset phandle
|
||||
pointing to the RESCAL reset controller provider node.
|
||||
- reset-names: for "brcm,bcm7216-ahci", must be "rescal".
|
||||
|
||||
Also see ahci-platform.txt.
|
||||
|
||||
Example:
|
||||
|
@@ -1,38 +0,0 @@
|
||||
* Faraday Technology FTIDE010 PATA controller
|
||||
|
||||
This controller is the first Faraday IDE interface block, used in the
|
||||
StorLink SL2312 and SL3516, later known as the Cortina Systems Gemini
|
||||
platform. The controller can do PIO modes 0 through 4, Multi-word DMA
|
||||
(MWDM)modes 0 through 2 and Ultra DMA modes 0 through 6.
|
||||
|
||||
On the Gemini platform, this PATA block is accompanied by a PATA to
|
||||
SATA bridge in order to support SATA. This is why a phandle to that
|
||||
controller is compulsory on that platform.
|
||||
|
||||
The timing properties are unique per-SoC, not per-board.
|
||||
|
||||
Required properties:
|
||||
- compatible: should be one of
|
||||
"cortina,gemini-pata", "faraday,ftide010"
|
||||
"faraday,ftide010"
|
||||
- interrupts: interrupt for the block
|
||||
- reg: registers and size for the block
|
||||
|
||||
Optional properties:
|
||||
- clocks: a SoC clock running the peripheral.
|
||||
- clock-names: should be set to "PCLK" for the peripheral clock.
|
||||
|
||||
Required properties for "cortina,gemini-pata" compatible:
|
||||
- sata: a phande to the Gemini PATA to SATA bridge, see
|
||||
cortina,gemini-sata-bridge.txt for details.
|
||||
|
||||
Example:
|
||||
|
||||
ata@63000000 {
|
||||
compatible = "cortina,gemini-pata", "faraday,ftide010";
|
||||
reg = <0x63000000 0x100>;
|
||||
interrupts = <4 IRQ_TYPE_EDGE_RISING>;
|
||||
clocks = <&gcc GEMINI_CLK_GATE_IDE>;
|
||||
clock-names = "PCLK";
|
||||
sata = <&sata>;
|
||||
};
|
89
Documentation/devicetree/bindings/ata/faraday,ftide010.yaml
Normal file
89
Documentation/devicetree/bindings/ata/faraday,ftide010.yaml
Normal file
@@ -0,0 +1,89 @@
|
||||
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/ata/faraday,ftide010.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: Faraday Technology FTIDE010 PATA controller
|
||||
|
||||
maintainers:
|
||||
- Linus Walleij <linus.walleij@linaro.org>
|
||||
|
||||
description: |
|
||||
This controller is the first Faraday IDE interface block, used in the
|
||||
StorLink SL3512 and SL3516, later known as the Cortina Systems Gemini
|
||||
platform. The controller can do PIO modes 0 through 4, Multi-word DMA
|
||||
(MWDM) modes 0 through 2 and Ultra DMA modes 0 through 6.
|
||||
|
||||
On the Gemini platform, this PATA block is accompanied by a PATA to
|
||||
SATA bridge in order to support SATA. This is why a phandle to that
|
||||
controller is compulsory on that platform.
|
||||
|
||||
The timing properties are unique per-SoC, not per-board.
|
||||
|
||||
properties:
|
||||
compatible:
|
||||
oneOf:
|
||||
- const: faraday,ftide010
|
||||
- items:
|
||||
- const: cortina,gemini-pata
|
||||
- const: faraday,ftide010
|
||||
|
||||
reg:
|
||||
maxItems: 1
|
||||
|
||||
interrupts:
|
||||
maxItems: 1
|
||||
|
||||
clocks:
|
||||
minItems: 1
|
||||
|
||||
clock-names:
|
||||
const: PCLK
|
||||
|
||||
sata:
|
||||
description:
|
||||
phandle to the Gemini PATA to SATA bridge, if available
|
||||
$ref: /schemas/types.yaml#/definitions/phandle
|
||||
|
||||
required:
|
||||
- compatible
|
||||
- reg
|
||||
- interrupts
|
||||
|
||||
allOf:
|
||||
- $ref: pata-common.yaml#
|
||||
|
||||
- if:
|
||||
properties:
|
||||
compatible:
|
||||
contains:
|
||||
const: cortina,gemini-pata
|
||||
|
||||
then:
|
||||
required:
|
||||
- sata
|
||||
|
||||
examples:
|
||||
- |
|
||||
#include <dt-bindings/interrupt-controller/irq.h>
|
||||
#include <dt-bindings/clock/cortina,gemini-clock.h>
|
||||
|
||||
ide@63000000 {
|
||||
compatible = "cortina,gemini-pata", "faraday,ftide010";
|
||||
reg = <0x63000000 0x100>;
|
||||
interrupts = <4 IRQ_TYPE_EDGE_RISING>;
|
||||
clocks = <&gcc GEMINI_CLK_GATE_IDE>;
|
||||
clock-names = "PCLK";
|
||||
sata = <&sata>;
|
||||
#address-cells = <1>;
|
||||
#size-cells = <0>;
|
||||
ide-port@0 {
|
||||
reg = <0>;
|
||||
};
|
||||
ide-port@1 {
|
||||
reg = <1>;
|
||||
};
|
||||
};
|
||||
|
||||
...
|
50
Documentation/devicetree/bindings/ata/pata-common.yaml
Normal file
50
Documentation/devicetree/bindings/ata/pata-common.yaml
Normal file
@@ -0,0 +1,50 @@
|
||||
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/ata/pata-common.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: Common Properties for Parallel AT attachment (PATA) controllers
|
||||
|
||||
maintainers:
|
||||
- Linus Walleij <linus.walleij@linaro.org>
|
||||
|
||||
description: |
|
||||
This document defines device tree properties common to most Parallel
|
||||
ATA (PATA, also known as IDE) AT attachment storage devices.
|
||||
It doesn't constitue a device tree binding specification by itself but is
|
||||
meant to be referenced by device tree bindings.
|
||||
|
||||
The PATA (IDE) controller-specific device tree bindings are responsible for
|
||||
defining whether each property is required or optional.
|
||||
|
||||
properties:
|
||||
$nodename:
|
||||
pattern: "^ide(@.*)?$"
|
||||
description:
|
||||
Specifies the host controller node. PATA host controller nodes are named
|
||||
"ide".
|
||||
|
||||
"#address-cells":
|
||||
const: 1
|
||||
|
||||
"#size-cells":
|
||||
const: 0
|
||||
|
||||
patternProperties:
|
||||
"^ide-port@[0-1]$":
|
||||
description: |
|
||||
DT nodes for ports connected on the PATA host. The master drive will have
|
||||
ID number 0 and the slave drive will have ID number 1. The PATA port
|
||||
nodes will be named "ide-port".
|
||||
type: object
|
||||
|
||||
properties:
|
||||
reg:
|
||||
minimum: 0
|
||||
maximum: 1
|
||||
description:
|
||||
The ID number of the drive port, 0 for the master port and 1 for the
|
||||
slave port.
|
||||
|
||||
...
|
50
Documentation/devicetree/bindings/ata/sata-common.yaml
Normal file
50
Documentation/devicetree/bindings/ata/sata-common.yaml
Normal file
@@ -0,0 +1,50 @@
|
||||
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/ata/sata-common.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: Common Properties for Serial AT attachment (SATA) controllers
|
||||
|
||||
maintainers:
|
||||
- Linus Walleij <linus.walleij@linaro.org>
|
||||
|
||||
description: |
|
||||
This document defines device tree properties common to most Serial
|
||||
AT attachment (SATA) storage devices. It doesn't constitute a device tree
|
||||
binding specification by itself but is meant to be referenced by device
|
||||
tree bindings.
|
||||
|
||||
The SATA controller-specific device tree bindings are responsible for
|
||||
defining whether each property is required or optional.
|
||||
|
||||
properties:
|
||||
$nodename:
|
||||
pattern: "^sata(@.*)?$"
|
||||
description:
|
||||
Specifies the host controller node. SATA host controller nodes are named
|
||||
"sata"
|
||||
|
||||
"#address-cells":
|
||||
const: 1
|
||||
|
||||
"#size-cells":
|
||||
const: 0
|
||||
|
||||
patternProperties:
|
||||
"^sata-port@[0-9a-e]$":
|
||||
description: |
|
||||
DT nodes for ports connected on the SATA host. The SATA port
|
||||
nodes will be named "sata-port".
|
||||
type: object
|
||||
|
||||
properties:
|
||||
reg:
|
||||
minimum: 0
|
||||
maximum: 14
|
||||
description:
|
||||
The ID number of the drive port SATA can potentially use a port
|
||||
multiplier making it possible to connect up to 15 disks to a single
|
||||
SATA port.
|
||||
|
||||
...
|
@@ -0,0 +1,108 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-ahb-clk.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: Allwinner A10 AHB Clock Device Tree Bindings
|
||||
|
||||
maintainers:
|
||||
- Chen-Yu Tsai <wens@csie.org>
|
||||
- Maxime Ripard <mripard@kernel.org>
|
||||
|
||||
deprecated: true
|
||||
|
||||
properties:
|
||||
"#clock-cells":
|
||||
const: 0
|
||||
|
||||
compatible:
|
||||
enum:
|
||||
- allwinner,sun4i-a10-ahb-clk
|
||||
- allwinner,sun6i-a31-ahb1-clk
|
||||
- allwinner,sun8i-h3-ahb2-clk
|
||||
|
||||
reg:
|
||||
maxItems: 1
|
||||
|
||||
clocks:
|
||||
minItems: 1
|
||||
maxItems: 4
|
||||
description: >
|
||||
The parent order must match the hardware programming order.
|
||||
|
||||
clock-output-names:
|
||||
maxItems: 1
|
||||
|
||||
required:
|
||||
- "#clock-cells"
|
||||
- compatible
|
||||
- reg
|
||||
- clocks
|
||||
- clock-output-names
|
||||
|
||||
additionalProperties: false
|
||||
|
||||
allOf:
|
||||
- if:
|
||||
properties:
|
||||
compatible:
|
||||
contains:
|
||||
const: allwinner,sun4i-a10-ahb-clk
|
||||
|
||||
then:
|
||||
properties:
|
||||
clocks:
|
||||
maxItems: 1
|
||||
|
||||
- if:
|
||||
properties:
|
||||
compatible:
|
||||
contains:
|
||||
const: allwinner,sun6i-a31-ahb1-clk
|
||||
|
||||
then:
|
||||
properties:
|
||||
clocks:
|
||||
maxItems: 4
|
||||
|
||||
- if:
|
||||
properties:
|
||||
compatible:
|
||||
contains:
|
||||
const: allwinner,sun8i-h3-ahb2-clk
|
||||
|
||||
then:
|
||||
properties:
|
||||
clocks:
|
||||
maxItems: 2
|
||||
|
||||
examples:
|
||||
- |
|
||||
ahb@1c20054 {
|
||||
#clock-cells = <0>;
|
||||
compatible = "allwinner,sun4i-a10-ahb-clk";
|
||||
reg = <0x01c20054 0x4>;
|
||||
clocks = <&axi>;
|
||||
clock-output-names = "ahb";
|
||||
};
|
||||
|
||||
- |
|
||||
ahb1@1c20054 {
|
||||
#clock-cells = <0>;
|
||||
compatible = "allwinner,sun6i-a31-ahb1-clk";
|
||||
reg = <0x01c20054 0x4>;
|
||||
clocks = <&osc32k>, <&osc24M>, <&axi>, <&pll6 0>;
|
||||
clock-output-names = "ahb1";
|
||||
};
|
||||
|
||||
- |
|
||||
ahb2_clk@1c2005c {
|
||||
#clock-cells = <0>;
|
||||
compatible = "allwinner,sun8i-h3-ahb2-clk";
|
||||
reg = <0x01c2005c 0x4>;
|
||||
clocks = <&ahb1>, <&pll6d2>;
|
||||
clock-output-names = "ahb2";
|
||||
};
|
||||
|
||||
...
|
@@ -0,0 +1,50 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-apb0-clk.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: Allwinner A10 APB0 Bus Clock Device Tree Bindings
|
||||
|
||||
maintainers:
|
||||
- Chen-Yu Tsai <wens@csie.org>
|
||||
- Maxime Ripard <mripard@kernel.org>
|
||||
|
||||
deprecated: true
|
||||
|
||||
properties:
|
||||
"#clock-cells":
|
||||
const: 0
|
||||
|
||||
compatible:
|
||||
const: allwinner,sun4i-a10-apb0-clk
|
||||
|
||||
reg:
|
||||
maxItems: 1
|
||||
|
||||
clocks:
|
||||
maxItems: 1
|
||||
|
||||
clock-output-names:
|
||||
maxItems: 1
|
||||
|
||||
required:
|
||||
- "#clock-cells"
|
||||
- compatible
|
||||
- reg
|
||||
- clocks
|
||||
- clock-output-names
|
||||
|
||||
additionalProperties: false
|
||||
|
||||
examples:
|
||||
- |
|
||||
apb0@1c20054 {
|
||||
#clock-cells = <0>;
|
||||
compatible = "allwinner,sun4i-a10-apb0-clk";
|
||||
reg = <0x01c20054 0x4>;
|
||||
clocks = <&ahb>;
|
||||
clock-output-names = "apb0";
|
||||
};
|
||||
|
||||
...
|
@@ -0,0 +1,52 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-apb1-clk.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: Allwinner A10 APB1 Bus Clock Device Tree Bindings
|
||||
|
||||
maintainers:
|
||||
- Chen-Yu Tsai <wens@csie.org>
|
||||
- Maxime Ripard <mripard@kernel.org>
|
||||
|
||||
deprecated: true
|
||||
|
||||
properties:
|
||||
"#clock-cells":
|
||||
const: 0
|
||||
|
||||
compatible:
|
||||
const: allwinner,sun4i-a10-apb1-clk
|
||||
|
||||
reg:
|
||||
maxItems: 1
|
||||
|
||||
clocks:
|
||||
maxItems: 3
|
||||
description: >
|
||||
The parent order must match the hardware programming order.
|
||||
|
||||
clock-output-names:
|
||||
maxItems: 1
|
||||
|
||||
required:
|
||||
- "#clock-cells"
|
||||
- compatible
|
||||
- reg
|
||||
- clocks
|
||||
- clock-output-names
|
||||
|
||||
additionalProperties: false
|
||||
|
||||
examples:
|
||||
- |
|
||||
clk@1c20058 {
|
||||
#clock-cells = <0>;
|
||||
compatible = "allwinner,sun4i-a10-apb1-clk";
|
||||
reg = <0x01c20058 0x4>;
|
||||
clocks = <&osc24M>, <&pll6 1>, <&osc32k>;
|
||||
clock-output-names = "apb1";
|
||||
};
|
||||
|
||||
...
|
@@ -0,0 +1,61 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-axi-clk.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: Allwinner A10 AXI Clock Device Tree Bindings
|
||||
|
||||
maintainers:
|
||||
- Chen-Yu Tsai <wens@csie.org>
|
||||
- Maxime Ripard <mripard@kernel.org>
|
||||
|
||||
deprecated: true
|
||||
|
||||
properties:
|
||||
"#clock-cells":
|
||||
const: 0
|
||||
|
||||
compatible:
|
||||
enum:
|
||||
- allwinner,sun4i-a10-axi-clk
|
||||
- allwinner,sun8i-a23-axi-clk
|
||||
|
||||
reg:
|
||||
maxItems: 1
|
||||
|
||||
clocks:
|
||||
maxItems: 1
|
||||
|
||||
clock-output-names:
|
||||
maxItems: 1
|
||||
|
||||
required:
|
||||
- "#clock-cells"
|
||||
- compatible
|
||||
- reg
|
||||
- clocks
|
||||
- clock-output-names
|
||||
|
||||
additionalProperties: false
|
||||
|
||||
examples:
|
||||
- |
|
||||
axi@1c20054 {
|
||||
#clock-cells = <0>;
|
||||
compatible = "allwinner,sun4i-a10-axi-clk";
|
||||
reg = <0x01c20054 0x4>;
|
||||
clocks = <&cpu>;
|
||||
clock-output-names = "axi";
|
||||
};
|
||||
|
||||
- |
|
||||
axi_clk@1c20050 {
|
||||
#clock-cells = <0>;
|
||||
compatible = "allwinner,sun8i-a23-axi-clk";
|
||||
reg = <0x01c20050 0x4>;
|
||||
clocks = <&cpu>;
|
||||
clock-output-names = "axi";
|
||||
};
|
||||
|
||||
...
|
@@ -0,0 +1,52 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-cpu-clk.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: Allwinner A10 CPU Clock Device Tree Bindings
|
||||
|
||||
maintainers:
|
||||
- Chen-Yu Tsai <wens@csie.org>
|
||||
- Maxime Ripard <mripard@kernel.org>
|
||||
|
||||
deprecated: true
|
||||
|
||||
properties:
|
||||
"#clock-cells":
|
||||
const: 0
|
||||
|
||||
compatible:
|
||||
const: allwinner,sun4i-a10-cpu-clk
|
||||
|
||||
reg:
|
||||
maxItems: 1
|
||||
|
||||
clocks:
|
||||
maxItems: 4
|
||||
description: >
|
||||
The parent order must match the hardware programming order.
|
||||
|
||||
clock-output-names:
|
||||
maxItems: 1
|
||||
|
||||
required:
|
||||
- "#clock-cells"
|
||||
- compatible
|
||||
- reg
|
||||
- clocks
|
||||
- clock-output-names
|
||||
|
||||
additionalProperties: false
|
||||
|
||||
examples:
|
||||
- |
|
||||
cpu@1c20054 {
|
||||
#clock-cells = <0>;
|
||||
compatible = "allwinner,sun4i-a10-cpu-clk";
|
||||
reg = <0x01c20054 0x4>;
|
||||
clocks = <&osc32k>, <&osc24M>, <&pll1>, <&dummy>;
|
||||
clock-output-names = "cpu";
|
||||
};
|
||||
|
||||
...
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user