mirror of
https://github.com/tbsdtv/linux_media.git
synced 2025-07-23 04:33:26 +02:00
Merge branch 'tbsdtv_linux_media/master' into tbsdtv_linux_media/latest
This commit is contained in:
@@ -226,7 +226,6 @@ ForEachMacros:
|
||||
- 'for_each_console_srcu'
|
||||
- 'for_each_cpu'
|
||||
- 'for_each_cpu_and'
|
||||
- 'for_each_cpu_not'
|
||||
- 'for_each_cpu_wrap'
|
||||
- 'for_each_dapm_widgets'
|
||||
- 'for_each_dedup_cand'
|
||||
|
8
.gitattributes
vendored
8
.gitattributes
vendored
@@ -1,4 +1,4 @@
|
||||
*.c diff=cpp
|
||||
*.h diff=cpp
|
||||
*.dtsi diff=dts
|
||||
*.dts diff=dts
|
||||
# SPDX-License-Identifier: GPL-2.0-only
|
||||
*.[ch] diff=cpp
|
||||
*.dts diff=dts
|
||||
*.dts[io] diff=dts
|
||||
|
4
.gitignore
vendored
4
.gitignore
vendored
@@ -4,7 +4,7 @@
|
||||
# subdirectories here. Add them in the ".gitignore" file
|
||||
# in that subdirectory instead.
|
||||
#
|
||||
# NOTE! Please use 'git ls-files -i --exclude-standard'
|
||||
# NOTE! Please use 'git ls-files -i -c --exclude-per-directory=.gitignore'
|
||||
# command after changing this file, to see if there are
|
||||
# any tracked files which get ignored after the change.
|
||||
#
|
||||
@@ -16,6 +16,7 @@
|
||||
*.bin
|
||||
*.bz2
|
||||
*.c.[012]*.*
|
||||
*.cover
|
||||
*.dt.yaml
|
||||
*.dtb
|
||||
*.dtbo
|
||||
@@ -33,6 +34,7 @@
|
||||
*.lz4
|
||||
*.lzma
|
||||
*.lzo
|
||||
*.mbx
|
||||
*.mod
|
||||
*.mod.c
|
||||
*.o
|
||||
|
15
.mailmap
15
.mailmap
@@ -25,6 +25,8 @@ Aleksey Gorelov <aleksey_gorelov@phoenix.com>
|
||||
Alexander Lobakin <alobakin@pm.me> <alobakin@dlink.ru>
|
||||
Alexander Lobakin <alobakin@pm.me> <alobakin@marvell.com>
|
||||
Alexander Lobakin <alobakin@pm.me> <bloodyreaper@yandex.ru>
|
||||
Alexander Mikhalitsyn <alexander@mihalicyn.com> <alexander.mikhalitsyn@virtuozzo.com>
|
||||
Alexander Mikhalitsyn <alexander@mihalicyn.com> <aleksandr.mikhalitsyn@canonical.com>
|
||||
Alexandre Belloni <alexandre.belloni@bootlin.com> <alexandre.belloni@free-electrons.com>
|
||||
Alexei Starovoitov <ast@kernel.org> <alexei.starovoitov@gmail.com>
|
||||
Alexei Starovoitov <ast@kernel.org> <ast@fb.com>
|
||||
@@ -119,6 +121,7 @@ Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@gmail.com>
|
||||
Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@imgtec.com>
|
||||
Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@mips.com>
|
||||
<dev.kurt@vandijck-laurijssen.be> <kurt.van.dijck@eia.be>
|
||||
Dikshita Agarwal <dikshita@qti.qualcomm.com> <dikshita@codeaurora.org>
|
||||
Dmitry Baryshkov <dbaryshkov@gmail.com>
|
||||
Dmitry Baryshkov <dbaryshkov@gmail.com> <[dbaryshkov@gmail.com]>
|
||||
Dmitry Baryshkov <dbaryshkov@gmail.com> <dmitry_baryshkov@mentor.com>
|
||||
@@ -133,6 +136,9 @@ Erik Kaneda <erik.kaneda@intel.com> <erik.schmauss@intel.com>
|
||||
Eugen Hristev <eugen.hristev@collabora.com> <eugen.hristev@microchip.com>
|
||||
Evgeniy Polyakov <johnpol@2ka.mipt.ru>
|
||||
Ezequiel Garcia <ezequiel@vanguardiasur.com.ar> <ezequiel@collabora.com>
|
||||
Faith Ekstrand <faith.ekstrand@collabora.com> <jason@jlekstrand.net>
|
||||
Faith Ekstrand <faith.ekstrand@collabora.com> <jason.ekstrand@intel.com>
|
||||
Faith Ekstrand <faith.ekstrand@collabora.com> <jason.ekstrand@collabora.com>
|
||||
Felipe W Damasio <felipewd@terra.com.br>
|
||||
Felix Kuhling <fxkuehl@gmx.de>
|
||||
Felix Moeller <felix@derklecks.de>
|
||||
@@ -148,6 +154,7 @@ Gao Xiang <xiang@kernel.org> <gaoxiang25@huawei.com>
|
||||
Gao Xiang <xiang@kernel.org> <hsiangkao@aol.com>
|
||||
Gao Xiang <xiang@kernel.org> <hsiangkao@linux.alibaba.com>
|
||||
Gao Xiang <xiang@kernel.org> <hsiangkao@redhat.com>
|
||||
Georgi Djakov <djakov@kernel.org> <georgi.djakov@linaro.org>
|
||||
Gerald Schaefer <gerald.schaefer@linux.ibm.com> <geraldsc@de.ibm.com>
|
||||
Gerald Schaefer <gerald.schaefer@linux.ibm.com> <gerald.schaefer@de.ibm.com>
|
||||
Gerald Schaefer <gerald.schaefer@linux.ibm.com> <geraldsc@linux.vnet.ibm.com>
|
||||
@@ -302,6 +309,8 @@ Mauro Carvalho Chehab <mchehab@kernel.org> <mchehab@osg.samsung.com>
|
||||
Mauro Carvalho Chehab <mchehab@kernel.org> <mchehab@redhat.com>
|
||||
Mauro Carvalho Chehab <mchehab@kernel.org> <m.chehab@samsung.com>
|
||||
Mauro Carvalho Chehab <mchehab@kernel.org> <mchehab@s-opensource.com>
|
||||
Maxim Mikityanskiy <maxtram95@gmail.com> <maximmi@mellanox.com>
|
||||
Maxim Mikityanskiy <maxtram95@gmail.com> <maximmi@nvidia.com>
|
||||
Maxime Ripard <mripard@kernel.org> <maxime.ripard@bootlin.com>
|
||||
Maxime Ripard <mripard@kernel.org> <maxime.ripard@free-electrons.com>
|
||||
Mayuresh Janorkar <mayur@ti.com>
|
||||
@@ -407,7 +416,10 @@ Shuah Khan <shuah@kernel.org> <shuah.kh@samsung.com>
|
||||
Simon Arlott <simon@octiron.net> <simon@fire.lp0.eu>
|
||||
Simon Kelley <simon@thekelleys.org.uk>
|
||||
Stéphane Witzmann <stephane.witzmann@ubpmes.univ-bpclermont.fr>
|
||||
Stephen Hemminger <shemminger@osdl.org>
|
||||
Stephen Hemminger <stephen@networkplumber.org> <shemminger@linux-foundation.org>
|
||||
Stephen Hemminger <stephen@networkplumber.org> <shemminger@osdl.org>
|
||||
Stephen Hemminger <stephen@networkplumber.org> <sthemmin@microsoft.com>
|
||||
Stephen Hemminger <stephen@networkplumber.org> <sthemmin@vyatta.com>
|
||||
Steve Wise <larrystevenwise@gmail.com> <swise@chelsio.com>
|
||||
Steve Wise <larrystevenwise@gmail.com> <swise@opengridcomputing.com>
|
||||
Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
|
||||
@@ -439,6 +451,7 @@ Vasily Averin <vasily.averin@linux.dev> <vvs@openvz.org>
|
||||
Vasily Averin <vasily.averin@linux.dev> <vvs@parallels.com>
|
||||
Vasily Averin <vasily.averin@linux.dev> <vvs@sw.ru>
|
||||
Valentin Schneider <vschneid@redhat.com> <valentin.schneider@arm.com>
|
||||
Vikash Garodia <quic_vgarodia@quicinc.com> <vgarodia@codeaurora.org>
|
||||
Vinod Koul <vkoul@kernel.org> <vinod.koul@intel.com>
|
||||
Vinod Koul <vkoul@kernel.org> <vinod.koul@linux.intel.com>
|
||||
Vinod Koul <vkoul@kernel.org> <vkoul@infradead.org>
|
||||
|
6
CREDITS
6
CREDITS
@@ -1852,11 +1852,11 @@ E: ajoshi@shell.unixbox.com
|
||||
D: fbdev hacking
|
||||
|
||||
N: Jesper Juhl
|
||||
E: jj@chaosbits.net
|
||||
E: jesperjuhl76@gmail.com
|
||||
D: Various fixes, cleanups and minor features all over the tree.
|
||||
D: Wrote initial version of the hdaps driver (since passed on to others).
|
||||
S: Lemnosvej 1, 3.tv
|
||||
S: 2300 Copenhagen S.
|
||||
S: Titangade 5G, 2.tv
|
||||
S: 2200 Copenhagen N.
|
||||
S: Denmark
|
||||
|
||||
N: Jozsef Kadlecsik
|
||||
|
@@ -432,7 +432,8 @@ Contact: linux-block@vger.kernel.org
|
||||
Description:
|
||||
[RW] This is the maximum number of kilobytes that the block
|
||||
layer will allow for a filesystem request. Must be smaller than
|
||||
or equal to the maximum size allowed by the hardware.
|
||||
or equal to the maximum size allowed by the hardware. Write 0
|
||||
to use default kernel settings.
|
||||
|
||||
|
||||
What: /sys/block/<disk>/queue/max_segment_size
|
||||
@@ -704,6 +705,15 @@ Description:
|
||||
zoned will report "none".
|
||||
|
||||
|
||||
What: /sys/block/<disk>/hidden
|
||||
Date: March 2023
|
||||
Contact: linux-block@vger.kernel.org
|
||||
Description:
|
||||
[RO] the block device is hidden. it doesn’t produce events, and
|
||||
can’t be opened from userspace or using blkdev_get*.
|
||||
Used for the underlying components of multipath devices.
|
||||
|
||||
|
||||
What: /sys/block/<disk>/stat
|
||||
Date: February 2008
|
||||
Contact: Jerome Marchand <jmarchan@redhat.com>
|
||||
|
@@ -182,3 +182,42 @@ Date: November 2021
|
||||
Contact: Jarkko Sakkinen <jarkko@kernel.org>
|
||||
Description:
|
||||
The total amount of SGX physical memory in bytes.
|
||||
|
||||
What: /sys/devices/system/node/nodeX/memory_failure/total
|
||||
Date: January 2023
|
||||
Contact: Jiaqi Yan <jiaqiyan@google.com>
|
||||
Description:
|
||||
The total number of raw poisoned pages (pages containing
|
||||
corrupted data due to memory errors) on a NUMA node.
|
||||
|
||||
What: /sys/devices/system/node/nodeX/memory_failure/ignored
|
||||
Date: January 2023
|
||||
Contact: Jiaqi Yan <jiaqiyan@google.com>
|
||||
Description:
|
||||
Of the raw poisoned pages on a NUMA node, how many pages are
|
||||
ignored by memory error recovery attempt, usually because
|
||||
support for this type of pages is unavailable, and kernel
|
||||
gives up the recovery.
|
||||
|
||||
What: /sys/devices/system/node/nodeX/memory_failure/failed
|
||||
Date: January 2023
|
||||
Contact: Jiaqi Yan <jiaqiyan@google.com>
|
||||
Description:
|
||||
Of the raw poisoned pages on a NUMA node, how many pages are
|
||||
failed by memory error recovery attempt. This usually means
|
||||
a key recovery operation failed.
|
||||
|
||||
What: /sys/devices/system/node/nodeX/memory_failure/delayed
|
||||
Date: January 2023
|
||||
Contact: Jiaqi Yan <jiaqiyan@google.com>
|
||||
Description:
|
||||
Of the raw poisoned pages on a NUMA node, how many pages are
|
||||
delayed by memory error recovery attempt. Delayed poisoned
|
||||
pages usually will be retried by kernel.
|
||||
|
||||
What: /sys/devices/system/node/nodeX/memory_failure/recovered
|
||||
Date: January 2023
|
||||
Contact: Jiaqi Yan <jiaqiyan@google.com>
|
||||
Description:
|
||||
Of the raw poisoned pages on a NUMA node, how many pages are
|
||||
recovered by memory error recovery attempt.
|
||||
|
@@ -522,7 +522,6 @@ Description: These files allow to each of ASICs by writing 1.
|
||||
|
||||
The files are write only.
|
||||
|
||||
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/comm_chnl_ready
|
||||
Date: July 2022
|
||||
KernelVersion: 5.20
|
||||
@@ -542,3 +541,124 @@ Description: The file indicates COME module hardware configuration.
|
||||
The purpose is to expose some minor BOM changes for the same system SKU.
|
||||
|
||||
The file is read only.
|
||||
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_pwr_converter_fail
|
||||
Date: February 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: Vadim Pasternak <vadimp@nvidia.com>
|
||||
Description: This file shows the system reset cause due to power converter
|
||||
devices failure.
|
||||
Value 1 in file means this is reset cause, 0 - otherwise.
|
||||
|
||||
The file is read only.
|
||||
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/erot1_ap_reset
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/erot2_ap_reset
|
||||
Date: February 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: Vadim Pasternak <vadimp@nvidia.com>
|
||||
Description: These files aim to monitor the status of the External Root of Trust (EROT)
|
||||
processor's RESET output to the Application Processor (AP).
|
||||
By reading this file, could be determined if the EROT has invalidated or
|
||||
revoked AP Firmware, at which point it will hold the AP in RESET until a
|
||||
valid firmware is loaded. This protects the AP from running an
|
||||
unauthorized firmware. In the normal flow, the AP reset should be released
|
||||
after the EROT validates the integrity of the FW, and it should be done so
|
||||
as quickly as possible so that the AP boots before the CPU starts to
|
||||
communicate to each ASIC.
|
||||
|
||||
The files are read only.
|
||||
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/erot1_recovery
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/erot2_recovery
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/erot1_reset
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/erot2_reset
|
||||
Date: February 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: Vadim Pasternak <vadimp@nvidia.com>
|
||||
Description: These files aim to perform External Root of Trust (EROT) recovery
|
||||
sequence after EROT device failure.
|
||||
These EROT devices protect ASICs from unauthorized access and in normal
|
||||
flow their reset should be released with system power – earliest power
|
||||
up stage, so that EROTs can begin boot and authentication process before
|
||||
CPU starts to communicate to ASICs.
|
||||
Issuing a reset to the EROT while asserting the recovery signal will cause
|
||||
the EROT Application Processor to enter recovery mode so that the EROT FW
|
||||
can be updated/recovered.
|
||||
For reset/recovery the related file should be toggled by 1/0.
|
||||
|
||||
The files are read/write.
|
||||
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/erot1_wp
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/erot2_wp
|
||||
Date: February 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: Vadim Pasternak <vadimp@nvidia.com>
|
||||
Description: These files allow access to External Root of Trust (EROT) for reset
|
||||
and recovery sequence after EROT device failure.
|
||||
Default is 0 (programming disabled).
|
||||
If the system is in locked-down mode writing this file will not be allowed.
|
||||
|
||||
The files are read/write.
|
||||
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/spi_chnl_select
|
||||
Date: February 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: Vadim Pasternak <vadimp@nvidia.com>
|
||||
Description: This file allows SPI chip selection for External Root of Trust (EROT)
|
||||
device Out-of-Band recovery.
|
||||
File can be written with 0 or with 1. It selects which EROT can be accessed
|
||||
through SPI device.
|
||||
|
||||
The file is read/write.
|
||||
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/asic_pg_fail
|
||||
Date: February 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: Vadim Pasternak vadimp@nvidia.com
|
||||
Description: This file shows ASIC Power Good status.
|
||||
Value 1 in file means ASIC Power Good failed, 0 - otherwise.
|
||||
|
||||
The file is read only.
|
||||
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/clk_brd1_boot_fail
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/clk_brd2_boot_fail
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/clk_brd_fail
|
||||
Date: February 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: Vadim Pasternak vadimp@nvidia.com
|
||||
Description: These files are related to clock boards status in system.
|
||||
- clk_brd1_boot_fail: warning about 1-st clock board failed to boot from CI.
|
||||
- clk_brd2_boot_fail: warning about 2-nd clock board failed to boot from CI.
|
||||
- clk_brd_fail: error about common clock board boot failure.
|
||||
|
||||
The files are read only.
|
||||
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/clk_brd_prog_en
|
||||
Date: February 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: Vadim Pasternak <vadimp@nvidia.com>
|
||||
Description: This file enables programming of clock boards.
|
||||
Default is 0 (programming disabled).
|
||||
If the system is in locked-down mode writing this file will not be allowed.
|
||||
|
||||
The file is read/write.
|
||||
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/pwr_converter_prog_en
|
||||
Date: February 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: Vadim Pasternak <vadimp@nvidia.com>
|
||||
Description: This file enables programming of power converters.
|
||||
Default is 0 (programming disabled).
|
||||
If the system is in locked-down mode writing this file will not be allowed.
|
||||
|
||||
The file is read/write.
|
||||
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_ac_ok_fail
|
||||
Date: February 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: Vadim Pasternak <vadimp@nvidia.com>
|
||||
Description: This file shows the system reset cause due to AC power failure.
|
||||
Value 1 in file means this is reset cause, 0 - otherwise.
|
||||
|
||||
The file is read only.
|
||||
|
@@ -120,3 +120,16 @@ Contact: xen-devel@lists.xenproject.org
|
||||
Description: If running under Xen:
|
||||
The Xen version is in the format <major>.<minor><extra>
|
||||
This is the <minor> part of it.
|
||||
|
||||
What: /sys/hypervisor/start_flags/*
|
||||
Date: March 2023
|
||||
KernelVersion: 6.3.0
|
||||
Contact: xen-devel@lists.xenproject.org
|
||||
Description: If running under Xen:
|
||||
All bits in Xen's start-flags are represented as
|
||||
boolean files, returning '1' if set, '0' otherwise.
|
||||
This takes the place of the defunct /proc/xen/capabilities,
|
||||
which would contain "control_d" on dom0, and be empty
|
||||
otherwise. This flag is now exposed as "initdomain" in
|
||||
addition to the "privileged" flag; all other possible flags
|
||||
are accessible as "unknownXX".
|
||||
|
@@ -143,3 +143,16 @@ Description:
|
||||
qw_sign an identifier to be reported as "OS String"
|
||||
proper
|
||||
============= ===============================================
|
||||
|
||||
What: /config/usb-gadget/gadget/webusb
|
||||
Date: Dec 2022
|
||||
KernelVersion: 6.3
|
||||
Description:
|
||||
This group contains "WebUSB" extension handling attributes.
|
||||
|
||||
============= ===============================================
|
||||
use flag turning "WebUSB" support on/off
|
||||
bcdVersion bcd WebUSB specification version number
|
||||
bVendorCode one-byte value used for custom per-device
|
||||
landingPage UTF-8 encoded URL of the device's landing page
|
||||
============= ===============================================
|
||||
|
@@ -15,12 +15,14 @@ Date: Dec 2014
|
||||
KernelVersion: 4.0
|
||||
Description: Control descriptors
|
||||
|
||||
All attributes read only:
|
||||
All attributes read only except enable_interrupt_ep:
|
||||
|
||||
================ =============================
|
||||
=================== =============================
|
||||
bInterfaceNumber USB interface number for this
|
||||
streaming interface
|
||||
================ =============================
|
||||
enable_interrupt_ep flag to enable the interrupt
|
||||
endpoint for the VC interface
|
||||
=================== =============================
|
||||
|
||||
What: /config/usb-gadget/gadget/functions/uvc.name/control/class
|
||||
Date: Dec 2014
|
||||
@@ -52,7 +54,7 @@ Date: Dec 2014
|
||||
KernelVersion: 4.0
|
||||
Description: Default output terminal descriptors
|
||||
|
||||
All attributes read only:
|
||||
All attributes read only except bSourceID:
|
||||
|
||||
============== =============================================
|
||||
iTerminal index of string descriptor
|
||||
@@ -111,6 +113,34 @@ Description: Default processing unit descriptors
|
||||
bUnitID a non-zero id of this unit
|
||||
=============== ========================================
|
||||
|
||||
What: /config/usb-gadget/gadget/functions/uvc.name/control/extensions
|
||||
Date: Nov 2022
|
||||
KernelVersion: 6.1
|
||||
Description: Extension unit descriptors
|
||||
|
||||
What: /config/usb-gadget/gadget/functions/uvc.name/control/extensions/name
|
||||
Date: Nov 2022
|
||||
KernelVersion: 6.1
|
||||
Description: Extension Unit (XU) Descriptor
|
||||
|
||||
bLength, bUnitID and iExtension are read-only. All others are
|
||||
read-write.
|
||||
|
||||
================= ========================================
|
||||
bLength size of the descriptor in bytes
|
||||
bUnitID non-zero ID of this unit
|
||||
guidExtensionCode Vendor-specific code identifying the XU
|
||||
bNumControls number of controls in this XU
|
||||
bNrInPins number of input pins for this unit
|
||||
baSourceID list of the IDs of the units or terminals
|
||||
to which this XU is connected
|
||||
bControlSize size of the bmControls field in bytes
|
||||
bmControls list of bitmaps detailing which vendor
|
||||
specific controls are supported
|
||||
iExtension index of a string descriptor that describes
|
||||
this extension unit
|
||||
================= ========================================
|
||||
|
||||
What: /config/usb-gadget/gadget/functions/uvc.name/control/header
|
||||
Date: Dec 2014
|
||||
KernelVersion: 4.0
|
||||
@@ -165,7 +195,24 @@ Date: Dec 2014
|
||||
KernelVersion: 4.0
|
||||
Description: Default color matching descriptors
|
||||
|
||||
All attributes read only:
|
||||
All attributes read/write:
|
||||
|
||||
======================== ======================================
|
||||
bMatrixCoefficients matrix used to compute luma and
|
||||
chroma values from the color primaries
|
||||
bTransferCharacteristics optoelectronic transfer
|
||||
characteristic of the source picture,
|
||||
also called the gamma function
|
||||
bColorPrimaries color primaries and the reference
|
||||
white
|
||||
======================== ======================================
|
||||
|
||||
What: /config/usb-gadget/gadget/functions/uvc.name/streaming/color_matching/name
|
||||
Date: Dec 2022
|
||||
KernelVersion: 6.3
|
||||
Description: Additional color matching descriptors
|
||||
|
||||
All attributes read/write:
|
||||
|
||||
======================== ======================================
|
||||
bMatrixCoefficients matrix used to compute luma and
|
||||
|
127
Documentation/ABI/testing/debugfs-driver-dcc
Normal file
127
Documentation/ABI/testing/debugfs-driver-dcc
Normal file
@@ -0,0 +1,127 @@
|
||||
What: /sys/kernel/debug/dcc/.../ready
|
||||
Date: December 2022
|
||||
Contact: Souradeep Chowdhury <quic_schowdhu@quicinc.com>
|
||||
Description:
|
||||
This file is used to check the status of the dcc
|
||||
hardware if it's ready to receive user configurations.
|
||||
A 'Y' here indicates dcc is ready.
|
||||
|
||||
What: /sys/kernel/debug/dcc/.../trigger
|
||||
Date: December 2022
|
||||
Contact: Souradeep Chowdhury <quic_schowdhu@quicinc.com>
|
||||
Description:
|
||||
This is the debugfs interface for manual software
|
||||
triggers. The trigger can be invoked by writing '1'
|
||||
to the file.
|
||||
|
||||
What: /sys/kernel/debug/dcc/.../config_reset
|
||||
Date: December 2022
|
||||
Contact: Souradeep Chowdhury <quic_schowdhu@quicinc.com>
|
||||
Description:
|
||||
This file is used to reset the configuration of
|
||||
a dcc driver to the default configuration. When '1'
|
||||
is written to the file, all the previous addresses
|
||||
stored in the driver gets removed and users need to
|
||||
reconfigure addresses again.
|
||||
|
||||
What: /sys/kernel/debug/dcc/.../[list-number]/config
|
||||
Date: December 2022
|
||||
Contact: Souradeep Chowdhury <quic_schowdhu@quicinc.com>
|
||||
Description:
|
||||
This stores the addresses of the registers which
|
||||
can be read in case of a hardware crash or manual
|
||||
software triggers. The input addresses type
|
||||
can be one of following dcc instructions: read,
|
||||
write, read-write, and loop type. The lists need to
|
||||
be configured sequentially and not in a overlapping
|
||||
manner; e.g. users can jump to list x only after
|
||||
list y is configured and enabled. The input format for
|
||||
each type is as follows:
|
||||
|
||||
i) Read instruction
|
||||
|
||||
::
|
||||
|
||||
echo R <addr> <n> <bus> >/sys/kernel/debug/dcc/../[list-number]/config
|
||||
|
||||
where:
|
||||
|
||||
<addr>
|
||||
The address to be read.
|
||||
|
||||
<n>
|
||||
The addresses word count, starting from address <1>.
|
||||
Each word is 32 bits (4 bytes). If omitted, defaulted
|
||||
to 1.
|
||||
|
||||
<bus type>
|
||||
The bus type, which can be either 'apb' or 'ahb'.
|
||||
The default is 'ahb' if leaved out.
|
||||
|
||||
ii) Write instruction
|
||||
|
||||
::
|
||||
|
||||
echo W <addr> <n> <bus type> > /sys/kernel/debug/dcc/../[list-number]/config
|
||||
|
||||
where:
|
||||
|
||||
<addr>
|
||||
The address to be written.
|
||||
|
||||
<n>
|
||||
The value to be written at <addr>.
|
||||
|
||||
<bus type>
|
||||
The bus type, which can be either 'apb' or 'ahb'.
|
||||
|
||||
iii) Read-write instruction
|
||||
|
||||
::
|
||||
|
||||
echo RW <addr> <n> <mask> > /sys/kernel/debug/dcc/../[list-number]/config
|
||||
|
||||
where:
|
||||
|
||||
<addr>
|
||||
The address to be read and written.
|
||||
|
||||
<n>
|
||||
The value to be written at <addr>.
|
||||
|
||||
<mask>
|
||||
The value mask.
|
||||
|
||||
iv) Loop instruction
|
||||
|
||||
::
|
||||
|
||||
echo L <loop count> <address count> <address>... > /sys/kernel/debug/dcc/../[list-number]/config
|
||||
|
||||
where:
|
||||
|
||||
<loop count>
|
||||
Number of iterations
|
||||
|
||||
<address count>
|
||||
total number of addresses to be written
|
||||
|
||||
<address>
|
||||
Space-separated list of addresses.
|
||||
|
||||
What: /sys/kernel/debug/dcc/.../[list-number]/enable
|
||||
Date: December 2022
|
||||
Contact: Souradeep Chowdhury <quic_schowdhu@quicinc.com>
|
||||
Description:
|
||||
This debugfs interface is used for enabling the
|
||||
the dcc hardware. A file named "enable" is in the
|
||||
directory list number where users can enable/disable
|
||||
the specific list by writing boolean (1 or 0) to the
|
||||
file.
|
||||
|
||||
On enabling the dcc, all the addresses specified
|
||||
by the user for the corresponding list is written
|
||||
into dcc sram which is read by the dcc hardware
|
||||
on manual or crash induced triggers. Lists must
|
||||
be configured and enabled sequentially, e.g. list
|
||||
2 can only be enabled when list 1 have so.
|
70
Documentation/ABI/testing/debugfs-scmi
Normal file
70
Documentation/ABI/testing/debugfs-scmi
Normal file
@@ -0,0 +1,70 @@
|
||||
What: /sys/kernel/debug/scmi/<n>/instance_name
|
||||
Date: March 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: cristian.marussi@arm.com
|
||||
Description: The name of the underlying SCMI instance <n> described by
|
||||
all the debugfs accessors rooted at /sys/kernel/debug/scmi/<n>,
|
||||
expressed as the full name of the top DT SCMI node under which
|
||||
this SCMI instance is rooted.
|
||||
Users: Debugging, any userspace test suite
|
||||
|
||||
What: /sys/kernel/debug/scmi/<n>/atomic_threshold_us
|
||||
Date: March 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: cristian.marussi@arm.com
|
||||
Description: An optional time value, expressed in microseconds, representing,
|
||||
on this SCMI instance <n>, the threshold above which any SCMI
|
||||
command, advertised to have an higher-than-threshold execution
|
||||
latency, should not be considered for atomic mode of operation,
|
||||
even if requested.
|
||||
Users: Debugging, any userspace test suite
|
||||
|
||||
What: /sys/kernel/debug/scmi/<n>/transport/type
|
||||
Date: March 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: cristian.marussi@arm.com
|
||||
Description: A string representing the type of transport configured for this
|
||||
SCMI instance <n>.
|
||||
Users: Debugging, any userspace test suite
|
||||
|
||||
What: /sys/kernel/debug/scmi/<n>/transport/is_atomic
|
||||
Date: March 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: cristian.marussi@arm.com
|
||||
Description: A boolean stating if the transport configured on the underlying
|
||||
SCMI instance <n> is capable of atomic mode of operation.
|
||||
Users: Debugging, any userspace test suite
|
||||
|
||||
What: /sys/kernel/debug/scmi/<n>/transport/max_rx_timeout_ms
|
||||
Date: March 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: cristian.marussi@arm.com
|
||||
Description: Timeout in milliseconds allowed for SCMI synchronous replies
|
||||
for the currently configured SCMI transport for instance <n>.
|
||||
Users: Debugging, any userspace test suite
|
||||
|
||||
What: /sys/kernel/debug/scmi/<n>/transport/max_msg_size
|
||||
Date: March 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: cristian.marussi@arm.com
|
||||
Description: Max message size of allowed SCMI messages for the currently
|
||||
configured SCMI transport for instance <n>.
|
||||
Users: Debugging, any userspace test suite
|
||||
|
||||
What: /sys/kernel/debug/scmi/<n>/transport/tx_max_msg
|
||||
Date: March 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: cristian.marussi@arm.com
|
||||
Description: Max number of concurrently allowed in-flight SCMI messages for
|
||||
the currently configured SCMI transport for instance <n> on the
|
||||
TX channels.
|
||||
Users: Debugging, any userspace test suite
|
||||
|
||||
What: /sys/kernel/debug/scmi/<n>/transport/rx_max_msg
|
||||
Date: March 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: cristian.marussi@arm.com
|
||||
Description: Max number of concurrently allowed in-flight SCMI messages for
|
||||
the currently configured SCMI transport for instance <n> on the
|
||||
RX channels.
|
||||
Users: Debugging, any userspace test suite
|
117
Documentation/ABI/testing/debugfs-scmi-raw
Normal file
117
Documentation/ABI/testing/debugfs-scmi-raw
Normal file
@@ -0,0 +1,117 @@
|
||||
What: /sys/kernel/debug/scmi/<n>/raw/message
|
||||
Date: March 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: cristian.marussi@arm.com
|
||||
Description: SCMI Raw synchronous message injection/snooping facility; write
|
||||
a complete SCMI synchronous command message (header included)
|
||||
in little-endian binary format to have it sent to the configured
|
||||
backend SCMI server for instance <n>.
|
||||
Any subsequently received response can be read from this same
|
||||
entry if it arrived within the configured timeout.
|
||||
Each write to the entry causes one command request to be built
|
||||
and sent while the replies are read back one message at time
|
||||
(receiving an EOF at each message boundary).
|
||||
Users: Debugging, any userspace test suite
|
||||
|
||||
What: /sys/kernel/debug/scmi/<n>/raw/message_async
|
||||
Date: March 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: cristian.marussi@arm.com
|
||||
Description: SCMI Raw asynchronous message injection/snooping facility; write
|
||||
a complete SCMI asynchronous command message (header included)
|
||||
in little-endian binary format to have it sent to the configured
|
||||
backend SCMI server for instance <n>.
|
||||
Any subsequently received response can be read from this same
|
||||
entry if it arrived within the configured timeout.
|
||||
Any additional delayed response received afterwards can be read
|
||||
from this same entry too if it arrived within the configured
|
||||
timeout.
|
||||
Each write to the entry causes one command request to be built
|
||||
and sent while the replies are read back one message at time
|
||||
(receiving an EOF at each message boundary).
|
||||
Users: Debugging, any userspace test suite
|
||||
|
||||
What: /sys/kernel/debug/scmi/<n>/raw/errors
|
||||
Date: March 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: cristian.marussi@arm.com
|
||||
Description: SCMI Raw message errors facility; any kind of timed-out or
|
||||
generally unexpectedly received SCMI message, for instance <n>,
|
||||
can be read from this entry.
|
||||
Each read gives back one message at time (receiving an EOF at
|
||||
each message boundary).
|
||||
Users: Debugging, any userspace test suite
|
||||
|
||||
What: /sys/kernel/debug/scmi/<n>/raw/notification
|
||||
Date: March 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: cristian.marussi@arm.com
|
||||
Description: SCMI Raw notification snooping facility; any notification
|
||||
emitted by the backend SCMI server, for instance <n>, can be
|
||||
read from this entry.
|
||||
Each read gives back one message at time (receiving an EOF at
|
||||
each message boundary).
|
||||
Users: Debugging, any userspace test suite
|
||||
|
||||
What: /sys/kernel/debug/scmi/<n>/raw/reset
|
||||
Date: March 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: cristian.marussi@arm.com
|
||||
Description: SCMI Raw stack reset facility; writing a value to this entry
|
||||
causes the internal queues of any kind of received message,
|
||||
still pending to be read out for instance <n>, to be immediately
|
||||
flushed.
|
||||
Can be used to reset and clean the SCMI Raw stack between to
|
||||
different test-run.
|
||||
Users: Debugging, any userspace test suite
|
||||
|
||||
What: /sys/kernel/debug/scmi/<n>/raw/channels/<m>/message
|
||||
Date: March 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: cristian.marussi@arm.com
|
||||
Description: SCMI Raw synchronous message injection/snooping facility; write
|
||||
a complete SCMI synchronous command message (header included)
|
||||
in little-endian binary format to have it sent to the configured
|
||||
backend SCMI server for instance <n> through the <m> transport
|
||||
channel.
|
||||
Any subsequently received response can be read from this same
|
||||
entry if it arrived on channel <m> within the configured
|
||||
timeout.
|
||||
Each write to the entry causes one command request to be built
|
||||
and sent while the replies are read back one message at time
|
||||
(receiving an EOF at each message boundary).
|
||||
Channel identifier <m> matches the SCMI protocol number which
|
||||
has been associated with this transport channel in the DT
|
||||
description, with base protocol number 0x10 being the default
|
||||
channel for this instance.
|
||||
Note that these per-channel entries rooted at <..>/channels
|
||||
exist only if the transport is configured to have more than
|
||||
one default channel.
|
||||
Users: Debugging, any userspace test suite
|
||||
|
||||
What: /sys/kernel/debug/scmi/<n>/raw/channels/<m>/message_async
|
||||
Date: March 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: cristian.marussi@arm.com
|
||||
Description: SCMI Raw asynchronous message injection/snooping facility; write
|
||||
a complete SCMI asynchronous command message (header included)
|
||||
in little-endian binary format to have it sent to the configured
|
||||
backend SCMI server for instance <n> through the <m> transport
|
||||
channel.
|
||||
Any subsequently received response can be read from this same
|
||||
entry if it arrived on channel <m> within the configured
|
||||
timeout.
|
||||
Any additional delayed response received afterwards can be read
|
||||
from this same entry too if it arrived within the configured
|
||||
timeout.
|
||||
Each write to the entry causes one command request to be built
|
||||
and sent while the replies are read back one message at time
|
||||
(receiving an EOF at each message boundary).
|
||||
Channel identifier <m> matches the SCMI protocol number which
|
||||
has been associated with this transport channel in the DT
|
||||
description, with base protocol number 0x10 being the default
|
||||
channel for this instance.
|
||||
Note that these per-channel entries rooted at <..>/channels
|
||||
exist only if the transport is configured to have more than
|
||||
one default channel.
|
||||
Users: Debugging, any userspace test suite
|
@@ -35,7 +35,7 @@ Description:
|
||||
[FIRMWARE_CHECK]
|
||||
[KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK]
|
||||
[KEXEC_CMDLINE] [KEY_CHECK] [CRITICAL_DATA]
|
||||
[SETXATTR_CHECK]
|
||||
[SETXATTR_CHECK][MMAP_CHECK_REQPROT]
|
||||
mask:= [[^]MAY_READ] [[^]MAY_WRITE] [[^]MAY_APPEND]
|
||||
[[^]MAY_EXEC]
|
||||
fsmagic:= hex value
|
||||
|
@@ -236,7 +236,7 @@ What: /sys/bus/coresight/devices/<memory_map>.[etm|ptm]/traceid
|
||||
Date: November 2014
|
||||
KernelVersion: 3.19
|
||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||
Description: (RW) Holds the trace ID that will appear in the trace stream
|
||||
Description: (RO) Holds the trace ID that will appear in the trace stream
|
||||
coming from this trace entity.
|
||||
|
||||
What: /sys/bus/coresight/devices/<memory_map>.[etm|ptm]/trigger_event
|
||||
|
13
Documentation/ABI/testing/sysfs-bus-coresight-devices-tpdm
Normal file
13
Documentation/ABI/testing/sysfs-bus-coresight-devices-tpdm
Normal file
@@ -0,0 +1,13 @@
|
||||
What: /sys/bus/coresight/devices/<tpdm-name>/integration_test
|
||||
Date: January 2023
|
||||
KernelVersion 6.2
|
||||
Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com>
|
||||
Description:
|
||||
(Write) Run integration test for tpdm. Integration test
|
||||
will generate test data for tpdm. It can help to make
|
||||
sure that the trace path is enabled and the link configurations
|
||||
are fine.
|
||||
|
||||
Accepts only one of the 2 values - 1 or 2.
|
||||
1 : Generate 64 bits data
|
||||
2 : Generate 32 bits data
|
@@ -0,0 +1,31 @@
|
||||
What: /sys/bus/coresight/devices/ultra_smb<N>/enable_sink
|
||||
Date: January 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: Junhao He <hejunhao3@huawei.com>
|
||||
Description: (RW) Add/remove a SMB device from a trace path. There can be
|
||||
multiple sources for a single SMB device.
|
||||
|
||||
What: /sys/bus/coresight/devices/ultra_smb<N>/mgmt/buf_size
|
||||
Date: January 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: Junhao He <hejunhao3@huawei.com>
|
||||
Description: (RO) Shows the buffer size of each UltraSoc SMB device.
|
||||
|
||||
What: /sys/bus/coresight/devices/ultra_smb<N>/mgmt/buf_status
|
||||
Date: January 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: Junhao He <hejunhao3@huawei.com>
|
||||
Description: (RO) Shows the value of UltraSoc SMB status register.
|
||||
BIT(0) is zero means buffer is empty.
|
||||
|
||||
What: /sys/bus/coresight/devices/ultra_smb<N>/mgmt/read_pos
|
||||
Date: January 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: Junhao He <hejunhao3@huawei.com>
|
||||
Description: (RO) Shows the value of UltraSoc SMB Read Pointer register.
|
||||
|
||||
What: /sys/bus/coresight/devices/ultra_smb<N>/mgmt/write_pos
|
||||
Date: January 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: Junhao He <hejunhao3@huawei.com>
|
||||
Description: (RO) Shows the value of UltraSoc SMB Write Pointer register.
|
@@ -1,22 +1,19 @@
|
||||
What: /sys/bus/css/devices/.../type
|
||||
Date: March 2008
|
||||
Contact: Cornelia Huck <cornelia.huck@de.ibm.com>
|
||||
linux-s390@vger.kernel.org
|
||||
Contact: linux-s390@vger.kernel.org
|
||||
Description: Contains the subchannel type, as reported by the hardware.
|
||||
This attribute is present for all subchannel types.
|
||||
|
||||
What: /sys/bus/css/devices/.../modalias
|
||||
Date: March 2008
|
||||
Contact: Cornelia Huck <cornelia.huck@de.ibm.com>
|
||||
linux-s390@vger.kernel.org
|
||||
Contact: linux-s390@vger.kernel.org
|
||||
Description: Contains the module alias as reported with uevents.
|
||||
It is of the format css:t<type> and present for all
|
||||
subchannel types.
|
||||
|
||||
What: /sys/bus/css/drivers/io_subchannel/.../chpids
|
||||
Date: December 2002
|
||||
Contact: Cornelia Huck <cornelia.huck@de.ibm.com>
|
||||
linux-s390@vger.kernel.org
|
||||
Contact: linux-s390@vger.kernel.org
|
||||
Description: Contains the ids of the channel paths used by this
|
||||
subchannel, as reported by the channel subsystem
|
||||
during subchannel recognition.
|
||||
@@ -26,8 +23,7 @@ Users: s390-tools, HAL
|
||||
|
||||
What: /sys/bus/css/drivers/io_subchannel/.../pimpampom
|
||||
Date: December 2002
|
||||
Contact: Cornelia Huck <cornelia.huck@de.ibm.com>
|
||||
linux-s390@vger.kernel.org
|
||||
Contact: linux-s390@vger.kernel.org
|
||||
Description: Contains the PIM/PAM/POM values, as reported by the
|
||||
channel subsystem when last queried by the common I/O
|
||||
layer (this implies that this attribute is not necessarily
|
||||
@@ -38,8 +34,7 @@ Users: s390-tools, HAL
|
||||
|
||||
What: /sys/bus/css/devices/.../driver_override
|
||||
Date: June 2019
|
||||
Contact: Cornelia Huck <cohuck@redhat.com>
|
||||
linux-s390@vger.kernel.org
|
||||
Contact: linux-s390@vger.kernel.org
|
||||
Description: This file allows the driver for a device to be specified. When
|
||||
specified, only a driver with a name matching the value written
|
||||
to driver_override will have an opportunity to bind to the
|
||||
|
@@ -90,6 +90,21 @@ Description:
|
||||
capability.
|
||||
|
||||
|
||||
What: /sys/bus/cxl/devices/{port,endpoint}X/parent_dport
|
||||
Date: January, 2023
|
||||
KernelVersion: v6.3
|
||||
Contact: linux-cxl@vger.kernel.org
|
||||
Description:
|
||||
(RO) CXL port objects are instantiated for each upstream port in
|
||||
a CXL/PCIe switch, and for each endpoint to map the
|
||||
corresponding memory device into the CXL port hierarchy. When a
|
||||
descendant CXL port (switch or endpoint) is enumerated it is
|
||||
useful to know which 'dport' object in the parent CXL port
|
||||
routes to this descendant. The 'parent_dport' symlink points to
|
||||
the device representing the downstream port of a CXL switch that
|
||||
routes to {port,endpoint}X.
|
||||
|
||||
|
||||
What: /sys/bus/cxl/devices/portX/dportY
|
||||
Date: June, 2021
|
||||
KernelVersion: v5.14
|
||||
@@ -183,7 +198,7 @@ Description:
|
||||
|
||||
What: /sys/bus/cxl/devices/endpointX/CDAT
|
||||
Date: July, 2022
|
||||
KernelVersion: v5.20
|
||||
KernelVersion: v6.0
|
||||
Contact: linux-cxl@vger.kernel.org
|
||||
Description:
|
||||
(RO) If this sysfs entry is not present no DOE mailbox was
|
||||
@@ -194,7 +209,7 @@ Description:
|
||||
|
||||
What: /sys/bus/cxl/devices/decoderX.Y/mode
|
||||
Date: May, 2022
|
||||
KernelVersion: v5.20
|
||||
KernelVersion: v6.0
|
||||
Contact: linux-cxl@vger.kernel.org
|
||||
Description:
|
||||
(RW) When a CXL decoder is of devtype "cxl_decoder_endpoint" it
|
||||
@@ -214,7 +229,7 @@ Description:
|
||||
|
||||
What: /sys/bus/cxl/devices/decoderX.Y/dpa_resource
|
||||
Date: May, 2022
|
||||
KernelVersion: v5.20
|
||||
KernelVersion: v6.0
|
||||
Contact: linux-cxl@vger.kernel.org
|
||||
Description:
|
||||
(RO) When a CXL decoder is of devtype "cxl_decoder_endpoint",
|
||||
@@ -225,7 +240,7 @@ Description:
|
||||
|
||||
What: /sys/bus/cxl/devices/decoderX.Y/dpa_size
|
||||
Date: May, 2022
|
||||
KernelVersion: v5.20
|
||||
KernelVersion: v6.0
|
||||
Contact: linux-cxl@vger.kernel.org
|
||||
Description:
|
||||
(RW) When a CXL decoder is of devtype "cxl_decoder_endpoint" it
|
||||
@@ -245,7 +260,7 @@ Description:
|
||||
|
||||
What: /sys/bus/cxl/devices/decoderX.Y/interleave_ways
|
||||
Date: May, 2022
|
||||
KernelVersion: v5.20
|
||||
KernelVersion: v6.0
|
||||
Contact: linux-cxl@vger.kernel.org
|
||||
Description:
|
||||
(RO) The number of targets across which this decoder's host
|
||||
@@ -260,7 +275,7 @@ Description:
|
||||
|
||||
What: /sys/bus/cxl/devices/decoderX.Y/interleave_granularity
|
||||
Date: May, 2022
|
||||
KernelVersion: v5.20
|
||||
KernelVersion: v6.0
|
||||
Contact: linux-cxl@vger.kernel.org
|
||||
Description:
|
||||
(RO) The number of consecutive bytes of host physical address
|
||||
@@ -270,25 +285,25 @@ Description:
|
||||
interleave_granularity).
|
||||
|
||||
|
||||
What: /sys/bus/cxl/devices/decoderX.Y/create_pmem_region
|
||||
Date: May, 2022
|
||||
KernelVersion: v5.20
|
||||
What: /sys/bus/cxl/devices/decoderX.Y/create_{pmem,ram}_region
|
||||
Date: May, 2022, January, 2023
|
||||
KernelVersion: v6.0 (pmem), v6.3 (ram)
|
||||
Contact: linux-cxl@vger.kernel.org
|
||||
Description:
|
||||
(RW) Write a string in the form 'regionZ' to start the process
|
||||
of defining a new persistent memory region (interleave-set)
|
||||
within the decode range bounded by root decoder 'decoderX.Y'.
|
||||
The value written must match the current value returned from
|
||||
reading this attribute. An atomic compare exchange operation is
|
||||
done on write to assign the requested id to a region and
|
||||
allocate the region-id for the next creation attempt. EBUSY is
|
||||
returned if the region name written does not match the current
|
||||
cached value.
|
||||
of defining a new persistent, or volatile memory region
|
||||
(interleave-set) within the decode range bounded by root decoder
|
||||
'decoderX.Y'. The value written must match the current value
|
||||
returned from reading this attribute. An atomic compare exchange
|
||||
operation is done on write to assign the requested id to a
|
||||
region and allocate the region-id for the next creation attempt.
|
||||
EBUSY is returned if the region name written does not match the
|
||||
current cached value.
|
||||
|
||||
|
||||
What: /sys/bus/cxl/devices/decoderX.Y/delete_region
|
||||
Date: May, 2022
|
||||
KernelVersion: v5.20
|
||||
KernelVersion: v6.0
|
||||
Contact: linux-cxl@vger.kernel.org
|
||||
Description:
|
||||
(WO) Write a string in the form 'regionZ' to delete that region,
|
||||
@@ -297,17 +312,18 @@ Description:
|
||||
|
||||
What: /sys/bus/cxl/devices/regionZ/uuid
|
||||
Date: May, 2022
|
||||
KernelVersion: v5.20
|
||||
KernelVersion: v6.0
|
||||
Contact: linux-cxl@vger.kernel.org
|
||||
Description:
|
||||
(RW) Write a unique identifier for the region. This field must
|
||||
be set for persistent regions and it must not conflict with the
|
||||
UUID of another region.
|
||||
UUID of another region. For volatile ram regions this
|
||||
attribute is a read-only empty string.
|
||||
|
||||
|
||||
What: /sys/bus/cxl/devices/regionZ/interleave_granularity
|
||||
Date: May, 2022
|
||||
KernelVersion: v5.20
|
||||
KernelVersion: v6.0
|
||||
Contact: linux-cxl@vger.kernel.org
|
||||
Description:
|
||||
(RW) Set the number of consecutive bytes each device in the
|
||||
@@ -318,7 +334,7 @@ Description:
|
||||
|
||||
What: /sys/bus/cxl/devices/regionZ/interleave_ways
|
||||
Date: May, 2022
|
||||
KernelVersion: v5.20
|
||||
KernelVersion: v6.0
|
||||
Contact: linux-cxl@vger.kernel.org
|
||||
Description:
|
||||
(RW) Configures the number of devices participating in the
|
||||
@@ -328,7 +344,7 @@ Description:
|
||||
|
||||
What: /sys/bus/cxl/devices/regionZ/size
|
||||
Date: May, 2022
|
||||
KernelVersion: v5.20
|
||||
KernelVersion: v6.0
|
||||
Contact: linux-cxl@vger.kernel.org
|
||||
Description:
|
||||
(RW) System physical address space to be consumed by the region.
|
||||
@@ -343,9 +359,20 @@ Description:
|
||||
results in the same address being allocated.
|
||||
|
||||
|
||||
What: /sys/bus/cxl/devices/regionZ/mode
|
||||
Date: January, 2023
|
||||
KernelVersion: v6.3
|
||||
Contact: linux-cxl@vger.kernel.org
|
||||
Description:
|
||||
(RO) The mode of a region is established at region creation time
|
||||
and dictates the mode of the endpoint decoder that comprise the
|
||||
region. For more details on the possible modes see
|
||||
/sys/bus/cxl/devices/decoderX.Y/mode
|
||||
|
||||
|
||||
What: /sys/bus/cxl/devices/regionZ/resource
|
||||
Date: May, 2022
|
||||
KernelVersion: v5.20
|
||||
KernelVersion: v6.0
|
||||
Contact: linux-cxl@vger.kernel.org
|
||||
Description:
|
||||
(RO) A region is a contiguous partition of a CXL root decoder
|
||||
@@ -357,7 +384,7 @@ Description:
|
||||
|
||||
What: /sys/bus/cxl/devices/regionZ/target[0..N]
|
||||
Date: May, 2022
|
||||
KernelVersion: v5.20
|
||||
KernelVersion: v6.0
|
||||
Contact: linux-cxl@vger.kernel.org
|
||||
Description:
|
||||
(RW) Write an endpoint decoder object name to 'targetX' where X
|
||||
@@ -376,7 +403,7 @@ Description:
|
||||
|
||||
What: /sys/bus/cxl/devices/regionZ/commit
|
||||
Date: May, 2022
|
||||
KernelVersion: v5.20
|
||||
KernelVersion: v6.0
|
||||
Contact: linux-cxl@vger.kernel.org
|
||||
Description:
|
||||
(RW) Write a boolean 'true' string value to this attribute to
|
||||
|
@@ -0,0 +1,37 @@
|
||||
What: /sys/bus/event_source/devices/dmar*/format
|
||||
Date: Jan 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: Kan Liang <kan.liang@linux.intel.com>
|
||||
Description: Read-only. Attribute group to describe the magic bits
|
||||
that go into perf_event_attr.config,
|
||||
perf_event_attr.config1 or perf_event_attr.config2 for
|
||||
the IOMMU pmu. (See also
|
||||
ABI/testing/sysfs-bus-event_source-devices-format).
|
||||
|
||||
Each attribute in this group defines a bit range in
|
||||
perf_event_attr.config, perf_event_attr.config1,
|
||||
or perf_event_attr.config2. All supported attributes
|
||||
are listed below (See the VT-d Spec 4.0 for possible
|
||||
attribute values)::
|
||||
|
||||
event = "config:0-27" - event ID
|
||||
event_group = "config:28-31" - event group ID
|
||||
|
||||
filter_requester_en = "config1:0" - Enable Requester ID filter
|
||||
filter_domain_en = "config1:1" - Enable Domain ID filter
|
||||
filter_pasid_en = "config1:2" - Enable PASID filter
|
||||
filter_ats_en = "config1:3" - Enable Address Type filter
|
||||
filter_page_table_en= "config1:4" - Enable Page Table Level filter
|
||||
filter_requester_id = "config1:16-31" - Requester ID filter
|
||||
filter_domain = "config1:32-47" - Domain ID filter
|
||||
filter_pasid = "config2:0-21" - PASID filter
|
||||
filter_ats = "config2:24-28" - Address Type filter
|
||||
filter_page_table = "config2:32-36" - Page Table Level filter
|
||||
|
||||
What: /sys/bus/event_source/devices/dmar*/cpumask
|
||||
Date: Jan 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: Kan Liang <kan.liang@linux.intel.com>
|
||||
Description: Read-only. This file always returns the CPU to which the
|
||||
IOMMU pmu is bound for access to all IOMMU pmu performance
|
||||
monitoring events.
|
@@ -276,6 +276,15 @@ Description:
|
||||
|
||||
RW
|
||||
|
||||
What: /sys/class/hwmon/hwmonX/fanY_fault
|
||||
Description:
|
||||
Reports if a fan has reported failure.
|
||||
|
||||
- 1: Failed
|
||||
- 0: Ok
|
||||
|
||||
RO
|
||||
|
||||
What: /sys/class/hwmon/hwmonX/pwmY
|
||||
Description:
|
||||
Pulse width modulation fan control.
|
||||
|
19
Documentation/ABI/testing/sysfs-class-net-peak_usb
Normal file
19
Documentation/ABI/testing/sysfs-class-net-peak_usb
Normal file
@@ -0,0 +1,19 @@
|
||||
|
||||
What: /sys/class/net/<iface>/peak_usb/can_channel_id
|
||||
Date: November 2022
|
||||
KernelVersion: 6.2
|
||||
Contact: Stephane Grosjean <s.grosjean@peak-system.com>
|
||||
Description:
|
||||
PEAK PCAN-USB devices support user-configurable CAN channel
|
||||
identifiers. Contrary to a USB serial number, these identifiers
|
||||
are writable and can be set per CAN interface. This means that
|
||||
if a USB device exports multiple CAN interfaces, each of them
|
||||
can be assigned a unique channel ID.
|
||||
This attribute provides read-only access to the currently
|
||||
configured value of the channel identifier. Depending on the
|
||||
device type, the identifier has a length of 8 or 32 bit. The
|
||||
value read from this attribute is always an 8 digit 32 bit
|
||||
hexadecimal value in big endian format. If the device only
|
||||
supports an 8 bit identifier, the upper 24 bit of the value are
|
||||
set to zero.
|
||||
|
@@ -437,7 +437,8 @@ What: /sys/class/power_supply/<supply_name>/present
|
||||
Date: May 2007
|
||||
Contact: linux-pm@vger.kernel.org
|
||||
Description:
|
||||
Reports whether a battery is present or not in the system.
|
||||
Reports whether a battery is present or not in the system. If the
|
||||
property does not exist, the battery is considered to be present.
|
||||
|
||||
Access: Read
|
||||
|
||||
|
19
Documentation/ABI/testing/sysfs-class-power-rt9467
Normal file
19
Documentation/ABI/testing/sysfs-class-power-rt9467
Normal file
@@ -0,0 +1,19 @@
|
||||
What: /sys/class/power_supply/rt9467-*/sysoff_enable
|
||||
Date: Feb 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: ChiaEn Wu <chiaen_wu@richtek.com>
|
||||
Description:
|
||||
This entry allows enabling the sysoff mode of rt9467 charger
|
||||
devices.
|
||||
If enabled and the input is removed, the internal battery FET
|
||||
is turned off to reduce the leakage from the BAT pin. See
|
||||
device datasheet for details. It's commonly used when the
|
||||
product enter shipping stage. After entering shipping mode,
|
||||
only 'VBUS' or 'Power key" pressed can make it leave this mode.
|
||||
'Disable' also can help to leave it, but it's more like to
|
||||
abort the action before the device really enter shipping mode.
|
||||
|
||||
Access: Read, Write
|
||||
Valid values:
|
||||
- 1: enabled
|
||||
- 0: disabled
|
32
Documentation/ABI/testing/sysfs-class-power-rt9471
Normal file
32
Documentation/ABI/testing/sysfs-class-power-rt9471
Normal file
@@ -0,0 +1,32 @@
|
||||
What: /sys/class/power_supply/rt9471-*/sysoff_enable
|
||||
Date: Feb 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: ChiYuan Huang <cy_huang@richtek.com>
|
||||
Description:
|
||||
This entry allows enabling the sysoff mode of rt9471 charger devices.
|
||||
If enabled and the input is removed, the internal battery FET is turned
|
||||
off to reduce the leakage from the BAT pin. See device datasheet for details.
|
||||
It's commonly used when the product enter shipping stage. After entering
|
||||
shipping mode, only 'VBUS' or 'Power key" pressed can make it leave this
|
||||
mode. 'Disable' also can help to leave it, but it's more like to abort
|
||||
the action before the device really enter shipping mode.
|
||||
|
||||
Access: Read, Write
|
||||
Valid values:
|
||||
- 1: enabled
|
||||
- 0: disabled
|
||||
|
||||
What: /sys/class/power_supply/rt9471-*/port_detect_enable
|
||||
Date: Feb 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: ChiYuan Huang <cy_huang@richtek.com>
|
||||
Description:
|
||||
This entry allows enabling the USB BC12 port detect function of rt9471 charger
|
||||
devices. If enabled and VBUS is inserted, device will start to do the BC12
|
||||
port detect and report the usb port type when port detect is done. See
|
||||
datasheet for details. Normally controlled when TypeC/USBPD port integrated.
|
||||
|
||||
Access: Read, Write
|
||||
Valid values:
|
||||
- 1: enabled
|
||||
- 0: disabled
|
@@ -69,7 +69,7 @@ Description:
|
||||
This file contains boolean value that tells does the device
|
||||
support both source and sink power roles.
|
||||
|
||||
What: /sys/class/usb_power_delivery/.../<capability>/1:fixed_supply/usb_suspend_supported
|
||||
What: /sys/class/usb_power_delivery/.../source-capabilities/1:fixed_supply/usb_suspend_supported
|
||||
Date: May 2022
|
||||
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||
Description:
|
||||
@@ -78,6 +78,15 @@ Description:
|
||||
will follow the USB 2.0 and USB 3.2 rules for suspend and
|
||||
resume.
|
||||
|
||||
What: /sys/class/usb_power_delivery/.../sink-capabilities/1:fixed_supply/higher_capability
|
||||
Date: February 2023
|
||||
Contact: Saranya Gopal <saranya.gopal@linux.intel.com>
|
||||
Description:
|
||||
This file shows the value of the Higher capability bit in
|
||||
vsafe5V Fixed Supply Object. If the bit is set, then the sink
|
||||
needs more than vsafe5V(eg. 12 V) to provide full functionality.
|
||||
Valid values: 0, 1
|
||||
|
||||
What: /sys/class/usb_power_delivery/.../<capability>/1:fixed_supply/unconstrained_power
|
||||
Date: May 2022
|
||||
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||
|
@@ -6,6 +6,19 @@ Description:
|
||||
device at boot. It is equivalent to WDIOC_GETBOOTSTATUS of
|
||||
ioctl interface.
|
||||
|
||||
What: /sys/class/watchdog/watchdogn/options
|
||||
Date: April 2023
|
||||
Contact: Thomas Weißschuh
|
||||
Description:
|
||||
It is a read only file. It contains options of watchdog device.
|
||||
|
||||
What: /sys/class/watchdog/watchdogn/fw_version
|
||||
Date: April 2023
|
||||
Contact: Thomas Weißschuh
|
||||
Description:
|
||||
It is a read only file. It contains firmware version of
|
||||
watchdog device.
|
||||
|
||||
What: /sys/class/watchdog/watchdogn/identity
|
||||
Date: August 2015
|
||||
Contact: Wim Van Sebroeck <wim@iguana.be>
|
||||
|
@@ -201,7 +201,19 @@ What: /sys/class/habanalabs/hl<n>/status
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: ogabbay@kernel.org
|
||||
Description: Status of the card: "Operational", "Malfunction", "In reset".
|
||||
Description: Status of the card:
|
||||
|
||||
* "operational" - Device is available for work.
|
||||
* "in reset" - Device is going through reset, will be
|
||||
available shortly.
|
||||
* "disabled" - Device is not usable.
|
||||
* "needs reset" - Device is not usable until a hard reset
|
||||
is initiated.
|
||||
* "in device creation" - Device is not available yet, as it
|
||||
is still initializing.
|
||||
* "in reset after device release" - Device is going through
|
||||
a compute-reset which is executed after a device release
|
||||
(relevant for Gaudi2 only).
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/thermal_ver
|
||||
Date: Jan 2019
|
||||
|
@@ -1,4 +1,4 @@
|
||||
What: /sys/bus/spi/devices/.../bmc_version
|
||||
What: /sys/bus/.../drivers/intel-m10-bmc/.../bmc_version
|
||||
Date: June 2020
|
||||
KernelVersion: 5.10
|
||||
Contact: Xu Yilun <yilun.xu@intel.com>
|
||||
@@ -6,7 +6,7 @@ Description: Read only. Returns the hardware build version of Intel
|
||||
MAX10 BMC chip.
|
||||
Format: "0x%x".
|
||||
|
||||
What: /sys/bus/spi/devices/.../bmcfw_version
|
||||
What: /sys/bus/.../drivers/intel-m10-bmc/.../bmcfw_version
|
||||
Date: June 2020
|
||||
KernelVersion: 5.10
|
||||
Contact: Xu Yilun <yilun.xu@intel.com>
|
||||
@@ -14,7 +14,7 @@ Description: Read only. Returns the firmware version of Intel MAX10
|
||||
BMC chip.
|
||||
Format: "0x%x".
|
||||
|
||||
What: /sys/bus/spi/devices/.../mac_address
|
||||
What: /sys/bus/.../drivers/intel-m10-bmc/.../mac_address
|
||||
Date: January 2021
|
||||
KernelVersion: 5.12
|
||||
Contact: Russ Weight <russell.h.weight@intel.com>
|
||||
@@ -25,7 +25,7 @@ Description: Read only. Returns the first MAC address in a block
|
||||
space.
|
||||
Format: "%02x:%02x:%02x:%02x:%02x:%02x".
|
||||
|
||||
What: /sys/bus/spi/devices/.../mac_count
|
||||
What: /sys/bus/.../drivers/intel-m10-bmc/.../mac_count
|
||||
Date: January 2021
|
||||
KernelVersion: 5.12
|
||||
Contact: Russ Weight <russell.h.weight@intel.com>
|
||||
|
@@ -1,6 +1,6 @@
|
||||
What: /sys/bus/pci/devices/<BDF>/qat/state
|
||||
Date: June 2022
|
||||
KernelVersion: 5.20
|
||||
KernelVersion: 6.0
|
||||
Contact: qat-linux@intel.com
|
||||
Description: (RW) Reports the current state of the QAT device. Write to
|
||||
the file to start or stop the device.
|
||||
@@ -18,7 +18,7 @@ Description: (RW) Reports the current state of the QAT device. Write to
|
||||
|
||||
What: /sys/bus/pci/devices/<BDF>/qat/cfg_services
|
||||
Date: June 2022
|
||||
KernelVersion: 5.20
|
||||
KernelVersion: 6.0
|
||||
Contact: qat-linux@intel.com
|
||||
Description: (RW) Reports the current configuration of the QAT device.
|
||||
Write to the file to change the configured services.
|
||||
|
@@ -47,3 +47,18 @@ Description:
|
||||
USB SuperSpeed protocol. From user perspective pin assignments C
|
||||
and E are equal, where all channels on the connector are used
|
||||
for carrying DisplayPort protocol (allowing higher resolutions).
|
||||
|
||||
What: /sys/bus/typec/devices/.../displayport/hpd
|
||||
Date: Dec 2022
|
||||
Contact: Badhri Jagan Sridharan <badhri@google.com>
|
||||
Description:
|
||||
VESA DisplayPort Alt Mode on USB Type-C Standard defines how
|
||||
HotPlugDetect(HPD) shall be supported on the USB-C connector when
|
||||
operating in DisplayPort Alt Mode. This is a read only node which
|
||||
reflects the current state of HPD.
|
||||
|
||||
Valid values:
|
||||
- 1: when HPD’s logical state is high (HPD_High) as defined
|
||||
by VESA DisplayPort Alt Mode on USB Type-C Standard.
|
||||
- 0 when HPD’s logical state is low (HPD_Low) as defined by
|
||||
VESA DisplayPort Alt Mode on USB Type-C Standard.
|
||||
|
@@ -19,6 +19,24 @@ Contact: linux-accelerators@lists.ozlabs.org
|
||||
Description: Available instances left of the device
|
||||
Return -ENODEV if uacce_ops get_available_instances is not provided
|
||||
|
||||
What: /sys/class/uacce/<dev_name>/isolate_strategy
|
||||
Date: Nov 2022
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-accelerators@lists.ozlabs.org
|
||||
Description: (RW) A sysfs node that configure the error threshold for the hardware
|
||||
isolation strategy. This size is a configured integer value, which is the
|
||||
number of threshold for hardware errors occurred in one hour. The default is 0.
|
||||
0 means never isolate the device. The maximum value is 65535. You can write
|
||||
a number of threshold based on your hardware.
|
||||
|
||||
What: /sys/class/uacce/<dev_name>/isolate
|
||||
Date: Nov 2022
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-accelerators@lists.ozlabs.org
|
||||
Description: (R) A sysfs node that read the device isolated state. The value 1
|
||||
means the device is unavailable. The 0 means the device is
|
||||
available.
|
||||
|
||||
What: /sys/class/uacce/<dev_name>/algorithms
|
||||
Date: Feb 2020
|
||||
KernelVersion: 5.7
|
||||
|
16
Documentation/ABI/testing/sysfs-driver-xilinx-tmr-manager
Normal file
16
Documentation/ABI/testing/sysfs-driver-xilinx-tmr-manager
Normal file
@@ -0,0 +1,16 @@
|
||||
What: /sys/devices/platform/amba_pl/<dev>/errcnt
|
||||
Date: Nov 2022
|
||||
Contact: appana.durga.kedareswara.rao@amd.com
|
||||
Description: This control file provides the fault detection count.
|
||||
This file cannot be written.
|
||||
Example:
|
||||
# cat /sys/devices/platform/amba_pl/44a10000.tmr_manager/errcnt
|
||||
1
|
||||
|
||||
What: /sys/devices/platform/amba_pl/<dev>/dis_block_break
|
||||
Date: Nov 2022
|
||||
Contact: appana.durga.kedareswara.rao@amd.com
|
||||
Description: Write any value to it, This control file enables the break signal.
|
||||
This file is write only.
|
||||
Example:
|
||||
# echo <any value> > /sys/devices/platform/amba_pl/44a10000.tmr_manager/dis_block_break
|
@@ -4,7 +4,8 @@ Contact: "Huang Jianan" <huangjianan@oppo.com>
|
||||
Description: Shows all enabled kernel features.
|
||||
Supported features:
|
||||
zero_padding, compr_cfgs, big_pcluster, chunked_file,
|
||||
device_table, compr_head2, sb_chksum.
|
||||
device_table, compr_head2, sb_chksum, ztailpacking,
|
||||
dedupe, fragments.
|
||||
|
||||
What: /sys/fs/erofs/<disk>/sync_decompress
|
||||
Date: November 2021
|
||||
|
@@ -49,16 +49,23 @@ Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||
Description: Controls the in-place-update policy.
|
||||
updates in f2fs. User can set:
|
||||
|
||||
==== =================
|
||||
0x01 F2FS_IPU_FORCE
|
||||
0x02 F2FS_IPU_SSR
|
||||
0x04 F2FS_IPU_UTIL
|
||||
0x08 F2FS_IPU_SSR_UTIL
|
||||
0x10 F2FS_IPU_FSYNC
|
||||
0x20 F2FS_IPU_ASYNC
|
||||
0x40 F2FS_IPU_NOCACHE
|
||||
0x80 F2FS_IPU_HONOR_OPU_WRITE
|
||||
==== =================
|
||||
===== =============== ===================================================
|
||||
value policy description
|
||||
0x00 DISABLE disable IPU(=default option in LFS mode)
|
||||
0x01 FORCE all the time
|
||||
0x02 SSR if SSR mode is activated
|
||||
0x04 UTIL if FS utilization is over threashold
|
||||
0x08 SSR_UTIL if SSR mode is activated and FS utilization is over
|
||||
threashold
|
||||
0x10 FSYNC activated in fsync path only for high performance
|
||||
flash storages. IPU will be triggered only if the
|
||||
# of dirty pages over min_fsync_blocks.
|
||||
(=default option)
|
||||
0x20 ASYNC do IPU given by asynchronous write requests
|
||||
0x40 NOCACHE disable IPU bio cache
|
||||
0x80 HONOR_OPU_WRITE use OPU write prior to IPU write if inode has
|
||||
FI_OPU_WRITE flag
|
||||
===== =============== ===================================================
|
||||
|
||||
Refer segment.h for details.
|
||||
|
||||
@@ -669,3 +676,56 @@ Contact: "Ping Xiong" <xiongping1@xiaomi.com>
|
||||
Description: When DATA SEPARATION is on, it controls the age threshold to indicate
|
||||
the data blocks as warm. By default it was initialized as 2621440 blocks
|
||||
(equals to 10GB).
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/fault_rate
|
||||
Date: May 2016
|
||||
Contact: "Sheng Yong" <shengyong@oppo.com>
|
||||
Contact: "Chao Yu" <chao@kernel.org>
|
||||
Description: Enable fault injection in all supported types with
|
||||
specified injection rate.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/fault_type
|
||||
Date: May 2016
|
||||
Contact: "Sheng Yong" <shengyong@oppo.com>
|
||||
Contact: "Chao Yu" <chao@kernel.org>
|
||||
Description: Support configuring fault injection type, should be
|
||||
enabled with fault_injection option, fault type value
|
||||
is shown below, it supports single or combined type.
|
||||
|
||||
=================== ===========
|
||||
Type_Name Type_Value
|
||||
=================== ===========
|
||||
FAULT_KMALLOC 0x000000001
|
||||
FAULT_KVMALLOC 0x000000002
|
||||
FAULT_PAGE_ALLOC 0x000000004
|
||||
FAULT_PAGE_GET 0x000000008
|
||||
FAULT_ALLOC_BIO 0x000000010 (obsolete)
|
||||
FAULT_ALLOC_NID 0x000000020
|
||||
FAULT_ORPHAN 0x000000040
|
||||
FAULT_BLOCK 0x000000080
|
||||
FAULT_DIR_DEPTH 0x000000100
|
||||
FAULT_EVICT_INODE 0x000000200
|
||||
FAULT_TRUNCATE 0x000000400
|
||||
FAULT_READ_IO 0x000000800
|
||||
FAULT_CHECKPOINT 0x000001000
|
||||
FAULT_DISCARD 0x000002000
|
||||
FAULT_WRITE_IO 0x000004000
|
||||
FAULT_SLAB_ALLOC 0x000008000
|
||||
FAULT_DQUOT_INIT 0x000010000
|
||||
FAULT_LOCK_OP 0x000020000
|
||||
FAULT_BLKADDR 0x000040000
|
||||
=================== ===========
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/discard_io_aware_gran
|
||||
Date: January 2023
|
||||
Contact: "Yangtao Li" <frank.li@vivo.com>
|
||||
Description: Controls background discard granularity of inner discard thread
|
||||
when is not in idle. Inner thread will not issue discards with size that
|
||||
is smaller than granularity. The unit size is one block(4KB), now only
|
||||
support configuring in range of [0, 512].
|
||||
Default: 512
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/last_age_weight
|
||||
Date: January 2023
|
||||
Contact: "Ping Xiong" <xiongping1@xiaomi.com>
|
||||
Description: When DATA SEPARATION is on, it controls the weight of last data block age.
|
||||
|
10
Documentation/ABI/testing/sysfs-kernel-address_bits
Normal file
10
Documentation/ABI/testing/sysfs-kernel-address_bits
Normal file
@@ -0,0 +1,10 @@
|
||||
What: /sys/kernel/address_bit
|
||||
Date: May 2023
|
||||
KernelVersion: 6.3
|
||||
Contact: Thomas Weißschuh <linux@weissschuh.net>
|
||||
Description:
|
||||
The address size of the running kernel in bits.
|
||||
|
||||
Access: Read
|
||||
|
||||
Users: util-linux
|
@@ -258,6 +258,35 @@ Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the low
|
||||
watermark of the scheme in permil.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/nr_filters
|
||||
Date: Dec 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing a number 'N' to this file creates the number of
|
||||
directories for setting filters of the scheme named '0' to
|
||||
'N-1' under the filters/ directory.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/type
|
||||
Date: Dec 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing to and reading from this file sets and gets the type of
|
||||
the memory of the interest. 'anon' for anonymous pages, or
|
||||
'memcg' for specific memory cgroup can be written and read.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/memcg_path
|
||||
Date: Dec 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: If 'memcg' is written to the 'type' file, writing to and
|
||||
reading from this file sets and gets the path to the memory
|
||||
cgroup of the interest.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/matching
|
||||
Date: Dec 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
Description: Writing 'Y' or 'N' to this file sets whether to filter out
|
||||
pages that do or do not match to the 'type' and 'memcg_path',
|
||||
respectively. Filter out means the action of the scheme will
|
||||
not be applied to.
|
||||
|
||||
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/nr_tried
|
||||
Date: Mar 2022
|
||||
Contact: SeongJae Park <sj@kernel.org>
|
||||
|
@@ -18,6 +18,14 @@ Description: A string indicating which backend is in use by the firmware.
|
||||
This determines the format of the variable and the accepted
|
||||
format of variable updates.
|
||||
|
||||
On powernv/OPAL, this value is provided by the OPAL firmware
|
||||
and is expected to be "ibm,edk2-compat-v1".
|
||||
|
||||
On pseries/PLPKS, this is generated by the kernel based on the
|
||||
version number in the SB_VERSION variable in the keystore, and
|
||||
has the form "ibm,plpks-sb-v<version>", or
|
||||
"ibm,plpks-sb-unknown" if there is no SB_VERSION variable.
|
||||
|
||||
What: /sys/firmware/secvar/vars/<variable name>
|
||||
Date: August 2019
|
||||
Contact: Nayna Jain <nayna@linux.ibm.com>
|
||||
@@ -34,7 +42,7 @@ Description: An integer representation of the size of the content of the
|
||||
|
||||
What: /sys/firmware/secvar/vars/<variable_name>/data
|
||||
Date: August 2019
|
||||
Contact: Nayna Jain h<nayna@linux.ibm.com>
|
||||
Contact: Nayna Jain <nayna@linux.ibm.com>
|
||||
Description: A read-only file containing the value of the variable. The size
|
||||
of the file represents the maximum size of the variable data.
|
||||
|
||||
@@ -44,3 +52,68 @@ Contact: Nayna Jain <nayna@linux.ibm.com>
|
||||
Description: A write-only file that is used to submit the new value for the
|
||||
variable. The size of the file represents the maximum size of
|
||||
the variable data that can be written.
|
||||
|
||||
What: /sys/firmware/secvar/config
|
||||
Date: February 2023
|
||||
Contact: Nayna Jain <nayna@linux.ibm.com>
|
||||
Description: This optional directory contains read-only config attributes as
|
||||
defined by the secure variable implementation. All data is in
|
||||
ASCII format. The directory is only created if the backing
|
||||
implementation provides variables to populate it, which at
|
||||
present is only PLPKS on the pseries platform.
|
||||
|
||||
What: /sys/firmware/secvar/config/version
|
||||
Date: February 2023
|
||||
Contact: Nayna Jain <nayna@linux.ibm.com>
|
||||
Description: Config version as reported by the hypervisor in ASCII decimal
|
||||
format.
|
||||
|
||||
Currently only provided by PLPKS on the pseries platform.
|
||||
|
||||
What: /sys/firmware/secvar/config/max_object_size
|
||||
Date: February 2023
|
||||
Contact: Nayna Jain <nayna@linux.ibm.com>
|
||||
Description: Maximum allowed size of objects in the keystore in bytes,
|
||||
represented in ASCII decimal format.
|
||||
|
||||
This is not necessarily the same as the max size that can be
|
||||
written to an update file as writes can contain more than
|
||||
object data, you should use the size of the update file for
|
||||
that purpose.
|
||||
|
||||
Currently only provided by PLPKS on the pseries platform.
|
||||
|
||||
What: /sys/firmware/secvar/config/total_size
|
||||
Date: February 2023
|
||||
Contact: Nayna Jain <nayna@linux.ibm.com>
|
||||
Description: Total size of the PLPKS in bytes, represented in ASCII decimal
|
||||
format.
|
||||
|
||||
Currently only provided by PLPKS on the pseries platform.
|
||||
|
||||
What: /sys/firmware/secvar/config/used_space
|
||||
Date: February 2023
|
||||
Contact: Nayna Jain <nayna@linux.ibm.com>
|
||||
Description: Current space consumed by the key store, in bytes, represented
|
||||
in ASCII decimal format.
|
||||
|
||||
Currently only provided by PLPKS on the pseries platform.
|
||||
|
||||
What: /sys/firmware/secvar/config/supported_policies
|
||||
Date: February 2023
|
||||
Contact: Nayna Jain <nayna@linux.ibm.com>
|
||||
Description: Bitmask of supported policy flags by the hypervisor,
|
||||
represented as an 8 byte hexadecimal ASCII string. Consult the
|
||||
hypervisor documentation for what these flags are.
|
||||
|
||||
Currently only provided by PLPKS on the pseries platform.
|
||||
|
||||
What: /sys/firmware/secvar/config/signed_update_algorithms
|
||||
Date: February 2023
|
||||
Contact: Nayna Jain <nayna@linux.ibm.com>
|
||||
Description: Bitmask of flags indicating which algorithms the hypervisor
|
||||
supports for signed update of objects, represented as a 16 byte
|
||||
hexadecimal ASCII string. Consult the hypervisor documentation
|
||||
for what these flags mean.
|
||||
|
||||
Currently only provided by PLPKS on the pseries platform.
|
||||
|
@@ -1,6 +1,9 @@
|
||||
if COMPILE_TEST
|
||||
|
||||
menu "Documentation"
|
||||
|
||||
config WARN_MISSING_DOCUMENTS
|
||||
bool "Warn if there's a missing documentation file"
|
||||
depends on COMPILE_TEST
|
||||
help
|
||||
It is not uncommon that a document gets renamed.
|
||||
This option makes the Kernel to check for missing dependencies,
|
||||
@@ -11,7 +14,6 @@ config WARN_MISSING_DOCUMENTS
|
||||
|
||||
config WARN_ABI_ERRORS
|
||||
bool "Warn if there are errors at ABI files"
|
||||
depends on COMPILE_TEST
|
||||
help
|
||||
The files under Documentation/ABI should follow what's
|
||||
described at Documentation/ABI/README. Yet, as they're manually
|
||||
@@ -20,3 +22,7 @@ config WARN_ABI_ERRORS
|
||||
scripts/get_abi.pl. Add a check to verify them.
|
||||
|
||||
If unsure, select 'N'.
|
||||
|
||||
endmenu
|
||||
|
||||
endif
|
||||
|
@@ -28,7 +28,7 @@ BUILDDIR = $(obj)/output
|
||||
PDFLATEX = xelatex
|
||||
LATEXOPTS = -interaction=batchmode -no-shell-escape
|
||||
|
||||
ifeq ($(KBUILD_VERBOSE),0)
|
||||
ifeq ($(findstring 1, $(KBUILD_VERBOSE)),)
|
||||
SPHINXOPTS += "-q"
|
||||
endif
|
||||
|
||||
|
@@ -1,8 +1,8 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=======================
|
||||
Linux PCI Bus Subsystem
|
||||
=======================
|
||||
=================
|
||||
PCI Bus Subsystem
|
||||
=================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
@@ -8,7 +8,7 @@ Although RCU is usually used to protect read-mostly data structures,
|
||||
it is possible to use RCU to provide dynamic non-maskable interrupt
|
||||
handlers, as well as dynamic irq handlers. This document describes
|
||||
how to do this, drawing loosely from Zwane Mwaikambo's NMI-timer
|
||||
work in "arch/x86/kernel/traps.c".
|
||||
work in an old version of "arch/x86/kernel/traps.c".
|
||||
|
||||
The relevant pieces of code are listed below, each followed by a
|
||||
brief explanation::
|
||||
@@ -116,7 +116,7 @@ Answer to Quick Quiz:
|
||||
|
||||
This same sad story can happen on other CPUs when using
|
||||
a compiler with aggressive pointer-value speculation
|
||||
optimizations.
|
||||
optimizations. (But please don't!)
|
||||
|
||||
More important, the rcu_dereference_sched() makes it
|
||||
clear to someone reading the code that the pointer is
|
||||
|
@@ -38,7 +38,7 @@ by having call_rcu() directly invoke its arguments only if it was called
|
||||
from process context. However, this can fail in a similar manner.
|
||||
|
||||
Suppose that an RCU-based algorithm again scans a linked list containing
|
||||
elements A, B, and C in process contexts, but that it invokes a function
|
||||
elements A, B, and C in process context, but that it invokes a function
|
||||
on each element as it is scanned. Suppose further that this function
|
||||
deletes element B from the list, then passes it to call_rcu() for deferred
|
||||
freeing. This may be a bit unconventional, but it is perfectly legal
|
||||
@@ -59,7 +59,8 @@ Example 3: Death by Deadlock
|
||||
Suppose that call_rcu() is invoked while holding a lock, and that the
|
||||
callback function must acquire this same lock. In this case, if
|
||||
call_rcu() were to directly invoke the callback, the result would
|
||||
be self-deadlock.
|
||||
be self-deadlock *even if* this invocation occurred from a later
|
||||
call_rcu() invocation a full grace period later.
|
||||
|
||||
In some cases, it would possible to restructure to code so that
|
||||
the call_rcu() is delayed until after the lock is released. However,
|
||||
@@ -85,6 +86,14 @@ Quick Quiz #2:
|
||||
|
||||
:ref:`Answers to Quick Quiz <answer_quick_quiz_up>`
|
||||
|
||||
It is important to note that userspace RCU implementations *do*
|
||||
permit call_rcu() to directly invoke callbacks, but only if a full
|
||||
grace period has elapsed since those callbacks were queued. This is
|
||||
the case because some userspace environments are extremely constrained.
|
||||
Nevertheless, people writing userspace RCU implementations are strongly
|
||||
encouraged to avoid invoking callbacks from call_rcu(), thus obtaining
|
||||
the deadlock-avoidance benefits called out above.
|
||||
|
||||
Summary
|
||||
-------
|
||||
|
||||
|
@@ -69,9 +69,8 @@ checking of rcu_dereference() primitives:
|
||||
value of the pointer itself, for example, against NULL.
|
||||
|
||||
The rcu_dereference_check() check expression can be any boolean
|
||||
expression, but would normally include a lockdep expression. However,
|
||||
any boolean expression can be used. For a moderately ornate example,
|
||||
consider the following::
|
||||
expression, but would normally include a lockdep expression. For a
|
||||
moderately ornate example, consider the following::
|
||||
|
||||
file = rcu_dereference_check(fdt->fd[fd],
|
||||
lockdep_is_held(&files->file_lock) ||
|
||||
@@ -97,10 +96,10 @@ code, it could instead be written as follows::
|
||||
atomic_read(&files->count) == 1);
|
||||
|
||||
This would verify cases #2 and #3 above, and furthermore lockdep would
|
||||
complain if this was used in an RCU read-side critical section unless one
|
||||
of these two cases held. Because rcu_dereference_protected() omits all
|
||||
barriers and compiler constraints, it generates better code than do the
|
||||
other flavors of rcu_dereference(). On the other hand, it is illegal
|
||||
complain even if this was used in an RCU read-side critical section unless
|
||||
one of these two cases held. Because rcu_dereference_protected() omits
|
||||
all barriers and compiler constraints, it generates better code than do
|
||||
the other flavors of rcu_dereference(). On the other hand, it is illegal
|
||||
to use rcu_dereference_protected() if either the RCU-protected pointer
|
||||
or the RCU-protected data that it points to can change concurrently.
|
||||
|
||||
|
@@ -77,15 +77,17 @@ Frequently Asked Questions
|
||||
search for the string "Patent" in Documentation/RCU/RTFP.txt to find them.
|
||||
Of these, one was allowed to lapse by the assignee, and the
|
||||
others have been contributed to the Linux kernel under GPL.
|
||||
Many (but not all) have long since expired.
|
||||
There are now also LGPL implementations of user-level RCU
|
||||
available (https://liburcu.org/).
|
||||
|
||||
- I hear that RCU needs work in order to support realtime kernels?
|
||||
|
||||
Realtime-friendly RCU can be enabled via the CONFIG_PREEMPT_RCU
|
||||
Realtime-friendly RCU are enabled via the CONFIG_PREEMPTION
|
||||
kernel configuration parameter.
|
||||
|
||||
- Where can I find more information on RCU?
|
||||
|
||||
See the Documentation/RCU/RTFP.txt file.
|
||||
Or point your browser at (http://www.rdrop.com/users/paulmck/RCU/).
|
||||
Or point your browser at (https://docs.google.com/document/d/1X0lThx8OK0ZgLMqVoXiR4ZrGURHrXK6NyLRbeXe3Xac/edit)
|
||||
or (https://docs.google.com/document/d/1GCdQC8SDbb54W1shjEXqGZ0Rq8a6kIeYutdSIajfpLA/edit?usp=sharing).
|
||||
|
@@ -19,8 +19,9 @@ Follow these rules to keep your RCU code working properly:
|
||||
can reload the value, and won't your code have fun with two
|
||||
different values for a single pointer! Without rcu_dereference(),
|
||||
DEC Alpha can load a pointer, dereference that pointer, and
|
||||
return data preceding initialization that preceded the store of
|
||||
the pointer.
|
||||
return data preceding initialization that preceded the store
|
||||
of the pointer. (As noted later, in recent kernels READ_ONCE()
|
||||
also prevents DEC Alpha from playing these tricks.)
|
||||
|
||||
In addition, the volatile cast in rcu_dereference() prevents the
|
||||
compiler from deducing the resulting pointer value. Please see
|
||||
@@ -34,7 +35,7 @@ Follow these rules to keep your RCU code working properly:
|
||||
takes on the role of the lockless_dereference() primitive that
|
||||
was removed in v4.15.
|
||||
|
||||
- You are only permitted to use rcu_dereference on pointer values.
|
||||
- You are only permitted to use rcu_dereference() on pointer values.
|
||||
The compiler simply knows too much about integral values to
|
||||
trust it to carry dependencies through integer operations.
|
||||
There are a very few exceptions, namely that you can temporarily
|
||||
@@ -240,6 +241,7 @@ precautions. To see this, consider the following code fragment::
|
||||
struct foo *q;
|
||||
int r1, r2;
|
||||
|
||||
rcu_read_lock();
|
||||
p = rcu_dereference(gp2);
|
||||
if (p == NULL)
|
||||
return;
|
||||
@@ -248,7 +250,10 @@ precautions. To see this, consider the following code fragment::
|
||||
if (p == q) {
|
||||
/* The compiler decides that q->c is same as p->c. */
|
||||
r2 = p->c; /* Could get 44 on weakly order system. */
|
||||
} else {
|
||||
r2 = p->c - r1; /* Unconditional access to p->c. */
|
||||
}
|
||||
rcu_read_unlock();
|
||||
do_something_with(r1, r2);
|
||||
}
|
||||
|
||||
@@ -297,6 +302,7 @@ Then one approach is to use locking, for example, as follows::
|
||||
struct foo *q;
|
||||
int r1, r2;
|
||||
|
||||
rcu_read_lock();
|
||||
p = rcu_dereference(gp2);
|
||||
if (p == NULL)
|
||||
return;
|
||||
@@ -306,7 +312,12 @@ Then one approach is to use locking, for example, as follows::
|
||||
if (p == q) {
|
||||
/* The compiler decides that q->c is same as p->c. */
|
||||
r2 = p->c; /* Locking guarantees r2 == 144. */
|
||||
} else {
|
||||
spin_lock(&q->lock);
|
||||
r2 = q->c - r1;
|
||||
spin_unlock(&q->lock);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
spin_unlock(&p->lock);
|
||||
do_something_with(r1, r2);
|
||||
}
|
||||
@@ -364,7 +375,7 @@ the exact value of "p" even in the not-equals case. This allows the
|
||||
compiler to make the return values independent of the load from "gp",
|
||||
in turn destroying the ordering between this load and the loads of the
|
||||
return values. This can result in "p->b" returning pre-initialization
|
||||
garbage values.
|
||||
garbage values on weakly ordered systems.
|
||||
|
||||
In short, rcu_dereference() is *not* optional when you are going to
|
||||
dereference the resulting pointer.
|
||||
@@ -430,7 +441,7 @@ member of the rcu_dereference() to use in various situations:
|
||||
SPARSE CHECKING OF RCU-PROTECTED POINTERS
|
||||
-----------------------------------------
|
||||
|
||||
The sparse static-analysis tool checks for direct access to RCU-protected
|
||||
The sparse static-analysis tool checks for non-RCU access to RCU-protected
|
||||
pointers, which can result in "interesting" bugs due to compiler
|
||||
optimizations involving invented loads and perhaps also load tearing.
|
||||
For example, suppose someone mistakenly does something like this::
|
||||
|
@@ -5,37 +5,12 @@ RCU and Unloadable Modules
|
||||
|
||||
[Originally published in LWN Jan. 14, 2007: http://lwn.net/Articles/217484/]
|
||||
|
||||
RCU (read-copy update) is a synchronization mechanism that can be thought
|
||||
of as a replacement for read-writer locking (among other things), but with
|
||||
very low-overhead readers that are immune to deadlock, priority inversion,
|
||||
and unbounded latency. RCU read-side critical sections are delimited
|
||||
by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPTION
|
||||
kernels, generate no code whatsoever.
|
||||
|
||||
This means that RCU writers are unaware of the presence of concurrent
|
||||
readers, so that RCU updates to shared data must be undertaken quite
|
||||
carefully, leaving an old version of the data structure in place until all
|
||||
pre-existing readers have finished. These old versions are needed because
|
||||
such readers might hold a reference to them. RCU updates can therefore be
|
||||
rather expensive, and RCU is thus best suited for read-mostly situations.
|
||||
|
||||
How can an RCU writer possibly determine when all readers are finished,
|
||||
given that readers might well leave absolutely no trace of their
|
||||
presence? There is a synchronize_rcu() primitive that blocks until all
|
||||
pre-existing readers have completed. An updater wishing to delete an
|
||||
element p from a linked list might do the following, while holding an
|
||||
appropriate lock, of course::
|
||||
|
||||
list_del_rcu(p);
|
||||
synchronize_rcu();
|
||||
kfree(p);
|
||||
|
||||
But the above code cannot be used in IRQ context -- the call_rcu()
|
||||
primitive must be used instead. This primitive takes a pointer to an
|
||||
rcu_head struct placed within the RCU-protected data structure and
|
||||
another pointer to a function that may be invoked later to free that
|
||||
structure. Code to delete an element p from the linked list from IRQ
|
||||
context might then be as follows::
|
||||
RCU updaters sometimes use call_rcu() to initiate an asynchronous wait for
|
||||
a grace period to elapse. This primitive takes a pointer to an rcu_head
|
||||
struct placed within the RCU-protected data structure and another pointer
|
||||
to a function that may be invoked later to free that structure. Code to
|
||||
delete an element p from the linked list from IRQ context might then be
|
||||
as follows::
|
||||
|
||||
list_del_rcu(p);
|
||||
call_rcu(&p->rcu, p_callback);
|
||||
@@ -54,7 +29,7 @@ IRQ context. The function p_callback() might be defined as follows::
|
||||
Unloading Modules That Use call_rcu()
|
||||
-------------------------------------
|
||||
|
||||
But what if p_callback is defined in an unloadable module?
|
||||
But what if the p_callback() function is defined in an unloadable module?
|
||||
|
||||
If we unload the module while some RCU callbacks are pending,
|
||||
the CPUs executing these callbacks are going to be severely
|
||||
@@ -67,20 +42,21 @@ grace period to elapse, it does not wait for the callbacks to complete.
|
||||
|
||||
One might be tempted to try several back-to-back synchronize_rcu()
|
||||
calls, but this is still not guaranteed to work. If there is a very
|
||||
heavy RCU-callback load, then some of the callbacks might be deferred
|
||||
in order to allow other processing to proceed. Such deferral is required
|
||||
in realtime kernels in order to avoid excessive scheduling latencies.
|
||||
heavy RCU-callback load, then some of the callbacks might be deferred in
|
||||
order to allow other processing to proceed. For but one example, such
|
||||
deferral is required in realtime kernels in order to avoid excessive
|
||||
scheduling latencies.
|
||||
|
||||
|
||||
rcu_barrier()
|
||||
-------------
|
||||
|
||||
We instead need the rcu_barrier() primitive. Rather than waiting for
|
||||
a grace period to elapse, rcu_barrier() waits for all outstanding RCU
|
||||
callbacks to complete. Please note that rcu_barrier() does **not** imply
|
||||
synchronize_rcu(), in particular, if there are no RCU callbacks queued
|
||||
anywhere, rcu_barrier() is within its rights to return immediately,
|
||||
without waiting for a grace period to elapse.
|
||||
This situation can be handled by the rcu_barrier() primitive. Rather
|
||||
than waiting for a grace period to elapse, rcu_barrier() waits for all
|
||||
outstanding RCU callbacks to complete. Please note that rcu_barrier()
|
||||
does **not** imply synchronize_rcu(), in particular, if there are no RCU
|
||||
callbacks queued anywhere, rcu_barrier() is within its rights to return
|
||||
immediately, without waiting for anything, let alone a grace period.
|
||||
|
||||
Pseudo-code using rcu_barrier() is as follows:
|
||||
|
||||
@@ -89,83 +65,86 @@ Pseudo-code using rcu_barrier() is as follows:
|
||||
3. Allow the module to be unloaded.
|
||||
|
||||
There is also an srcu_barrier() function for SRCU, and you of course
|
||||
must match the flavor of rcu_barrier() with that of call_rcu(). If your
|
||||
module uses multiple flavors of call_rcu(), then it must also use multiple
|
||||
flavors of rcu_barrier() when unloading that module. For example, if
|
||||
it uses call_rcu(), call_srcu() on srcu_struct_1, and call_srcu() on
|
||||
srcu_struct_2, then the following three lines of code will be required
|
||||
when unloading::
|
||||
must match the flavor of srcu_barrier() with that of call_srcu().
|
||||
If your module uses multiple srcu_struct structures, then it must also
|
||||
use multiple invocations of srcu_barrier() when unloading that module.
|
||||
For example, if it uses call_rcu(), call_srcu() on srcu_struct_1, and
|
||||
call_srcu() on srcu_struct_2, then the following three lines of code
|
||||
will be required when unloading::
|
||||
|
||||
1 rcu_barrier();
|
||||
2 srcu_barrier(&srcu_struct_1);
|
||||
3 srcu_barrier(&srcu_struct_2);
|
||||
1 rcu_barrier();
|
||||
2 srcu_barrier(&srcu_struct_1);
|
||||
3 srcu_barrier(&srcu_struct_2);
|
||||
|
||||
The rcutorture module makes use of rcu_barrier() in its exit function
|
||||
as follows::
|
||||
If latency is of the essence, workqueues could be used to run these
|
||||
three functions concurrently.
|
||||
|
||||
1 static void
|
||||
2 rcu_torture_cleanup(void)
|
||||
3 {
|
||||
4 int i;
|
||||
5
|
||||
6 fullstop = 1;
|
||||
7 if (shuffler_task != NULL) {
|
||||
8 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
|
||||
9 kthread_stop(shuffler_task);
|
||||
10 }
|
||||
11 shuffler_task = NULL;
|
||||
An ancient version of the rcutorture module makes use of rcu_barrier()
|
||||
in its exit function as follows::
|
||||
|
||||
1 static void
|
||||
2 rcu_torture_cleanup(void)
|
||||
3 {
|
||||
4 int i;
|
||||
5
|
||||
6 fullstop = 1;
|
||||
7 if (shuffler_task != NULL) {
|
||||
8 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
|
||||
9 kthread_stop(shuffler_task);
|
||||
10 }
|
||||
11 shuffler_task = NULL;
|
||||
12
|
||||
13 if (writer_task != NULL) {
|
||||
14 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
|
||||
15 kthread_stop(writer_task);
|
||||
16 }
|
||||
17 writer_task = NULL;
|
||||
13 if (writer_task != NULL) {
|
||||
14 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
|
||||
15 kthread_stop(writer_task);
|
||||
16 }
|
||||
17 writer_task = NULL;
|
||||
18
|
||||
19 if (reader_tasks != NULL) {
|
||||
20 for (i = 0; i < nrealreaders; i++) {
|
||||
21 if (reader_tasks[i] != NULL) {
|
||||
22 VERBOSE_PRINTK_STRING(
|
||||
23 "Stopping rcu_torture_reader task");
|
||||
24 kthread_stop(reader_tasks[i]);
|
||||
25 }
|
||||
26 reader_tasks[i] = NULL;
|
||||
27 }
|
||||
28 kfree(reader_tasks);
|
||||
29 reader_tasks = NULL;
|
||||
30 }
|
||||
31 rcu_torture_current = NULL;
|
||||
19 if (reader_tasks != NULL) {
|
||||
20 for (i = 0; i < nrealreaders; i++) {
|
||||
21 if (reader_tasks[i] != NULL) {
|
||||
22 VERBOSE_PRINTK_STRING(
|
||||
23 "Stopping rcu_torture_reader task");
|
||||
24 kthread_stop(reader_tasks[i]);
|
||||
25 }
|
||||
26 reader_tasks[i] = NULL;
|
||||
27 }
|
||||
28 kfree(reader_tasks);
|
||||
29 reader_tasks = NULL;
|
||||
30 }
|
||||
31 rcu_torture_current = NULL;
|
||||
32
|
||||
33 if (fakewriter_tasks != NULL) {
|
||||
34 for (i = 0; i < nfakewriters; i++) {
|
||||
35 if (fakewriter_tasks[i] != NULL) {
|
||||
36 VERBOSE_PRINTK_STRING(
|
||||
37 "Stopping rcu_torture_fakewriter task");
|
||||
38 kthread_stop(fakewriter_tasks[i]);
|
||||
39 }
|
||||
40 fakewriter_tasks[i] = NULL;
|
||||
41 }
|
||||
42 kfree(fakewriter_tasks);
|
||||
43 fakewriter_tasks = NULL;
|
||||
44 }
|
||||
33 if (fakewriter_tasks != NULL) {
|
||||
34 for (i = 0; i < nfakewriters; i++) {
|
||||
35 if (fakewriter_tasks[i] != NULL) {
|
||||
36 VERBOSE_PRINTK_STRING(
|
||||
37 "Stopping rcu_torture_fakewriter task");
|
||||
38 kthread_stop(fakewriter_tasks[i]);
|
||||
39 }
|
||||
40 fakewriter_tasks[i] = NULL;
|
||||
41 }
|
||||
42 kfree(fakewriter_tasks);
|
||||
43 fakewriter_tasks = NULL;
|
||||
44 }
|
||||
45
|
||||
46 if (stats_task != NULL) {
|
||||
47 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
|
||||
48 kthread_stop(stats_task);
|
||||
49 }
|
||||
50 stats_task = NULL;
|
||||
46 if (stats_task != NULL) {
|
||||
47 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
|
||||
48 kthread_stop(stats_task);
|
||||
49 }
|
||||
50 stats_task = NULL;
|
||||
51
|
||||
52 /* Wait for all RCU callbacks to fire. */
|
||||
53 rcu_barrier();
|
||||
52 /* Wait for all RCU callbacks to fire. */
|
||||
53 rcu_barrier();
|
||||
54
|
||||
55 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
|
||||
55 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
|
||||
56
|
||||
57 if (cur_ops->cleanup != NULL)
|
||||
58 cur_ops->cleanup();
|
||||
59 if (atomic_read(&n_rcu_torture_error))
|
||||
60 rcu_torture_print_module_parms("End of test: FAILURE");
|
||||
61 else
|
||||
62 rcu_torture_print_module_parms("End of test: SUCCESS");
|
||||
63 }
|
||||
57 if (cur_ops->cleanup != NULL)
|
||||
58 cur_ops->cleanup();
|
||||
59 if (atomic_read(&n_rcu_torture_error))
|
||||
60 rcu_torture_print_module_parms("End of test: FAILURE");
|
||||
61 else
|
||||
62 rcu_torture_print_module_parms("End of test: SUCCESS");
|
||||
63 }
|
||||
|
||||
Line 6 sets a global variable that prevents any RCU callbacks from
|
||||
re-posting themselves. This will not be necessary in most cases, since
|
||||
@@ -190,16 +169,17 @@ Quick Quiz #1:
|
||||
:ref:`Answer to Quick Quiz #1 <answer_rcubarrier_quiz_1>`
|
||||
|
||||
Your module might have additional complications. For example, if your
|
||||
module invokes call_rcu() from timers, you will need to first cancel all
|
||||
the timers, and only then invoke rcu_barrier() to wait for any remaining
|
||||
module invokes call_rcu() from timers, you will need to first refrain
|
||||
from posting new timers, cancel (or wait for) all the already-posted
|
||||
timers, and only then invoke rcu_barrier() to wait for any remaining
|
||||
RCU callbacks to complete.
|
||||
|
||||
Of course, if you module uses call_rcu(), you will need to invoke
|
||||
Of course, if your module uses call_rcu(), you will need to invoke
|
||||
rcu_barrier() before unloading. Similarly, if your module uses
|
||||
call_srcu(), you will need to invoke srcu_barrier() before unloading,
|
||||
and on the same srcu_struct structure. If your module uses call_rcu()
|
||||
**and** call_srcu(), then you will need to invoke rcu_barrier() **and**
|
||||
srcu_barrier().
|
||||
**and** call_srcu(), then (as noted above) you will need to invoke
|
||||
rcu_barrier() **and** srcu_barrier().
|
||||
|
||||
|
||||
Implementing rcu_barrier()
|
||||
@@ -211,27 +191,40 @@ queues. His implementation queues an RCU callback on each of the per-CPU
|
||||
callback queues, and then waits until they have all started executing, at
|
||||
which point, all earlier RCU callbacks are guaranteed to have completed.
|
||||
|
||||
The original code for rcu_barrier() was as follows::
|
||||
The original code for rcu_barrier() was roughly as follows::
|
||||
|
||||
1 void rcu_barrier(void)
|
||||
2 {
|
||||
3 BUG_ON(in_interrupt());
|
||||
4 /* Take cpucontrol mutex to protect against CPU hotplug */
|
||||
5 mutex_lock(&rcu_barrier_mutex);
|
||||
6 init_completion(&rcu_barrier_completion);
|
||||
7 atomic_set(&rcu_barrier_cpu_count, 0);
|
||||
8 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
|
||||
9 wait_for_completion(&rcu_barrier_completion);
|
||||
10 mutex_unlock(&rcu_barrier_mutex);
|
||||
11 }
|
||||
1 void rcu_barrier(void)
|
||||
2 {
|
||||
3 BUG_ON(in_interrupt());
|
||||
4 /* Take cpucontrol mutex to protect against CPU hotplug */
|
||||
5 mutex_lock(&rcu_barrier_mutex);
|
||||
6 init_completion(&rcu_barrier_completion);
|
||||
7 atomic_set(&rcu_barrier_cpu_count, 1);
|
||||
8 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
|
||||
9 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
|
||||
10 complete(&rcu_barrier_completion);
|
||||
11 wait_for_completion(&rcu_barrier_completion);
|
||||
12 mutex_unlock(&rcu_barrier_mutex);
|
||||
13 }
|
||||
|
||||
Line 3 verifies that the caller is in process context, and lines 5 and 10
|
||||
Line 3 verifies that the caller is in process context, and lines 5 and 12
|
||||
use rcu_barrier_mutex to ensure that only one rcu_barrier() is using the
|
||||
global completion and counters at a time, which are initialized on lines
|
||||
6 and 7. Line 8 causes each CPU to invoke rcu_barrier_func(), which is
|
||||
shown below. Note that the final "1" in on_each_cpu()'s argument list
|
||||
ensures that all the calls to rcu_barrier_func() will have completed
|
||||
before on_each_cpu() returns. Line 9 then waits for the completion.
|
||||
before on_each_cpu() returns. Line 9 removes the initial count from
|
||||
rcu_barrier_cpu_count, and if this count is now zero, line 10 finalizes
|
||||
the completion, which prevents line 11 from blocking. Either way,
|
||||
line 11 then waits (if needed) for the completion.
|
||||
|
||||
.. _rcubarrier_quiz_2:
|
||||
|
||||
Quick Quiz #2:
|
||||
Why doesn't line 8 initialize rcu_barrier_cpu_count to zero,
|
||||
thereby avoiding the need for lines 9 and 10?
|
||||
|
||||
:ref:`Answer to Quick Quiz #2 <answer_rcubarrier_quiz_2>`
|
||||
|
||||
This code was rewritten in 2008 and several times thereafter, but this
|
||||
still gives the general idea.
|
||||
@@ -239,21 +232,21 @@ still gives the general idea.
|
||||
The rcu_barrier_func() runs on each CPU, where it invokes call_rcu()
|
||||
to post an RCU callback, as follows::
|
||||
|
||||
1 static void rcu_barrier_func(void *notused)
|
||||
2 {
|
||||
3 int cpu = smp_processor_id();
|
||||
4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
|
||||
5 struct rcu_head *head;
|
||||
6
|
||||
7 head = &rdp->barrier;
|
||||
8 atomic_inc(&rcu_barrier_cpu_count);
|
||||
9 call_rcu(head, rcu_barrier_callback);
|
||||
10 }
|
||||
1 static void rcu_barrier_func(void *notused)
|
||||
2 {
|
||||
3 int cpu = smp_processor_id();
|
||||
4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
|
||||
5 struct rcu_head *head;
|
||||
6
|
||||
7 head = &rdp->barrier;
|
||||
8 atomic_inc(&rcu_barrier_cpu_count);
|
||||
9 call_rcu(head, rcu_barrier_callback);
|
||||
10 }
|
||||
|
||||
Lines 3 and 4 locate RCU's internal per-CPU rcu_data structure,
|
||||
which contains the struct rcu_head that needed for the later call to
|
||||
call_rcu(). Line 7 picks up a pointer to this struct rcu_head, and line
|
||||
8 increments a global counter. This counter will later be decremented
|
||||
8 increments the global counter. This counter will later be decremented
|
||||
by the callback. Line 9 then registers the rcu_barrier_callback() on
|
||||
the current CPU's queue.
|
||||
|
||||
@@ -261,33 +254,34 @@ The rcu_barrier_callback() function simply atomically decrements the
|
||||
rcu_barrier_cpu_count variable and finalizes the completion when it
|
||||
reaches zero, as follows::
|
||||
|
||||
1 static void rcu_barrier_callback(struct rcu_head *notused)
|
||||
2 {
|
||||
3 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
|
||||
4 complete(&rcu_barrier_completion);
|
||||
5 }
|
||||
1 static void rcu_barrier_callback(struct rcu_head *notused)
|
||||
2 {
|
||||
3 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
|
||||
4 complete(&rcu_barrier_completion);
|
||||
5 }
|
||||
|
||||
.. _rcubarrier_quiz_2:
|
||||
.. _rcubarrier_quiz_3:
|
||||
|
||||
Quick Quiz #2:
|
||||
Quick Quiz #3:
|
||||
What happens if CPU 0's rcu_barrier_func() executes
|
||||
immediately (thus incrementing rcu_barrier_cpu_count to the
|
||||
value one), but the other CPU's rcu_barrier_func() invocations
|
||||
are delayed for a full grace period? Couldn't this result in
|
||||
rcu_barrier() returning prematurely?
|
||||
|
||||
:ref:`Answer to Quick Quiz #2 <answer_rcubarrier_quiz_2>`
|
||||
:ref:`Answer to Quick Quiz #3 <answer_rcubarrier_quiz_3>`
|
||||
|
||||
The current rcu_barrier() implementation is more complex, due to the need
|
||||
to avoid disturbing idle CPUs (especially on battery-powered systems)
|
||||
and the need to minimally disturb non-idle CPUs in real-time systems.
|
||||
However, the code above illustrates the concepts.
|
||||
In addition, a great many optimizations have been applied. However,
|
||||
the code above illustrates the concepts.
|
||||
|
||||
|
||||
rcu_barrier() Summary
|
||||
---------------------
|
||||
|
||||
The rcu_barrier() primitive has seen relatively little use, since most
|
||||
The rcu_barrier() primitive is used relatively infrequently, since most
|
||||
code using RCU is in the core kernel rather than in modules. However, if
|
||||
you are using RCU from an unloadable module, you need to use rcu_barrier()
|
||||
so that your module may be safely unloaded.
|
||||
@@ -302,7 +296,8 @@ Quick Quiz #1:
|
||||
Is there any other situation where rcu_barrier() might
|
||||
be required?
|
||||
|
||||
Answer: Interestingly enough, rcu_barrier() was not originally
|
||||
Answer:
|
||||
Interestingly enough, rcu_barrier() was not originally
|
||||
implemented for module unloading. Nikita Danilov was using
|
||||
RCU in a filesystem, which resulted in a similar situation at
|
||||
filesystem-unmount time. Dipankar Sarma coded up rcu_barrier()
|
||||
@@ -318,13 +313,48 @@ Answer: Interestingly enough, rcu_barrier() was not originally
|
||||
.. _answer_rcubarrier_quiz_2:
|
||||
|
||||
Quick Quiz #2:
|
||||
Why doesn't line 8 initialize rcu_barrier_cpu_count to zero,
|
||||
thereby avoiding the need for lines 9 and 10?
|
||||
|
||||
Answer:
|
||||
Suppose that the on_each_cpu() function shown on line 8 was
|
||||
delayed, so that CPU 0's rcu_barrier_func() executed and
|
||||
the corresponding grace period elapsed, all before CPU 1's
|
||||
rcu_barrier_func() started executing. This would result in
|
||||
rcu_barrier_cpu_count being decremented to zero, so that line
|
||||
11's wait_for_completion() would return immediately, failing to
|
||||
wait for CPU 1's callbacks to be invoked.
|
||||
|
||||
Note that this was not a problem when the rcu_barrier() code
|
||||
was first added back in 2005. This is because on_each_cpu()
|
||||
disables preemption, which acted as an RCU read-side critical
|
||||
section, thus preventing CPU 0's grace period from completing
|
||||
until on_each_cpu() had dealt with all of the CPUs. However,
|
||||
with the advent of preemptible RCU, rcu_barrier() no longer
|
||||
waited on nonpreemptible regions of code in preemptible kernels,
|
||||
that being the job of the new rcu_barrier_sched() function.
|
||||
|
||||
However, with the RCU flavor consolidation around v4.20, this
|
||||
possibility was once again ruled out, because the consolidated
|
||||
RCU once again waits on nonpreemptible regions of code.
|
||||
|
||||
Nevertheless, that extra count might still be a good idea.
|
||||
Relying on these sort of accidents of implementation can result
|
||||
in later surprise bugs when the implementation changes.
|
||||
|
||||
:ref:`Back to Quick Quiz #2 <rcubarrier_quiz_2>`
|
||||
|
||||
.. _answer_rcubarrier_quiz_3:
|
||||
|
||||
Quick Quiz #3:
|
||||
What happens if CPU 0's rcu_barrier_func() executes
|
||||
immediately (thus incrementing rcu_barrier_cpu_count to the
|
||||
value one), but the other CPU's rcu_barrier_func() invocations
|
||||
are delayed for a full grace period? Couldn't this result in
|
||||
rcu_barrier() returning prematurely?
|
||||
|
||||
Answer: This cannot happen. The reason is that on_each_cpu() has its last
|
||||
Answer:
|
||||
This cannot happen. The reason is that on_each_cpu() has its last
|
||||
argument, the wait flag, set to "1". This flag is passed through
|
||||
to smp_call_function() and further to smp_call_function_on_cpu(),
|
||||
causing this latter to spin until the cross-CPU invocation of
|
||||
@@ -336,18 +366,15 @@ Answer: This cannot happen. The reason is that on_each_cpu() has its last
|
||||
|
||||
Therefore, on_each_cpu() disables preemption across its call
|
||||
to smp_call_function() and also across the local call to
|
||||
rcu_barrier_func(). This prevents the local CPU from context
|
||||
switching, again preventing grace periods from completing. This
|
||||
rcu_barrier_func(). Because recent RCU implementations treat
|
||||
preemption-disabled regions of code as RCU read-side critical
|
||||
sections, this prevents grace periods from completing. This
|
||||
means that all CPUs have executed rcu_barrier_func() before
|
||||
the first rcu_barrier_callback() can possibly execute, in turn
|
||||
preventing rcu_barrier_cpu_count from prematurely reaching zero.
|
||||
|
||||
Currently, -rt implementations of RCU keep but a single global
|
||||
queue for RCU callbacks, and thus do not suffer from this
|
||||
problem. However, when the -rt RCU eventually does have per-CPU
|
||||
callback queues, things will have to change. One simple change
|
||||
is to add an rcu_read_lock() before line 8 of rcu_barrier()
|
||||
and an rcu_read_unlock() after line 8 of this same function. If
|
||||
you can think of a better change, please let me know!
|
||||
But if on_each_cpu() ever decides to forgo disabling preemption,
|
||||
as might well happen due to real-time latency considerations,
|
||||
initializing rcu_barrier_cpu_count to one will save the day.
|
||||
|
||||
:ref:`Back to Quick Quiz #2 <rcubarrier_quiz_2>`
|
||||
:ref:`Back to Quick Quiz #3 <rcubarrier_quiz_3>`
|
||||
|
@@ -14,19 +14,19 @@ Using 'nulls'
|
||||
=============
|
||||
|
||||
Using special makers (called 'nulls') is a convenient way
|
||||
to solve following problem :
|
||||
to solve following problem.
|
||||
|
||||
A typical RCU linked list managing objects which are
|
||||
allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can
|
||||
use following algos :
|
||||
Without 'nulls', a typical RCU linked list managing objects which are
|
||||
allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can use the following
|
||||
algorithms:
|
||||
|
||||
1) Lookup algo
|
||||
--------------
|
||||
1) Lookup algorithm
|
||||
-------------------
|
||||
|
||||
::
|
||||
|
||||
rcu_read_lock()
|
||||
begin:
|
||||
rcu_read_lock()
|
||||
obj = lockless_lookup(key);
|
||||
if (obj) {
|
||||
if (!try_get_ref(obj)) // might fail for free objects
|
||||
@@ -38,6 +38,7 @@ use following algos :
|
||||
*/
|
||||
if (obj->key != key) { // not the object we expected
|
||||
put_ref(obj);
|
||||
rcu_read_unlock();
|
||||
goto begin;
|
||||
}
|
||||
}
|
||||
@@ -52,9 +53,9 @@ but a version with an additional memory barrier (smp_rmb())
|
||||
{
|
||||
struct hlist_node *node, *next;
|
||||
for (pos = rcu_dereference((head)->first);
|
||||
pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) &&
|
||||
({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; });
|
||||
pos = rcu_dereference(next))
|
||||
pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) &&
|
||||
({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; });
|
||||
pos = rcu_dereference(next))
|
||||
if (obj->key == key)
|
||||
return obj;
|
||||
return NULL;
|
||||
@@ -64,9 +65,9 @@ And note the traditional hlist_for_each_entry_rcu() misses this smp_rmb()::
|
||||
|
||||
struct hlist_node *node;
|
||||
for (pos = rcu_dereference((head)->first);
|
||||
pos && ({ prefetch(pos->next); 1; }) &&
|
||||
({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; });
|
||||
pos = rcu_dereference(pos->next))
|
||||
pos && ({ prefetch(pos->next); 1; }) &&
|
||||
({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; });
|
||||
pos = rcu_dereference(pos->next))
|
||||
if (obj->key == key)
|
||||
return obj;
|
||||
return NULL;
|
||||
@@ -82,36 +83,32 @@ Quoting Corey Minyard::
|
||||
solved by pre-fetching the "next" field (with proper barriers) before
|
||||
checking the key."
|
||||
|
||||
2) Insert algo
|
||||
--------------
|
||||
2) Insertion algorithm
|
||||
----------------------
|
||||
|
||||
We need to make sure a reader cannot read the new 'obj->obj_next' value
|
||||
and previous value of 'obj->key'. Or else, an item could be deleted
|
||||
and previous value of 'obj->key'. Otherwise, an item could be deleted
|
||||
from a chain, and inserted into another chain. If new chain was empty
|
||||
before the move, 'next' pointer is NULL, and lockless reader can
|
||||
not detect it missed following items in original chain.
|
||||
before the move, 'next' pointer is NULL, and lockless reader can not
|
||||
detect the fact that it missed following items in original chain.
|
||||
|
||||
::
|
||||
|
||||
/*
|
||||
* Please note that new inserts are done at the head of list,
|
||||
* not in the middle or end.
|
||||
*/
|
||||
* Please note that new inserts are done at the head of list,
|
||||
* not in the middle or end.
|
||||
*/
|
||||
obj = kmem_cache_alloc(...);
|
||||
lock_chain(); // typically a spin_lock()
|
||||
obj->key = key;
|
||||
/*
|
||||
* we need to make sure obj->key is updated before obj->next
|
||||
* or obj->refcnt
|
||||
*/
|
||||
smp_wmb();
|
||||
atomic_set(&obj->refcnt, 1);
|
||||
atomic_set_release(&obj->refcnt, 1); // key before refcnt
|
||||
hlist_add_head_rcu(&obj->obj_node, list);
|
||||
unlock_chain(); // typically a spin_unlock()
|
||||
|
||||
|
||||
3) Remove algo
|
||||
--------------
|
||||
3) Removal algorithm
|
||||
--------------------
|
||||
|
||||
Nothing special here, we can use a standard RCU hlist deletion.
|
||||
But thanks to SLAB_TYPESAFE_BY_RCU, beware a deleted object can be reused
|
||||
very very fast (before the end of RCU grace period)
|
||||
@@ -133,7 +130,7 @@ Avoiding extra smp_rmb()
|
||||
========================
|
||||
|
||||
With hlist_nulls we can avoid extra smp_rmb() in lockless_lookup()
|
||||
and extra smp_wmb() in insert function.
|
||||
and extra _release() in insert function.
|
||||
|
||||
For example, if we choose to store the slot number as the 'nulls'
|
||||
end-of-list marker for each slot of the hash table, we can detect
|
||||
@@ -142,59 +139,61 @@ to another chain) checking the final 'nulls' value if
|
||||
the lookup met the end of chain. If final 'nulls' value
|
||||
is not the slot number, then we must restart the lookup at
|
||||
the beginning. If the object was moved to the same chain,
|
||||
then the reader doesn't care : It might eventually
|
||||
then the reader doesn't care: It might occasionally
|
||||
scan the list again without harm.
|
||||
|
||||
|
||||
1) lookup algo
|
||||
--------------
|
||||
1) lookup algorithm
|
||||
-------------------
|
||||
|
||||
::
|
||||
|
||||
head = &table[slot];
|
||||
rcu_read_lock();
|
||||
begin:
|
||||
rcu_read_lock();
|
||||
hlist_nulls_for_each_entry_rcu(obj, node, head, member) {
|
||||
if (obj->key == key) {
|
||||
if (!try_get_ref(obj)) // might fail for free objects
|
||||
goto begin;
|
||||
if (obj->key != key) { // not the object we expected
|
||||
put_ref(obj);
|
||||
if (!try_get_ref(obj)) { // might fail for free objects
|
||||
rcu_read_unlock();
|
||||
goto begin;
|
||||
}
|
||||
goto out;
|
||||
if (obj->key != key) { // not the object we expected
|
||||
put_ref(obj);
|
||||
rcu_read_unlock();
|
||||
goto begin;
|
||||
}
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
// If the nulls value we got at the end of this lookup is
|
||||
// not the expected one, we must restart lookup.
|
||||
// We probably met an item that was moved to another chain.
|
||||
if (get_nulls_value(node) != slot) {
|
||||
put_ref(obj);
|
||||
rcu_read_unlock();
|
||||
goto begin;
|
||||
}
|
||||
/*
|
||||
* if the nulls value we got at the end of this lookup is
|
||||
* not the expected one, we must restart lookup.
|
||||
* We probably met an item that was moved to another chain.
|
||||
*/
|
||||
if (get_nulls_value(node) != slot)
|
||||
goto begin;
|
||||
obj = NULL;
|
||||
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
|
||||
2) Insert function
|
||||
------------------
|
||||
2) Insert algorithm
|
||||
-------------------
|
||||
|
||||
::
|
||||
|
||||
/*
|
||||
* Please note that new inserts are done at the head of list,
|
||||
* not in the middle or end.
|
||||
*/
|
||||
* Please note that new inserts are done at the head of list,
|
||||
* not in the middle or end.
|
||||
*/
|
||||
obj = kmem_cache_alloc(cachep);
|
||||
lock_chain(); // typically a spin_lock()
|
||||
obj->key = key;
|
||||
atomic_set_release(&obj->refcnt, 1); // key before refcnt
|
||||
/*
|
||||
* changes to obj->key must be visible before refcnt one
|
||||
*/
|
||||
smp_wmb();
|
||||
atomic_set(&obj->refcnt, 1);
|
||||
/*
|
||||
* insert obj in RCU way (readers might be traversing chain)
|
||||
*/
|
||||
* insert obj in RCU way (readers might be traversing chain)
|
||||
*/
|
||||
hlist_nulls_add_head_rcu(&obj->obj_node, list);
|
||||
unlock_chain(); // typically a spin_unlock()
|
||||
|
@@ -25,10 +25,10 @@ warnings:
|
||||
|
||||
- A CPU looping with bottom halves disabled.
|
||||
|
||||
- For !CONFIG_PREEMPTION kernels, a CPU looping anywhere in the kernel
|
||||
without invoking schedule(). If the looping in the kernel is
|
||||
really expected and desirable behavior, you might need to add
|
||||
some calls to cond_resched().
|
||||
- For !CONFIG_PREEMPTION kernels, a CPU looping anywhere in the
|
||||
kernel without potentially invoking schedule(). If the looping
|
||||
in the kernel is really expected and desirable behavior, you
|
||||
might need to add some calls to cond_resched().
|
||||
|
||||
- Booting Linux using a console connection that is too slow to
|
||||
keep up with the boot-time console-message rate. For example,
|
||||
@@ -108,16 +108,17 @@ warnings:
|
||||
|
||||
- A bug in the RCU implementation.
|
||||
|
||||
- A hardware failure. This is quite unlikely, but has occurred
|
||||
at least once in real life. A CPU failed in a running system,
|
||||
becoming unresponsive, but not causing an immediate crash.
|
||||
This resulted in a series of RCU CPU stall warnings, eventually
|
||||
leading the realization that the CPU had failed.
|
||||
- A hardware failure. This is quite unlikely, but is not at all
|
||||
uncommon in large datacenter. In one memorable case some decades
|
||||
back, a CPU failed in a running system, becoming unresponsive,
|
||||
but not causing an immediate crash. This resulted in a series
|
||||
of RCU CPU stall warnings, eventually leading the realization
|
||||
that the CPU had failed.
|
||||
|
||||
The RCU, RCU-sched, and RCU-tasks implementations have CPU stall warning.
|
||||
Note that SRCU does *not* have CPU stall warnings. Please note that
|
||||
RCU only detects CPU stalls when there is a grace period in progress.
|
||||
No grace period, no CPU stall warnings.
|
||||
The RCU, RCU-sched, RCU-tasks, and RCU-tasks-trace implementations have
|
||||
CPU stall warning. Note that SRCU does *not* have CPU stall warnings.
|
||||
Please note that RCU only detects CPU stalls when there is a grace period
|
||||
in progress. No grace period, no CPU stall warnings.
|
||||
|
||||
To diagnose the cause of the stall, inspect the stack traces.
|
||||
The offending function will usually be near the top of the stack.
|
||||
@@ -205,16 +206,21 @@ RCU_STALL_RAT_DELAY
|
||||
rcupdate.rcu_task_stall_timeout
|
||||
-------------------------------
|
||||
|
||||
This boot/sysfs parameter controls the RCU-tasks stall warning
|
||||
interval. A value of zero or less suppresses RCU-tasks stall
|
||||
warnings. A positive value sets the stall-warning interval
|
||||
in seconds. An RCU-tasks stall warning starts with the line:
|
||||
This boot/sysfs parameter controls the RCU-tasks and
|
||||
RCU-tasks-trace stall warning intervals. A value of zero or less
|
||||
suppresses RCU-tasks stall warnings. A positive value sets the
|
||||
stall-warning interval in seconds. An RCU-tasks stall warning
|
||||
starts with the line:
|
||||
|
||||
INFO: rcu_tasks detected stalls on tasks:
|
||||
|
||||
And continues with the output of sched_show_task() for each
|
||||
task stalling the current RCU-tasks grace period.
|
||||
|
||||
An RCU-tasks-trace stall warning starts (and continues) similarly:
|
||||
|
||||
INFO: rcu_tasks_trace detected stalls on tasks
|
||||
|
||||
|
||||
Interpreting RCU's CPU Stall-Detector "Splats"
|
||||
==============================================
|
||||
@@ -248,7 +254,8 @@ dynticks counter, which will have an even-numbered value if the CPU
|
||||
is in dyntick-idle mode and an odd-numbered value otherwise. The hex
|
||||
number between the two "/"s is the value of the nesting, which will be
|
||||
a small non-negative number if in the idle loop (as shown above) and a
|
||||
very large positive number otherwise.
|
||||
very large positive number otherwise. The number following the final
|
||||
"/" is the NMI nesting, which will be a small non-negative number.
|
||||
|
||||
The "softirq=" portion of the message tracks the number of RCU softirq
|
||||
handlers that the stalled CPU has executed. The number before the "/"
|
||||
@@ -383,3 +390,95 @@ for example, "P3421".
|
||||
|
||||
It is entirely possible to see stall warnings from normal and from
|
||||
expedited grace periods at about the same time during the same run.
|
||||
|
||||
RCU_CPU_STALL_CPUTIME
|
||||
=====================
|
||||
|
||||
In kernels built with CONFIG_RCU_CPU_STALL_CPUTIME=y or booted with
|
||||
rcupdate.rcu_cpu_stall_cputime=1, the following additional information
|
||||
is supplied with each RCU CPU stall warning::
|
||||
|
||||
rcu: hardirqs softirqs csw/system
|
||||
rcu: number: 624 45 0
|
||||
rcu: cputime: 69 1 2425 ==> 2500(ms)
|
||||
|
||||
These statistics are collected during the sampling period. The values
|
||||
in row "number:" are the number of hard interrupts, number of soft
|
||||
interrupts, and number of context switches on the stalled CPU. The
|
||||
first three values in row "cputime:" indicate the CPU time in
|
||||
milliseconds consumed by hard interrupts, soft interrupts, and tasks
|
||||
on the stalled CPU. The last number is the measurement interval, again
|
||||
in milliseconds. Because user-mode tasks normally do not cause RCU CPU
|
||||
stalls, these tasks are typically kernel tasks, which is why only the
|
||||
system CPU time are considered.
|
||||
|
||||
The sampling period is shown as follows::
|
||||
|
||||
|<------------first timeout---------->|<-----second timeout----->|
|
||||
|<--half timeout-->|<--half timeout-->| |
|
||||
| |<--first period-->| |
|
||||
| |<-----------second sampling period---------->|
|
||||
| | | |
|
||||
snapshot time point 1st-stall 2nd-stall
|
||||
|
||||
The following describes four typical scenarios:
|
||||
|
||||
1. A CPU looping with interrupts disabled.
|
||||
|
||||
::
|
||||
|
||||
rcu: hardirqs softirqs csw/system
|
||||
rcu: number: 0 0 0
|
||||
rcu: cputime: 0 0 0 ==> 2500(ms)
|
||||
|
||||
Because interrupts have been disabled throughout the measurement
|
||||
interval, there are no interrupts and no context switches.
|
||||
Furthermore, because CPU time consumption was measured using interrupt
|
||||
handlers, the system CPU consumption is misleadingly measured as zero.
|
||||
This scenario will normally also have "(0 ticks this GP)" printed on
|
||||
this CPU's summary line.
|
||||
|
||||
2. A CPU looping with bottom halves disabled.
|
||||
|
||||
This is similar to the previous example, but with non-zero number of
|
||||
and CPU time consumed by hard interrupts, along with non-zero CPU
|
||||
time consumed by in-kernel execution::
|
||||
|
||||
rcu: hardirqs softirqs csw/system
|
||||
rcu: number: 624 0 0
|
||||
rcu: cputime: 49 0 2446 ==> 2500(ms)
|
||||
|
||||
The fact that there are zero softirqs gives a hint that these were
|
||||
disabled, perhaps via local_bh_disable(). It is of course possible
|
||||
that there were no softirqs, perhaps because all events that would
|
||||
result in softirq execution are confined to other CPUs. In this case,
|
||||
the diagnosis should continue as shown in the next example.
|
||||
|
||||
3. A CPU looping with preemption disabled.
|
||||
|
||||
Here, only the number of context switches is zero::
|
||||
|
||||
rcu: hardirqs softirqs csw/system
|
||||
rcu: number: 624 45 0
|
||||
rcu: cputime: 69 1 2425 ==> 2500(ms)
|
||||
|
||||
This situation hints that the stalled CPU was looping with preemption
|
||||
disabled.
|
||||
|
||||
4. No looping, but massive hard and soft interrupts.
|
||||
|
||||
::
|
||||
|
||||
rcu: hardirqs softirqs csw/system
|
||||
rcu: number: xx xx 0
|
||||
rcu: cputime: xx xx 0 ==> 2500(ms)
|
||||
|
||||
Here, the number and CPU time of hard interrupts are all non-zero,
|
||||
but the number of context switches and the in-kernel CPU time consumed
|
||||
are zero. The number and cputime of soft interrupts will usually be
|
||||
non-zero, but could be zero, for example, if the CPU was spinning
|
||||
within a single hard interrupt handler.
|
||||
|
||||
If this type of RCU CPU stall warning can be reproduced, you can
|
||||
narrow it down by looking at /proc/interrupts or by writing code to
|
||||
trace each interrupt, for example, by referring to show_interrupts().
|
||||
|
@@ -206,7 +206,11 @@ values for memory may require disabling the callback-flooding tests
|
||||
using the --bootargs parameter discussed below.
|
||||
|
||||
Sometimes additional debugging is useful, and in such cases the --kconfig
|
||||
parameter to kvm.sh may be used, for example, ``--kconfig 'CONFIG_KASAN=y'``.
|
||||
parameter to kvm.sh may be used, for example, ``--kconfig 'CONFIG_RCU_EQS_DEBUG=y'``.
|
||||
In addition, there are the --gdb, --kasan, and --kcsan parameters.
|
||||
Note that --gdb limits you to one scenario per kvm.sh run and requires
|
||||
that you have another window open from which to run ``gdb`` as instructed
|
||||
by the script.
|
||||
|
||||
Kernel boot arguments can also be supplied, for example, to control
|
||||
rcutorture's module parameters. For example, to test a change to RCU's
|
||||
@@ -219,10 +223,17 @@ require disabling rcutorture's callback-flooding tests::
|
||||
--bootargs 'rcutorture.fwd_progress=0'
|
||||
|
||||
Sometimes all that is needed is a full set of kernel builds. This is
|
||||
what the --buildonly argument does.
|
||||
what the --buildonly parameter does.
|
||||
|
||||
Finally, the --trust-make argument allows each kernel build to reuse what
|
||||
it can from the previous kernel build.
|
||||
The --duration parameter can override the default run time of 30 minutes.
|
||||
For example, ``--duration 2d`` would run for two days, ``--duration 3h``
|
||||
would run for three hours, ``--duration 5m`` would run for five minutes,
|
||||
and ``--duration 45s`` would run for 45 seconds. This last can be useful
|
||||
for tracking down rare boot-time failures.
|
||||
|
||||
Finally, the --trust-make parameter allows each kernel build to reuse what
|
||||
it can from the previous kernel build. Please note that without the
|
||||
--trust-make parameter, your tags files may be demolished.
|
||||
|
||||
There are additional more arcane arguments that are documented in the
|
||||
source code of the kvm.sh script.
|
||||
@@ -291,3 +302,73 @@ the following summary at the end of the run on a 12-CPU system::
|
||||
TREE07 ------- 167347 GPs (30.9902/s) [rcu: g1079021 f0x0 ] n_max_cbs: 478732
|
||||
CPU count limited from 16 to 12
|
||||
TREE09 ------- 752238 GPs (139.303/s) [rcu: g13075057 f0x0 ] n_max_cbs: 99011
|
||||
|
||||
|
||||
Repeated Runs
|
||||
=============
|
||||
|
||||
Suppose that you are chasing down a rare boot-time failure. Although you
|
||||
could use kvm.sh, doing so will rebuild the kernel on each run. If you
|
||||
need (say) 1,000 runs to have confidence that you have fixed the bug,
|
||||
these pointless rebuilds can become extremely annoying.
|
||||
|
||||
This is why kvm-again.sh exists.
|
||||
|
||||
Suppose that a previous kvm.sh run left its output in this directory::
|
||||
|
||||
tools/testing/selftests/rcutorture/res/2022.11.03-11.26.28
|
||||
|
||||
Then this run can be re-run without rebuilding as follow:
|
||||
|
||||
kvm-again.sh tools/testing/selftests/rcutorture/res/2022.11.03-11.26.28
|
||||
|
||||
A few of the original run's kvm.sh parameters may be overridden, perhaps
|
||||
most notably --duration and --bootargs. For example::
|
||||
|
||||
kvm-again.sh tools/testing/selftests/rcutorture/res/2022.11.03-11.26.28 \
|
||||
--duration 45s
|
||||
|
||||
would re-run the previous test, but for only 45 seconds, thus facilitating
|
||||
tracking down the aforementioned rare boot-time failure.
|
||||
|
||||
|
||||
Distributed Runs
|
||||
================
|
||||
|
||||
Although kvm.sh is quite useful, its testing is confined to a single
|
||||
system. It is not all that hard to use your favorite framework to cause
|
||||
(say) 5 instances of kvm.sh to run on your 5 systems, but this will very
|
||||
likely unnecessarily rebuild kernels. In addition, manually distributing
|
||||
the desired rcutorture scenarios across the available systems can be
|
||||
painstaking and error-prone.
|
||||
|
||||
And this is why the kvm-remote.sh script exists.
|
||||
|
||||
If you the following command works::
|
||||
|
||||
ssh system0 date
|
||||
|
||||
and if it also works for system1, system2, system3, system4, and system5,
|
||||
and all of these systems have 64 CPUs, you can type::
|
||||
|
||||
kvm-remote.sh "system0 system1 system2 system3 system4 system5" \
|
||||
--cpus 64 --duration 8h --configs "5*CFLIST"
|
||||
|
||||
This will build each default scenario's kernel on the local system, then
|
||||
spread each of five instances of each scenario over the systems listed,
|
||||
running each scenario for eight hours. At the end of the runs, the
|
||||
results will be gathered, recorded, and printed. Most of the parameters
|
||||
that kvm.sh will accept can be passed to kvm-remote.sh, but the list of
|
||||
systems must come first.
|
||||
|
||||
The kvm.sh ``--dryrun scenarios`` argument is useful for working out
|
||||
how many scenarios may be run in one batch across a group of systems.
|
||||
|
||||
You can also re-run a previous remote run in a manner similar to kvm.sh:
|
||||
|
||||
kvm-remote.sh "system0 system1 system2 system3 system4 system5" \
|
||||
tools/testing/selftests/rcutorture/res/2022.11.03-11.26.28-remote \
|
||||
--duration 24h
|
||||
|
||||
In this case, most of the kvm-again.sh parmeters may be supplied following
|
||||
the pathname of the old run-results directory.
|
||||
|
@@ -16,18 +16,23 @@ to start learning about RCU:
|
||||
| 6. The RCU API, 2019 Edition https://lwn.net/Articles/777036/
|
||||
| 2019 Big API Table https://lwn.net/Articles/777165/
|
||||
|
||||
For those preferring video:
|
||||
|
||||
| 1. Unraveling RCU Mysteries: Fundamentals https://www.linuxfoundation.org/webinars/unraveling-rcu-usage-mysteries
|
||||
| 2. Unraveling RCU Mysteries: Additional Use Cases https://www.linuxfoundation.org/webinars/unraveling-rcu-usage-mysteries-additional-use-cases
|
||||
|
||||
|
||||
What is RCU?
|
||||
|
||||
RCU is a synchronization mechanism that was added to the Linux kernel
|
||||
during the 2.5 development effort that is optimized for read-mostly
|
||||
situations. Although RCU is actually quite simple once you understand it,
|
||||
getting there can sometimes be a challenge. Part of the problem is that
|
||||
most of the past descriptions of RCU have been written with the mistaken
|
||||
assumption that there is "one true way" to describe RCU. Instead,
|
||||
the experience has been that different people must take different paths
|
||||
to arrive at an understanding of RCU. This document provides several
|
||||
different paths, as follows:
|
||||
situations. Although RCU is actually quite simple, making effective use
|
||||
of it requires you to think differently about your code. Another part
|
||||
of the problem is the mistaken assumption that there is "one true way" to
|
||||
describe and to use RCU. Instead, the experience has been that different
|
||||
people must take different paths to arrive at an understanding of RCU,
|
||||
depending on their experiences and use cases. This document provides
|
||||
several different paths, as follows:
|
||||
|
||||
:ref:`1. RCU OVERVIEW <1_whatisRCU>`
|
||||
|
||||
@@ -157,34 +162,36 @@ rcu_read_lock()
|
||||
^^^^^^^^^^^^^^^
|
||||
void rcu_read_lock(void);
|
||||
|
||||
Used by a reader to inform the reclaimer that the reader is
|
||||
entering an RCU read-side critical section. It is illegal
|
||||
to block while in an RCU read-side critical section, though
|
||||
kernels built with CONFIG_PREEMPT_RCU can preempt RCU
|
||||
read-side critical sections. Any RCU-protected data structure
|
||||
accessed during an RCU read-side critical section is guaranteed to
|
||||
remain unreclaimed for the full duration of that critical section.
|
||||
Reference counts may be used in conjunction with RCU to maintain
|
||||
longer-term references to data structures.
|
||||
This temporal primitive is used by a reader to inform the
|
||||
reclaimer that the reader is entering an RCU read-side critical
|
||||
section. It is illegal to block while in an RCU read-side
|
||||
critical section, though kernels built with CONFIG_PREEMPT_RCU
|
||||
can preempt RCU read-side critical sections. Any RCU-protected
|
||||
data structure accessed during an RCU read-side critical section
|
||||
is guaranteed to remain unreclaimed for the full duration of that
|
||||
critical section. Reference counts may be used in conjunction
|
||||
with RCU to maintain longer-term references to data structures.
|
||||
|
||||
rcu_read_unlock()
|
||||
^^^^^^^^^^^^^^^^^
|
||||
void rcu_read_unlock(void);
|
||||
|
||||
Used by a reader to inform the reclaimer that the reader is
|
||||
exiting an RCU read-side critical section. Note that RCU
|
||||
read-side critical sections may be nested and/or overlapping.
|
||||
This temporal primitives is used by a reader to inform the
|
||||
reclaimer that the reader is exiting an RCU read-side critical
|
||||
section. Note that RCU read-side critical sections may be nested
|
||||
and/or overlapping.
|
||||
|
||||
synchronize_rcu()
|
||||
^^^^^^^^^^^^^^^^^
|
||||
void synchronize_rcu(void);
|
||||
|
||||
Marks the end of updater code and the beginning of reclaimer
|
||||
code. It does this by blocking until all pre-existing RCU
|
||||
read-side critical sections on all CPUs have completed.
|
||||
Note that synchronize_rcu() will **not** necessarily wait for
|
||||
any subsequent RCU read-side critical sections to complete.
|
||||
For example, consider the following sequence of events::
|
||||
This temporal primitive marks the end of updater code and the
|
||||
beginning of reclaimer code. It does this by blocking until
|
||||
all pre-existing RCU read-side critical sections on all CPUs
|
||||
have completed. Note that synchronize_rcu() will **not**
|
||||
necessarily wait for any subsequent RCU read-side critical
|
||||
sections to complete. For example, consider the following
|
||||
sequence of events::
|
||||
|
||||
CPU 0 CPU 1 CPU 2
|
||||
----------------- ------------------------- ---------------
|
||||
@@ -211,13 +218,13 @@ synchronize_rcu()
|
||||
to be useful in all but the most read-intensive situations,
|
||||
synchronize_rcu()'s overhead must also be quite small.
|
||||
|
||||
The call_rcu() API is a callback form of synchronize_rcu(),
|
||||
and is described in more detail in a later section. Instead of
|
||||
blocking, it registers a function and argument which are invoked
|
||||
after all ongoing RCU read-side critical sections have completed.
|
||||
This callback variant is particularly useful in situations where
|
||||
it is illegal to block or where update-side performance is
|
||||
critically important.
|
||||
The call_rcu() API is an asynchronous callback form of
|
||||
synchronize_rcu(), and is described in more detail in a later
|
||||
section. Instead of blocking, it registers a function and
|
||||
argument which are invoked after all ongoing RCU read-side
|
||||
critical sections have completed. This callback variant is
|
||||
particularly useful in situations where it is illegal to block
|
||||
or where update-side performance is critically important.
|
||||
|
||||
However, the call_rcu() API should not be used lightly, as use
|
||||
of the synchronize_rcu() API generally results in simpler code.
|
||||
@@ -236,11 +243,13 @@ rcu_assign_pointer()
|
||||
would be cool to be able to declare a function in this manner.
|
||||
(Compiler experts will no doubt disagree.)
|
||||
|
||||
The updater uses this function to assign a new value to an
|
||||
The updater uses this spatial macro to assign a new value to an
|
||||
RCU-protected pointer, in order to safely communicate the change
|
||||
in value from the updater to the reader. This macro does not
|
||||
evaluate to an rvalue, but it does execute any memory-barrier
|
||||
instructions required for a given CPU architecture.
|
||||
in value from the updater to the reader. This is a spatial (as
|
||||
opposed to temporal) macro. It does not evaluate to an rvalue,
|
||||
but it does execute any memory-barrier instructions required
|
||||
for a given CPU architecture. Its ordering properties are that
|
||||
of a store-release operation.
|
||||
|
||||
Perhaps just as important, it serves to document (1) which
|
||||
pointers are protected by RCU and (2) the point at which a
|
||||
@@ -255,14 +264,15 @@ rcu_dereference()
|
||||
Like rcu_assign_pointer(), rcu_dereference() must be implemented
|
||||
as a macro.
|
||||
|
||||
The reader uses rcu_dereference() to fetch an RCU-protected
|
||||
pointer, which returns a value that may then be safely
|
||||
dereferenced. Note that rcu_dereference() does not actually
|
||||
dereference the pointer, instead, it protects the pointer for
|
||||
later dereferencing. It also executes any needed memory-barrier
|
||||
instructions for a given CPU architecture. Currently, only Alpha
|
||||
needs memory barriers within rcu_dereference() -- on other CPUs,
|
||||
it compiles to nothing, not even a compiler directive.
|
||||
The reader uses the spatial rcu_dereference() macro to fetch
|
||||
an RCU-protected pointer, which returns a value that may
|
||||
then be safely dereferenced. Note that rcu_dereference()
|
||||
does not actually dereference the pointer, instead, it
|
||||
protects the pointer for later dereferencing. It also
|
||||
executes any needed memory-barrier instructions for a given
|
||||
CPU architecture. Currently, only Alpha needs memory barriers
|
||||
within rcu_dereference() -- on other CPUs, it compiles to a
|
||||
volatile load.
|
||||
|
||||
Common coding practice uses rcu_dereference() to copy an
|
||||
RCU-protected pointer to a local variable, then dereferences
|
||||
@@ -355,12 +365,15 @@ reader, updater, and reclaimer.
|
||||
synchronize_rcu() & call_rcu()
|
||||
|
||||
|
||||
The RCU infrastructure observes the time sequence of rcu_read_lock(),
|
||||
The RCU infrastructure observes the temporal sequence of rcu_read_lock(),
|
||||
rcu_read_unlock(), synchronize_rcu(), and call_rcu() invocations in
|
||||
order to determine when (1) synchronize_rcu() invocations may return
|
||||
to their callers and (2) call_rcu() callbacks may be invoked. Efficient
|
||||
implementations of the RCU infrastructure make heavy use of batching in
|
||||
order to amortize their overhead over many uses of the corresponding APIs.
|
||||
The rcu_assign_pointer() and rcu_dereference() invocations communicate
|
||||
spatial changes via stores to and loads from the RCU-protected pointer in
|
||||
question.
|
||||
|
||||
There are at least three flavors of RCU usage in the Linux kernel. The diagram
|
||||
above shows the most common one. On the updater side, the rcu_assign_pointer(),
|
||||
@@ -392,7 +405,9 @@ b. RCU applied to networking data structures that may be subjected
|
||||
c. RCU applied to scheduler and interrupt/NMI-handler tasks.
|
||||
|
||||
Again, most uses will be of (a). The (b) and (c) cases are important
|
||||
for specialized uses, but are relatively uncommon.
|
||||
for specialized uses, but are relatively uncommon. The SRCU, RCU-Tasks,
|
||||
RCU-Tasks-Rude, and RCU-Tasks-Trace have similar relationships among
|
||||
their assorted primitives.
|
||||
|
||||
.. _3_whatisRCU:
|
||||
|
||||
@@ -468,7 +483,7 @@ So, to sum up:
|
||||
- Within an RCU read-side critical section, use rcu_dereference()
|
||||
to dereference RCU-protected pointers.
|
||||
|
||||
- Use some solid scheme (such as locks or semaphores) to
|
||||
- Use some solid design (such as locks or semaphores) to
|
||||
keep concurrent updates from interfering with each other.
|
||||
|
||||
- Use rcu_assign_pointer() to update an RCU-protected pointer.
|
||||
@@ -579,6 +594,14 @@ to avoid having to write your own callback::
|
||||
|
||||
kfree_rcu(old_fp, rcu);
|
||||
|
||||
If the occasional sleep is permitted, the single-argument form may
|
||||
be used, omitting the rcu_head structure from struct foo.
|
||||
|
||||
kfree_rcu(old_fp);
|
||||
|
||||
This variant of kfree_rcu() almost never blocks, but might do so by
|
||||
invoking synchronize_rcu() in response to memory-allocation failure.
|
||||
|
||||
Again, see checklist.rst for additional rules governing the use of RCU.
|
||||
|
||||
.. _5_whatisRCU:
|
||||
@@ -596,7 +619,7 @@ lacking both functionality and performance. However, they are useful
|
||||
in getting a feel for how RCU works. See kernel/rcu/update.c for a
|
||||
production-quality implementation, and see:
|
||||
|
||||
http://www.rdrop.com/users/paulmck/RCU
|
||||
https://docs.google.com/document/d/1X0lThx8OK0ZgLMqVoXiR4ZrGURHrXK6NyLRbeXe3Xac/edit
|
||||
|
||||
for papers describing the Linux kernel RCU implementation. The OLS'01
|
||||
and OLS'02 papers are a good introduction, and the dissertation provides
|
||||
@@ -929,6 +952,8 @@ unfortunately any spinlock in a ``SLAB_TYPESAFE_BY_RCU`` object must be
|
||||
initialized after each and every call to kmem_cache_alloc(), which renders
|
||||
reference-free spinlock acquisition completely unsafe. Therefore, when
|
||||
using ``SLAB_TYPESAFE_BY_RCU``, make proper use of a reference counter.
|
||||
(Those willing to use a kmem_cache constructor may also use locking,
|
||||
including cache-friendly sequence locking.)
|
||||
|
||||
With traditional reference counting -- such as that implemented by the
|
||||
kref library in Linux -- there is typically code that runs when the last
|
||||
@@ -1047,6 +1072,30 @@ sched::
|
||||
rcu_read_lock_sched_held
|
||||
|
||||
|
||||
RCU-Tasks::
|
||||
|
||||
Critical sections Grace period Barrier
|
||||
|
||||
N/A call_rcu_tasks rcu_barrier_tasks
|
||||
synchronize_rcu_tasks
|
||||
|
||||
|
||||
RCU-Tasks-Rude::
|
||||
|
||||
Critical sections Grace period Barrier
|
||||
|
||||
N/A call_rcu_tasks_rude rcu_barrier_tasks_rude
|
||||
synchronize_rcu_tasks_rude
|
||||
|
||||
|
||||
RCU-Tasks-Trace::
|
||||
|
||||
Critical sections Grace period Barrier
|
||||
|
||||
rcu_read_lock_trace call_rcu_tasks_trace rcu_barrier_tasks_trace
|
||||
rcu_read_unlock_trace synchronize_rcu_tasks_trace
|
||||
|
||||
|
||||
SRCU::
|
||||
|
||||
Critical sections Grace period Barrier
|
||||
@@ -1087,35 +1136,43 @@ list can be helpful:
|
||||
|
||||
a. Will readers need to block? If so, you need SRCU.
|
||||
|
||||
b. What about the -rt patchset? If readers would need to block
|
||||
in an non-rt kernel, you need SRCU. If readers would block
|
||||
in a -rt kernel, but not in a non-rt kernel, SRCU is not
|
||||
necessary. (The -rt patchset turns spinlocks into sleeplocks,
|
||||
hence this distinction.)
|
||||
b. Will readers need to block and are you doing tracing, for
|
||||
example, ftrace or BPF? If so, you need RCU-tasks,
|
||||
RCU-tasks-rude, and/or RCU-tasks-trace.
|
||||
|
||||
c. Do you need to treat NMI handlers, hardirq handlers,
|
||||
c. What about the -rt patchset? If readers would need to block in
|
||||
an non-rt kernel, you need SRCU. If readers would block when
|
||||
acquiring spinlocks in a -rt kernel, but not in a non-rt kernel,
|
||||
SRCU is not necessary. (The -rt patchset turns spinlocks into
|
||||
sleeplocks, hence this distinction.)
|
||||
|
||||
d. Do you need to treat NMI handlers, hardirq handlers,
|
||||
and code segments with preemption disabled (whether
|
||||
via preempt_disable(), local_irq_save(), local_bh_disable(),
|
||||
or some other mechanism) as if they were explicit RCU readers?
|
||||
If so, RCU-sched is the only choice that will work for you.
|
||||
If so, RCU-sched readers are the only choice that will work
|
||||
for you, but since about v4.20 you use can use the vanilla RCU
|
||||
update primitives.
|
||||
|
||||
d. Do you need RCU grace periods to complete even in the face
|
||||
of softirq monopolization of one or more of the CPUs? For
|
||||
example, is your code subject to network-based denial-of-service
|
||||
attacks? If so, you should disable softirq across your readers,
|
||||
for example, by using rcu_read_lock_bh().
|
||||
e. Do you need RCU grace periods to complete even in the face of
|
||||
softirq monopolization of one or more of the CPUs? For example,
|
||||
is your code subject to network-based denial-of-service attacks?
|
||||
If so, you should disable softirq across your readers, for
|
||||
example, by using rcu_read_lock_bh(). Since about v4.20 you
|
||||
use can use the vanilla RCU update primitives.
|
||||
|
||||
e. Is your workload too update-intensive for normal use of
|
||||
f. Is your workload too update-intensive for normal use of
|
||||
RCU, but inappropriate for other synchronization mechanisms?
|
||||
If so, consider SLAB_TYPESAFE_BY_RCU (which was originally
|
||||
named SLAB_DESTROY_BY_RCU). But please be careful!
|
||||
|
||||
f. Do you need read-side critical sections that are respected
|
||||
even though they are in the middle of the idle loop, during
|
||||
user-mode execution, or on an offlined CPU? If so, SRCU is the
|
||||
only choice that will work for you.
|
||||
g. Do you need read-side critical sections that are respected even
|
||||
on CPUs that are deep in the idle loop, during entry to or exit
|
||||
from user-mode execution, or on an offlined CPU? If so, SRCU
|
||||
and RCU Tasks Trace are the only choices that will work for you,
|
||||
with SRCU being strongly preferred in almost all cases.
|
||||
|
||||
g. Otherwise, use RCU.
|
||||
h. Otherwise, use RCU.
|
||||
|
||||
Of course, this all assumes that you have determined that RCU is in fact
|
||||
the right tool for your job.
|
||||
|
@@ -67,9 +67,9 @@ tree - drivers/accel/.
|
||||
The accelerator devices will be exposed to the user space with the dedicated
|
||||
261 major number and will have the following convention:
|
||||
|
||||
- device char files - /dev/accel/accel*
|
||||
- sysfs - /sys/class/accel/accel*/
|
||||
- debugfs - /sys/kernel/debug/accel/accel*/
|
||||
- device char files - /dev/accel/accel\*
|
||||
- sysfs - /sys/class/accel/accel\*/
|
||||
- debugfs - /sys/kernel/debug/accel/\*/
|
||||
|
||||
Getting Started
|
||||
===============
|
||||
|
@@ -204,7 +204,7 @@ For example::
|
||||
This should present your unmodified backing device data in /dev/loop0
|
||||
|
||||
If your cache is in writethrough mode, then you can safely discard the
|
||||
cache device without loosing data.
|
||||
cache device without losing data.
|
||||
|
||||
|
||||
E) Wiping a cache device
|
||||
|
@@ -3,6 +3,7 @@ Linux and parallel port IDE devices
|
||||
===================================
|
||||
|
||||
PARIDE v1.03 (c) 1997-8 Grant Guenther <grant@torque.net>
|
||||
PATA_PARPORT (c) 2023 Ondrej Zary
|
||||
|
||||
1. Introduction
|
||||
===============
|
||||
@@ -51,27 +52,15 @@ parallel port IDE subsystem, including:
|
||||
|
||||
as well as most of the clone and no-name products on the market.
|
||||
|
||||
To support such a wide range of devices, PARIDE, the parallel port IDE
|
||||
subsystem, is actually structured in three parts. There is a base
|
||||
paride module which provides a registry and some common methods for
|
||||
accessing the parallel ports. The second component is a set of
|
||||
high-level drivers for each of the different types of supported devices:
|
||||
To support such a wide range of devices, pata_parport is actually structured
|
||||
in two parts. There is a base pata_parport module which provides an interface
|
||||
to kernel libata subsystem, registry and some common methods for accessing
|
||||
the parallel ports.
|
||||
|
||||
=== =============
|
||||
pd IDE disk
|
||||
pcd ATAPI CD-ROM
|
||||
pf ATAPI disk
|
||||
pt ATAPI tape
|
||||
pg ATAPI generic
|
||||
=== =============
|
||||
|
||||
(Currently, the pg driver is only used with CD-R drives).
|
||||
|
||||
The high-level drivers function according to the relevant standards.
|
||||
The third component of PARIDE is a set of low-level protocol drivers
|
||||
for each of the parallel port IDE adapter chips. Thanks to the interest
|
||||
and encouragement of Linux users from many parts of the world,
|
||||
support is available for almost all known adapter protocols:
|
||||
The second component is a set of low-level protocol drivers for each of the
|
||||
parallel port IDE adapter chips. Thanks to the interest and encouragement of
|
||||
Linux users from many parts of the world, support is available for almost all
|
||||
known adapter protocols:
|
||||
|
||||
==== ====================================== ====
|
||||
aten ATEN EH-100 (HK)
|
||||
@@ -91,251 +80,87 @@ support is available for almost all known adapter protocols:
|
||||
==== ====================================== ====
|
||||
|
||||
|
||||
2. Using the PARIDE subsystem
|
||||
=============================
|
||||
2. Using pata_parport subsystem
|
||||
===============================
|
||||
|
||||
While configuring the Linux kernel, you may choose either to build
|
||||
the PARIDE drivers into your kernel, or to build them as modules.
|
||||
the pata_parport drivers into your kernel, or to build them as modules.
|
||||
|
||||
In either case, you will need to select "Parallel port IDE device support"
|
||||
as well as at least one of the high-level drivers and at least one
|
||||
of the parallel port communication protocols. If you do not know
|
||||
what kind of parallel port adapter is used in your drive, you could
|
||||
begin by checking the file names and any text files on your DOS
|
||||
and at least one of the parallel port communication protocols.
|
||||
If you do not know what kind of parallel port adapter is used in your drive,
|
||||
you could begin by checking the file names and any text files on your DOS
|
||||
installation floppy. Alternatively, you can look at the markings on
|
||||
the adapter chip itself. That's usually sufficient to identify the
|
||||
correct device.
|
||||
|
||||
You can actually select all the protocol modules, and allow the PARIDE
|
||||
You can actually select all the protocol modules, and allow the pata_parport
|
||||
subsystem to try them all for you.
|
||||
|
||||
For the "brand-name" products listed above, here are the protocol
|
||||
and high-level drivers that you would use:
|
||||
|
||||
================ ============ ====== ========
|
||||
Manufacturer Model Driver Protocol
|
||||
================ ============ ====== ========
|
||||
MicroSolutions CD-ROM pcd bpck
|
||||
MicroSolutions PD drive pf bpck
|
||||
MicroSolutions hard-drive pd bpck
|
||||
MicroSolutions 8000t tape pt bpck
|
||||
SyQuest EZ, SparQ pd epat
|
||||
Imation Superdisk pf epat
|
||||
Maxell Superdisk pf friq
|
||||
Avatar Shark pd epat
|
||||
FreeCom CD-ROM pcd frpw
|
||||
Hewlett-Packard 5GB Tape pt epat
|
||||
Hewlett-Packard 7200e (CD) pcd epat
|
||||
Hewlett-Packard 7200e (CD-R) pg epat
|
||||
================ ============ ====== ========
|
||||
================ ============ ========
|
||||
Manufacturer Model Protocol
|
||||
================ ============ ========
|
||||
MicroSolutions CD-ROM bpck
|
||||
MicroSolutions PD drive bpck
|
||||
MicroSolutions hard-drive bpck
|
||||
MicroSolutions 8000t tape bpck
|
||||
SyQuest EZ, SparQ epat
|
||||
Imation Superdisk epat
|
||||
Maxell Superdisk friq
|
||||
Avatar Shark epat
|
||||
FreeCom CD-ROM frpw
|
||||
Hewlett-Packard 5GB Tape epat
|
||||
Hewlett-Packard 7200e (CD) epat
|
||||
Hewlett-Packard 7200e (CD-R) epat
|
||||
================ ============ ========
|
||||
|
||||
2.1 Configuring built-in drivers
|
||||
---------------------------------
|
||||
All parports and all protocol drivers are probed automatically unless probe=0
|
||||
parameter is used. So just "modprobe epat" is enough for a Imation SuperDisk
|
||||
drive to work.
|
||||
|
||||
We recommend that you get to know how the drivers work and how to
|
||||
configure them as loadable modules, before attempting to compile a
|
||||
kernel with the drivers built-in.
|
||||
Manual device creation::
|
||||
|
||||
If you built all of your PARIDE support directly into your kernel,
|
||||
and you have just a single parallel port IDE device, your kernel should
|
||||
locate it automatically for you. If you have more than one device,
|
||||
you may need to give some command line options to your bootloader
|
||||
(eg: LILO), how to do that is beyond the scope of this document.
|
||||
# echo "port protocol mode unit delay" >/sys/bus/pata_parport/new_device
|
||||
|
||||
The high-level drivers accept a number of command line parameters, all
|
||||
of which are documented in the source files in linux/drivers/block/paride.
|
||||
By default, each driver will automatically try all parallel ports it
|
||||
can find, and all protocol types that have been installed, until it finds
|
||||
a parallel port IDE adapter. Once it finds one, the probe stops. So,
|
||||
if you have more than one device, you will need to tell the drivers
|
||||
how to identify them. This requires specifying the port address, the
|
||||
protocol identification number and, for some devices, the drive's
|
||||
chain ID. While your system is booting, a number of messages are
|
||||
displayed on the console. Like all such messages, they can be
|
||||
reviewed with the 'dmesg' command. Among those messages will be
|
||||
some lines like::
|
||||
where:
|
||||
|
||||
paride: bpck registered as protocol 0
|
||||
paride: epat registered as protocol 1
|
||||
|
||||
The numbers will always be the same until you build a new kernel with
|
||||
different protocol selections. You should note these numbers as you
|
||||
will need them to identify the devices.
|
||||
======== ================================================
|
||||
port parport name (or "auto" for all parports)
|
||||
protocol protocol name (or "auto" for all protocols)
|
||||
mode mode number (protocol-specific) or -1 for probe
|
||||
unit unit number (for backpack only, see below)
|
||||
delay I/O delay (see troubleshooting section below)
|
||||
======== ================================================
|
||||
|
||||
If you happen to be using a MicroSolutions backpack device, you will
|
||||
also need to know the unit ID number for each drive. This is usually
|
||||
the last two digits of the drive's serial number (but read MicroSolutions'
|
||||
documentation about this).
|
||||
|
||||
As an example, let's assume that you have a MicroSolutions PD/CD drive
|
||||
with unit ID number 36 connected to the parallel port at 0x378, a SyQuest
|
||||
EZ-135 connected to the chained port on the PD/CD drive and also an
|
||||
Imation Superdisk connected to port 0x278. You could give the following
|
||||
options on your boot command::
|
||||
If you omit the parameters from the end, defaults will be used, e.g.:
|
||||
|
||||
pd.drive0=0x378,1 pf.drive0=0x278,1 pf.drive1=0x378,0,36
|
||||
Probe all parports with all protocols::
|
||||
|
||||
In the last option, pf.drive1 configures device /dev/pf1, the 0x378
|
||||
is the parallel port base address, the 0 is the protocol registration
|
||||
number and 36 is the chain ID.
|
||||
# echo auto >/sys/bus/pata_parport/new_device
|
||||
|
||||
Please note: while PARIDE will work both with and without the
|
||||
PARPORT parallel port sharing system that is included by the
|
||||
"Parallel port support" option, PARPORT must be included and enabled
|
||||
if you want to use chains of devices on the same parallel port.
|
||||
Probe parport0 using protocol epat and mode 4 (EPP-16)::
|
||||
|
||||
2.2 Loading and configuring PARIDE as modules
|
||||
----------------------------------------------
|
||||
# echo "parport0 epat 4" >/sys/bus/pata_parport/new_device
|
||||
|
||||
It is much faster and simpler to get to understand the PARIDE drivers
|
||||
if you use them as loadable kernel modules.
|
||||
Probe parport0 using all protocols::
|
||||
|
||||
Note 1:
|
||||
using these drivers with the "kerneld" automatic module loading
|
||||
system is not recommended for beginners, and is not documented here.
|
||||
# echo "parport0 auto" >/sys/bus/pata_parport/new_device
|
||||
|
||||
Note 2:
|
||||
if you build PARPORT support as a loadable module, PARIDE must
|
||||
also be built as loadable modules, and PARPORT must be loaded before
|
||||
the PARIDE modules.
|
||||
Probe all parports using protoocol epat::
|
||||
|
||||
To use PARIDE, you must begin by::
|
||||
# echo "auto epat" >/sys/bus/pata_parport/new_device
|
||||
|
||||
insmod paride
|
||||
Deleting devices::
|
||||
|
||||
this loads a base module which provides a registry for the protocols,
|
||||
among other tasks.
|
||||
|
||||
Then, load as many of the protocol modules as you think you might need.
|
||||
As you load each module, it will register the protocols that it supports,
|
||||
and print a log message to your kernel log file and your console. For
|
||||
example::
|
||||
|
||||
# insmod epat
|
||||
paride: epat registered as protocol 0
|
||||
# insmod kbic
|
||||
paride: k951 registered as protocol 1
|
||||
paride: k971 registered as protocol 2
|
||||
|
||||
Finally, you can load high-level drivers for each kind of device that
|
||||
you have connected. By default, each driver will autoprobe for a single
|
||||
device, but you can support up to four similar devices by giving their
|
||||
individual coordinates when you load the driver.
|
||||
|
||||
For example, if you had two no-name CD-ROM drives both using the
|
||||
KingByte KBIC-951A adapter, one on port 0x378 and the other on 0x3bc
|
||||
you could give the following command::
|
||||
|
||||
# insmod pcd drive0=0x378,1 drive1=0x3bc,1
|
||||
|
||||
For most adapters, giving a port address and protocol number is sufficient,
|
||||
but check the source files in linux/drivers/block/paride for more
|
||||
information. (Hopefully someone will write some man pages one day !).
|
||||
|
||||
As another example, here's what happens when PARPORT is installed, and
|
||||
a SyQuest EZ-135 is attached to port 0x378::
|
||||
|
||||
# insmod paride
|
||||
paride: version 1.0 installed
|
||||
# insmod epat
|
||||
paride: epat registered as protocol 0
|
||||
# insmod pd
|
||||
pd: pd version 1.0, major 45, cluster 64, nice 0
|
||||
pda: Sharing parport1 at 0x378
|
||||
pda: epat 1.0, Shuttle EPAT chip c3 at 0x378, mode 5 (EPP-32), delay 1
|
||||
pda: SyQuest EZ135A, 262144 blocks [128M], (512/16/32), removable media
|
||||
pda: pda1
|
||||
|
||||
Note that the last line is the output from the generic partition table
|
||||
scanner - in this case it reports that it has found a disk with one partition.
|
||||
|
||||
2.3 Using a PARIDE device
|
||||
--------------------------
|
||||
|
||||
Once the drivers have been loaded, you can access PARIDE devices in the
|
||||
same way as their traditional counterparts. You will probably need to
|
||||
create the device "special files". Here is a simple script that you can
|
||||
cut to a file and execute::
|
||||
|
||||
#!/bin/bash
|
||||
#
|
||||
# mkd -- a script to create the device special files for the PARIDE subsystem
|
||||
#
|
||||
function mkdev {
|
||||
mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1
|
||||
}
|
||||
#
|
||||
function pd {
|
||||
D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) )
|
||||
mkdev pd$D b 45 $[ $1 * 16 ]
|
||||
for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
|
||||
do mkdev pd$D$P b 45 $[ $1 * 16 + $P ]
|
||||
done
|
||||
}
|
||||
#
|
||||
cd /dev
|
||||
#
|
||||
for u in 0 1 2 3 ; do pd $u ; done
|
||||
for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done
|
||||
for u in 0 1 2 3 ; do mkdev pf$u b 47 $u ; done
|
||||
for u in 0 1 2 3 ; do mkdev pt$u c 96 $u ; done
|
||||
for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done
|
||||
for u in 0 1 2 3 ; do mkdev pg$u c 97 $u ; done
|
||||
#
|
||||
# end of mkd
|
||||
|
||||
With the device files and drivers in place, you can access PARIDE devices
|
||||
like any other Linux device. For example, to mount a CD-ROM in pcd0, use::
|
||||
|
||||
mount /dev/pcd0 /cdrom
|
||||
|
||||
If you have a fresh Avatar Shark cartridge, and the drive is pda, you
|
||||
might do something like::
|
||||
|
||||
fdisk /dev/pda -- make a new partition table with
|
||||
partition 1 of type 83
|
||||
|
||||
mke2fs /dev/pda1 -- to build the file system
|
||||
|
||||
mkdir /shark -- make a place to mount the disk
|
||||
|
||||
mount /dev/pda1 /shark
|
||||
|
||||
Devices like the Imation superdisk work in the same way, except that
|
||||
they do not have a partition table. For example to make a 120MB
|
||||
floppy that you could share with a DOS system::
|
||||
|
||||
mkdosfs /dev/pf0
|
||||
mount /dev/pf0 /mnt
|
||||
|
||||
|
||||
2.4 The pf driver
|
||||
------------------
|
||||
|
||||
The pf driver is intended for use with parallel port ATAPI disk
|
||||
devices. The most common devices in this category are PD drives
|
||||
and LS-120 drives. Traditionally, media for these devices are not
|
||||
partitioned. Consequently, the pf driver does not support partitioned
|
||||
media. This may be changed in a future version of the driver.
|
||||
|
||||
2.5 Using the pt driver
|
||||
------------------------
|
||||
|
||||
The pt driver for parallel port ATAPI tape drives is a minimal driver.
|
||||
It does not yet support many of the standard tape ioctl operations.
|
||||
For best performance, a block size of 32KB should be used. You will
|
||||
probably want to set the parallel port delay to 0, if you can.
|
||||
|
||||
2.6 Using the pg driver
|
||||
------------------------
|
||||
|
||||
The pg driver can be used in conjunction with the cdrecord program
|
||||
to create CD-ROMs. Please get cdrecord version 1.6.1 or later
|
||||
from ftp://ftp.fokus.gmd.de/pub/unix/cdrecord/ . To record CD-R media
|
||||
your parallel port should ideally be set to EPP mode, and the "port delay"
|
||||
should be set to 0. With those settings it is possible to record at 2x
|
||||
speed without any buffer underruns. If you cannot get the driver to work
|
||||
in EPP mode, try to use "bidirectional" or "PS/2" mode and 1x speeds only.
|
||||
# echo pata_parport.0 >/sys/bus/pata_parport/delete_device
|
||||
|
||||
|
||||
3. Troubleshooting
|
||||
@@ -344,9 +169,9 @@ in EPP mode, try to use "bidirectional" or "PS/2" mode and 1x speeds only.
|
||||
3.1 Use EPP mode if you can
|
||||
----------------------------
|
||||
|
||||
The most common problems that people report with the PARIDE drivers
|
||||
The most common problems that people report with the pata_parport drivers
|
||||
concern the parallel port CMOS settings. At this time, none of the
|
||||
PARIDE protocol modules support ECP mode, or any ECP combination modes.
|
||||
protocol modules support ECP mode, or any ECP combination modes.
|
||||
If you are able to do so, please set your parallel port into EPP mode
|
||||
using your CMOS setup procedure.
|
||||
|
||||
@@ -354,17 +179,14 @@ using your CMOS setup procedure.
|
||||
-------------------------
|
||||
|
||||
Some parallel ports cannot reliably transfer data at full speed. To
|
||||
offset the errors, the PARIDE protocol modules introduce a "port
|
||||
offset the errors, the protocol modules introduce a "port
|
||||
delay" between each access to the i/o ports. Each protocol sets
|
||||
a default value for this delay. In most cases, the user can override
|
||||
the default and set it to 0 - resulting in somewhat higher transfer
|
||||
rates. In some rare cases (especially with older 486 systems) the
|
||||
default delays are not long enough. if you experience corrupt data
|
||||
transfers, or unexpected failures, you may wish to increase the
|
||||
port delay. The delay can be programmed using the "driveN" parameters
|
||||
to each of the high-level drivers. Please see the notes above, or
|
||||
read the comments at the beginning of the driver source files in
|
||||
linux/drivers/block/paride.
|
||||
port delay.
|
||||
|
||||
3.3 Some drives need a printer reset
|
||||
-------------------------------------
|
||||
@@ -374,66 +196,12 @@ that do not always power up correctly. We have noticed this with some
|
||||
drives based on OnSpec and older Freecom adapters. In these rare cases,
|
||||
the adapter can often be reinitialised by issuing a "printer reset" on
|
||||
the parallel port. As the reset operation is potentially disruptive in
|
||||
multiple device environments, the PARIDE drivers will not do it
|
||||
multiple device environments, the pata_parport drivers will not do it
|
||||
automatically. You can however, force a printer reset by doing::
|
||||
|
||||
insmod lp reset=1
|
||||
rmmod lp
|
||||
|
||||
If you have one of these marginal cases, you should probably build
|
||||
your paride drivers as modules, and arrange to do the printer reset
|
||||
before loading the PARIDE drivers.
|
||||
|
||||
3.4 Use the verbose option and dmesg if you need help
|
||||
------------------------------------------------------
|
||||
|
||||
While a lot of testing has gone into these drivers to make them work
|
||||
as smoothly as possible, problems will arise. If you do have problems,
|
||||
please check all the obvious things first: does the drive work in
|
||||
DOS with the manufacturer's drivers ? If that doesn't yield any useful
|
||||
clues, then please make sure that only one drive is hooked to your system,
|
||||
and that either (a) PARPORT is enabled or (b) no other device driver
|
||||
is using your parallel port (check in /proc/ioports). Then, load the
|
||||
appropriate drivers (you can load several protocol modules if you want)
|
||||
as in::
|
||||
|
||||
# insmod paride
|
||||
# insmod epat
|
||||
# insmod bpck
|
||||
# insmod kbic
|
||||
...
|
||||
# insmod pd verbose=1
|
||||
|
||||
(using the correct driver for the type of device you have, of course).
|
||||
The verbose=1 parameter will cause the drivers to log a trace of their
|
||||
activity as they attempt to locate your drive.
|
||||
|
||||
Use 'dmesg' to capture a log of all the PARIDE messages (any messages
|
||||
beginning with paride:, a protocol module's name or a driver's name) and
|
||||
include that with your bug report. You can submit a bug report in one
|
||||
of two ways. Either send it directly to the author of the PARIDE suite,
|
||||
by e-mail to grant@torque.net, or join the linux-parport mailing list
|
||||
and post your report there.
|
||||
|
||||
3.5 For more information or help
|
||||
---------------------------------
|
||||
|
||||
You can join the linux-parport mailing list by sending a mail message
|
||||
to:
|
||||
|
||||
linux-parport-request@torque.net
|
||||
|
||||
with the single word::
|
||||
|
||||
subscribe
|
||||
|
||||
in the body of the mail message (not in the subject line). Please be
|
||||
sure that your mail program is correctly set up when you do this, as
|
||||
the list manager is a robot that will subscribe you using the reply
|
||||
address in your mail headers. REMOVE any anti-spam gimmicks you may
|
||||
have in your mail headers, when sending mail to the list server.
|
||||
|
||||
You might also find some useful information on the linux-parport
|
||||
web pages (although they are not always up to date) at
|
||||
|
||||
http://web.archive.org/web/%2E/http://www.torque.net/parport/
|
||||
your pata_parport drivers as modules, and arrange to do the printer reset
|
||||
before loading the pata_parport drivers.
|
||||
|
@@ -201,6 +201,8 @@ To remove the config from the image, you can use -d option as below::
|
||||
|
||||
Then add "bootconfig" on the normal kernel command line to tell the
|
||||
kernel to look for the bootconfig at the end of the initrd file.
|
||||
Alternatively, build your kernel with the ``CONFIG_BOOT_CONFIG_FORCE``
|
||||
Kconfig option selected.
|
||||
|
||||
Embedding a Boot Config into Kernel
|
||||
-----------------------------------
|
||||
@@ -217,7 +219,9 @@ path to the bootconfig file from source tree or object tree.
|
||||
The kernel will embed it as the default bootconfig.
|
||||
|
||||
Just as when attaching the bootconfig to the initrd, you need ``bootconfig``
|
||||
option on the kernel command line to enable the embedded bootconfig.
|
||||
option on the kernel command line to enable the embedded bootconfig, or,
|
||||
alternatively, build your kernel with the ``CONFIG_BOOT_CONFIG_FORCE``
|
||||
Kconfig option selected.
|
||||
|
||||
Note that even if you set this option, you can override the embedded
|
||||
bootconfig by another bootconfig which attached to the initrd.
|
||||
|
@@ -106,7 +106,7 @@ Proportional weight policy files
|
||||
see Documentation/block/bfq-iosched.rst.
|
||||
|
||||
blkio.bfq.weight_device
|
||||
Specifes per cgroup per device weights, overriding the default group
|
||||
Specifies per cgroup per device weights, overriding the default group
|
||||
weight. For more details, see Documentation/block/bfq-iosched.rst.
|
||||
|
||||
Following is the format::
|
||||
|
@@ -80,6 +80,8 @@ access. For example, cpusets (see Documentation/admin-guide/cgroup-v1/cpusets.rs
|
||||
you to associate a set of CPUs and a set of memory nodes with the
|
||||
tasks in each cgroup.
|
||||
|
||||
.. _cgroups-why-needed:
|
||||
|
||||
1.2 Why are cgroups needed ?
|
||||
----------------------------
|
||||
|
||||
|
@@ -2,18 +2,18 @@
|
||||
Memory Resource Controller
|
||||
==========================
|
||||
|
||||
NOTE:
|
||||
.. caution::
|
||||
This document is hopelessly outdated and it asks for a complete
|
||||
rewrite. It still contains a useful information so we are keeping it
|
||||
here but make sure to check the current code if you need a deeper
|
||||
understanding.
|
||||
|
||||
NOTE:
|
||||
.. note::
|
||||
The Memory Resource Controller has generically been referred to as the
|
||||
memory controller in this document. Do not confuse memory controller
|
||||
used here with the memory controller that is used in hardware.
|
||||
|
||||
(For editors) In this document:
|
||||
.. hint::
|
||||
When we mention a cgroup (cgroupfs's directory) with memory controller,
|
||||
we call it "memory cgroup". When you see git-log and source code, you'll
|
||||
see patch's title and function names tend to use "memcg".
|
||||
@@ -23,7 +23,7 @@ Benefits and Purpose of the memory controller
|
||||
=============================================
|
||||
|
||||
The memory controller isolates the memory behaviour of a group of tasks
|
||||
from the rest of the system. The article on LWN [12] mentions some probable
|
||||
from the rest of the system. The article on LWN [12]_ mentions some probable
|
||||
uses of the memory controller. The memory controller can be used to
|
||||
|
||||
a. Isolate an application or a group of applications
|
||||
@@ -55,7 +55,8 @@ Features:
|
||||
- Root cgroup has no limit controls.
|
||||
|
||||
Kernel memory support is a work in progress, and the current version provides
|
||||
basically functionality. (See Section 2.7)
|
||||
basically functionality. (See :ref:`section 2.7
|
||||
<cgroup-v1-memory-kernel-extension>`)
|
||||
|
||||
Brief summary of control files.
|
||||
|
||||
@@ -86,6 +87,8 @@ Brief summary of control files.
|
||||
memory.swappiness set/show swappiness parameter of vmscan
|
||||
(See sysctl's vm.swappiness)
|
||||
memory.move_charge_at_immigrate set/show controls of moving charges
|
||||
This knob is deprecated and shouldn't be
|
||||
used.
|
||||
memory.oom_control set/show oom controls.
|
||||
memory.numa_stat show the number of memory usage per numa
|
||||
node
|
||||
@@ -107,16 +110,16 @@ Brief summary of control files.
|
||||
==========
|
||||
|
||||
The memory controller has a long history. A request for comments for the memory
|
||||
controller was posted by Balbir Singh [1]. At the time the RFC was posted
|
||||
controller was posted by Balbir Singh [1]_. At the time the RFC was posted
|
||||
there were several implementations for memory control. The goal of the
|
||||
RFC was to build consensus and agreement for the minimal features required
|
||||
for memory control. The first RSS controller was posted by Balbir Singh[2]
|
||||
in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the
|
||||
RSS controller. At OLS, at the resource management BoF, everyone suggested
|
||||
that we handle both page cache and RSS together. Another request was raised
|
||||
to allow user space handling of OOM. The current memory controller is
|
||||
for memory control. The first RSS controller was posted by Balbir Singh [2]_
|
||||
in Feb 2007. Pavel Emelianov [3]_ [4]_ [5]_ has since posted three versions
|
||||
of the RSS controller. At OLS, at the resource management BoF, everyone
|
||||
suggested that we handle both page cache and RSS together. Another request was
|
||||
raised to allow user space handling of OOM. The current memory controller is
|
||||
at version 6; it combines both mapped (RSS) and unmapped Page
|
||||
Cache Control [11].
|
||||
Cache Control [11]_.
|
||||
|
||||
2. Memory Control
|
||||
=================
|
||||
@@ -147,7 +150,8 @@ specific data structure (mem_cgroup) associated with it.
|
||||
2.2. Accounting
|
||||
---------------
|
||||
|
||||
::
|
||||
.. code-block::
|
||||
:caption: Figure 1: Hierarchy of Accounting
|
||||
|
||||
+--------------------+
|
||||
| mem_cgroup |
|
||||
@@ -167,7 +171,6 @@ specific data structure (mem_cgroup) associated with it.
|
||||
| | | |
|
||||
+---------------+ +---------------+
|
||||
|
||||
(Figure 1: Hierarchy of Accounting)
|
||||
|
||||
|
||||
Figure 1 shows the important aspects of the controller
|
||||
@@ -221,8 +224,9 @@ behind this approach is that a cgroup that aggressively uses a shared
|
||||
page will eventually get charged for it (once it is uncharged from
|
||||
the cgroup that brought it in -- this will happen on memory pressure).
|
||||
|
||||
But see section 8.2: when moving a task to another cgroup, its pages may
|
||||
be recharged to the new cgroup, if move_charge_at_immigrate has been chosen.
|
||||
But see :ref:`section 8.2 <cgroup-v1-memory-movable-charges>` when moving a
|
||||
task to another cgroup, its pages may be recharged to the new cgroup, if
|
||||
move_charge_at_immigrate has been chosen.
|
||||
|
||||
2.4 Swap Extension
|
||||
--------------------------------------
|
||||
@@ -244,7 +248,8 @@ In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap.
|
||||
By using the memsw limit, you can avoid system OOM which can be caused by swap
|
||||
shortage.
|
||||
|
||||
**why 'memory+swap' rather than swap**
|
||||
2.4.1 why 'memory+swap' rather than swap
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
|
||||
to move account from memory to swap...there is no change in usage of
|
||||
@@ -252,7 +257,8 @@ memory+swap. In other words, when we want to limit the usage of swap without
|
||||
affecting global LRU, memory+swap limit is better than just limiting swap from
|
||||
an OS point of view.
|
||||
|
||||
**What happens when a cgroup hits memory.memsw.limit_in_bytes**
|
||||
2.4.2. What happens when a cgroup hits memory.memsw.limit_in_bytes
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out
|
||||
in this cgroup. Then, swap-out will not be done by cgroup routine and file
|
||||
@@ -268,26 +274,26 @@ global VM. When a cgroup goes over its limit, we first try
|
||||
to reclaim memory from the cgroup so as to make space for the new
|
||||
pages that the cgroup has touched. If the reclaim is unsuccessful,
|
||||
an OOM routine is invoked to select and kill the bulkiest task in the
|
||||
cgroup. (See 10. OOM Control below.)
|
||||
cgroup. (See :ref:`10. OOM Control <cgroup-v1-memory-oom-control>` below.)
|
||||
|
||||
The reclaim algorithm has not been modified for cgroups, except that
|
||||
pages that are selected for reclaiming come from the per-cgroup LRU
|
||||
list.
|
||||
|
||||
NOTE:
|
||||
Reclaim does not work for the root cgroup, since we cannot set any
|
||||
limits on the root cgroup.
|
||||
.. note::
|
||||
Reclaim does not work for the root cgroup, since we cannot set any
|
||||
limits on the root cgroup.
|
||||
|
||||
Note2:
|
||||
When panic_on_oom is set to "2", the whole system will panic.
|
||||
.. note::
|
||||
When panic_on_oom is set to "2", the whole system will panic.
|
||||
|
||||
When oom event notifier is registered, event will be delivered.
|
||||
(See oom_control section)
|
||||
(See :ref:`oom_control <cgroup-v1-memory-oom-control>` section)
|
||||
|
||||
2.6 Locking
|
||||
-----------
|
||||
|
||||
Lock order is as follows:
|
||||
Lock order is as follows::
|
||||
|
||||
Page lock (PG_locked bit of page->flags)
|
||||
mm->page_table_lock or split pte_lock
|
||||
@@ -299,6 +305,8 @@ Per-node-per-memcgroup LRU (cgroup's private LRU) is guarded by
|
||||
lruvec->lru_lock; PG_lru bit of page->flags is cleared before
|
||||
isolating a page from its LRU under lruvec->lru_lock.
|
||||
|
||||
.. _cgroup-v1-memory-kernel-extension:
|
||||
|
||||
2.7 Kernel Memory Extension
|
||||
-----------------------------------------------
|
||||
|
||||
@@ -367,10 +375,10 @@ U != 0, K < U:
|
||||
never greater than the total memory, and freely set U at the cost of his
|
||||
QoS.
|
||||
|
||||
WARNING:
|
||||
In the current implementation, memory reclaim will NOT be
|
||||
triggered for a cgroup when it hits K while staying below U, which makes
|
||||
this setup impractical.
|
||||
.. warning::
|
||||
In the current implementation, memory reclaim will NOT be triggered for
|
||||
a cgroup when it hits K while staying below U, which makes this setup
|
||||
impractical.
|
||||
|
||||
U != 0, K >= U:
|
||||
Since kmem charges will also be fed to the user counter and reclaim will be
|
||||
@@ -381,45 +389,41 @@ U != 0, K >= U:
|
||||
3. User Interface
|
||||
=================
|
||||
|
||||
3.0. Configuration
|
||||
------------------
|
||||
To use the user interface:
|
||||
|
||||
a. Enable CONFIG_CGROUPS
|
||||
b. Enable CONFIG_MEMCG
|
||||
|
||||
3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
|
||||
-------------------------------------------------------------------
|
||||
|
||||
::
|
||||
1. Enable CONFIG_CGROUPS and CONFIG_MEMCG options
|
||||
2. Prepare the cgroups (see :ref:`Why are cgroups needed?
|
||||
<cgroups-why-needed>` for the background information)::
|
||||
|
||||
# mount -t tmpfs none /sys/fs/cgroup
|
||||
# mkdir /sys/fs/cgroup/memory
|
||||
# mount -t cgroup none /sys/fs/cgroup/memory -o memory
|
||||
|
||||
3.2. Make the new group and move bash into it::
|
||||
3. Make the new group and move bash into it::
|
||||
|
||||
# mkdir /sys/fs/cgroup/memory/0
|
||||
# echo $$ > /sys/fs/cgroup/memory/0/tasks
|
||||
|
||||
Since now we're in the 0 cgroup, we can alter the memory limit::
|
||||
4. Since now we're in the 0 cgroup, we can alter the memory limit::
|
||||
|
||||
# echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes
|
||||
|
||||
NOTE:
|
||||
We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
|
||||
mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes,
|
||||
Gibibytes.)
|
||||
The limit can now be queried::
|
||||
|
||||
NOTE:
|
||||
We can write "-1" to reset the ``*.limit_in_bytes(unlimited)``.
|
||||
# cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes
|
||||
4194304
|
||||
|
||||
NOTE:
|
||||
We cannot set limits on the root cgroup any more.
|
||||
.. note::
|
||||
We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
|
||||
mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes,
|
||||
Gibibytes.)
|
||||
|
||||
::
|
||||
.. note::
|
||||
We can write "-1" to reset the ``*.limit_in_bytes(unlimited)``.
|
||||
|
||||
.. note::
|
||||
We cannot set limits on the root cgroup any more.
|
||||
|
||||
# cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes
|
||||
4194304
|
||||
|
||||
We can check the usage::
|
||||
|
||||
@@ -458,6 +462,8 @@ test because it has noise of shared objects/status.
|
||||
But the above two are testing extreme situations.
|
||||
Trying usual test under memory controller is always helpful.
|
||||
|
||||
.. _cgroup-v1-memory-test-troubleshoot:
|
||||
|
||||
4.1 Troubleshooting
|
||||
-------------------
|
||||
|
||||
@@ -470,8 +476,11 @@ terminated by the OOM killer. There are several causes for this:
|
||||
A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of
|
||||
some of the pages cached in the cgroup (page cache pages).
|
||||
|
||||
To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and
|
||||
seeing what happens will be helpful.
|
||||
To know what happens, disabling OOM_Kill as per :ref:`"10. OOM Control"
|
||||
<cgroup-v1-memory-oom-control>` (below) and seeing what happens will be
|
||||
helpful.
|
||||
|
||||
.. _cgroup-v1-memory-test-task-migration:
|
||||
|
||||
4.2 Task migration
|
||||
------------------
|
||||
@@ -482,15 +491,16 @@ remain charged to it, the charge is dropped when the page is freed or
|
||||
reclaimed.
|
||||
|
||||
You can move charges of a task along with task migration.
|
||||
See 8. "Move charges at task migration"
|
||||
See :ref:`8. "Move charges at task migration" <cgroup-v1-memory-move-charges>`
|
||||
|
||||
4.3 Removing a cgroup
|
||||
---------------------
|
||||
|
||||
A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
|
||||
cgroup might have some charge associated with it, even though all
|
||||
tasks have migrated away from it. (because we charge against pages, not
|
||||
against tasks.)
|
||||
A cgroup can be removed by rmdir, but as discussed in :ref:`sections 4.1
|
||||
<cgroup-v1-memory-test-troubleshoot>` and :ref:`4.2
|
||||
<cgroup-v1-memory-test-task-migration>`, a cgroup might have some charge
|
||||
associated with it, even though all tasks have migrated away from it. (because
|
||||
we charge against pages, not against tasks.)
|
||||
|
||||
We move the stats to parent, and no change on the charge except uncharging
|
||||
from the child.
|
||||
@@ -519,67 +529,66 @@ will be charged as a new owner of it.
|
||||
5.2 stat file
|
||||
-------------
|
||||
|
||||
memory.stat file includes following statistics
|
||||
memory.stat file includes following statistics:
|
||||
|
||||
per-memory cgroup local status
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
* per-memory cgroup local status
|
||||
|
||||
=============== ===============================================================
|
||||
cache # of bytes of page cache memory.
|
||||
rss # of bytes of anonymous and swap cache memory (includes
|
||||
transparent hugepages).
|
||||
rss_huge # of bytes of anonymous transparent hugepages.
|
||||
mapped_file # of bytes of mapped file (includes tmpfs/shmem)
|
||||
pgpgin # of charging events to the memory cgroup. The charging
|
||||
event happens each time a page is accounted as either mapped
|
||||
anon page(RSS) or cache page(Page Cache) to the cgroup.
|
||||
pgpgout # of uncharging events to the memory cgroup. The uncharging
|
||||
event happens each time a page is unaccounted from the cgroup.
|
||||
swap # of bytes of swap usage
|
||||
dirty # of bytes that are waiting to get written back to the disk.
|
||||
writeback # of bytes of file/anon cache that are queued for syncing to
|
||||
disk.
|
||||
inactive_anon # of bytes of anonymous and swap cache memory on inactive
|
||||
LRU list.
|
||||
active_anon # of bytes of anonymous and swap cache memory on active
|
||||
LRU list.
|
||||
inactive_file # of bytes of file-backed memory and MADV_FREE anonymous memory(
|
||||
LazyFree pages) on inactive LRU list.
|
||||
active_file # of bytes of file-backed memory on active LRU list.
|
||||
unevictable # of bytes of memory that cannot be reclaimed (mlocked etc).
|
||||
=============== ===============================================================
|
||||
=============== ===============================================================
|
||||
cache # of bytes of page cache memory.
|
||||
rss # of bytes of anonymous and swap cache memory (includes
|
||||
transparent hugepages).
|
||||
rss_huge # of bytes of anonymous transparent hugepages.
|
||||
mapped_file # of bytes of mapped file (includes tmpfs/shmem)
|
||||
pgpgin # of charging events to the memory cgroup. The charging
|
||||
event happens each time a page is accounted as either mapped
|
||||
anon page(RSS) or cache page(Page Cache) to the cgroup.
|
||||
pgpgout # of uncharging events to the memory cgroup. The uncharging
|
||||
event happens each time a page is unaccounted from the
|
||||
cgroup.
|
||||
swap # of bytes of swap usage
|
||||
dirty # of bytes that are waiting to get written back to the disk.
|
||||
writeback # of bytes of file/anon cache that are queued for syncing to
|
||||
disk.
|
||||
inactive_anon # of bytes of anonymous and swap cache memory on inactive
|
||||
LRU list.
|
||||
active_anon # of bytes of anonymous and swap cache memory on active
|
||||
LRU list.
|
||||
inactive_file # of bytes of file-backed memory and MADV_FREE anonymous
|
||||
memory (LazyFree pages) on inactive LRU list.
|
||||
active_file # of bytes of file-backed memory on active LRU list.
|
||||
unevictable # of bytes of memory that cannot be reclaimed (mlocked etc).
|
||||
=============== ===============================================================
|
||||
|
||||
status considering hierarchy (see memory.use_hierarchy settings)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
* status considering hierarchy (see memory.use_hierarchy settings):
|
||||
|
||||
========================= ===================================================
|
||||
hierarchical_memory_limit # of bytes of memory limit with regard to hierarchy
|
||||
under which the memory cgroup is
|
||||
hierarchical_memsw_limit # of bytes of memory+swap limit with regard to
|
||||
hierarchy under which memory cgroup is.
|
||||
========================= ===================================================
|
||||
hierarchical_memory_limit # of bytes of memory limit with regard to
|
||||
hierarchy
|
||||
under which the memory cgroup is
|
||||
hierarchical_memsw_limit # of bytes of memory+swap limit with regard to
|
||||
hierarchy under which memory cgroup is.
|
||||
|
||||
total_<counter> # hierarchical version of <counter>, which in
|
||||
addition to the cgroup's own value includes the
|
||||
sum of all hierarchical children's values of
|
||||
<counter>, i.e. total_cache
|
||||
========================= ===================================================
|
||||
total_<counter> # hierarchical version of <counter>, which in
|
||||
addition to the cgroup's own value includes the
|
||||
sum of all hierarchical children's values of
|
||||
<counter>, i.e. total_cache
|
||||
========================= ===================================================
|
||||
|
||||
The following additional stats are dependent on CONFIG_DEBUG_VM
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
* additional vm parameters (depends on CONFIG_DEBUG_VM):
|
||||
|
||||
========================= ========================================
|
||||
recent_rotated_anon VM internal parameter. (see mm/vmscan.c)
|
||||
recent_rotated_file VM internal parameter. (see mm/vmscan.c)
|
||||
recent_scanned_anon VM internal parameter. (see mm/vmscan.c)
|
||||
recent_scanned_file VM internal parameter. (see mm/vmscan.c)
|
||||
========================= ========================================
|
||||
========================= ========================================
|
||||
recent_rotated_anon VM internal parameter. (see mm/vmscan.c)
|
||||
recent_rotated_file VM internal parameter. (see mm/vmscan.c)
|
||||
recent_scanned_anon VM internal parameter. (see mm/vmscan.c)
|
||||
recent_scanned_file VM internal parameter. (see mm/vmscan.c)
|
||||
========================= ========================================
|
||||
|
||||
Memo:
|
||||
.. hint::
|
||||
recent_rotated means recent frequency of LRU rotation.
|
||||
recent_scanned means recent # of scans to LRU.
|
||||
showing for better debug please see the code for meanings.
|
||||
|
||||
Note:
|
||||
.. note::
|
||||
Only anonymous and swap cache memory is listed as part of 'rss' stat.
|
||||
This should not be confused with the true 'resident set size' or the
|
||||
amount of physical memory used by the cgroup.
|
||||
@@ -710,15 +719,25 @@ If we want to change this to 1G, we can at any time use::
|
||||
|
||||
# echo 1G > memory.soft_limit_in_bytes
|
||||
|
||||
NOTE1:
|
||||
.. note::
|
||||
Soft limits take effect over a long period of time, since they involve
|
||||
reclaiming memory for balancing between memory cgroups
|
||||
NOTE2:
|
||||
|
||||
.. note::
|
||||
It is recommended to set the soft limit always below the hard limit,
|
||||
otherwise the hard limit will take precedence.
|
||||
|
||||
8. Move charges at task migration
|
||||
=================================
|
||||
.. _cgroup-v1-memory-move-charges:
|
||||
|
||||
8. Move charges at task migration (DEPRECATED!)
|
||||
===============================================
|
||||
|
||||
THIS IS DEPRECATED!
|
||||
|
||||
It's expensive and unreliable! It's better practice to launch workload
|
||||
tasks directly from inside their target cgroup. Use dedicated workload
|
||||
cgroups to allow fine-grained policy adjustments without having to
|
||||
move physical pages between control domains.
|
||||
|
||||
Users can move charges associated with a task along with task migration, that
|
||||
is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
|
||||
@@ -735,23 +754,29 @@ If you want to enable it::
|
||||
|
||||
# echo (some positive value) > memory.move_charge_at_immigrate
|
||||
|
||||
Note:
|
||||
.. note::
|
||||
Each bits of move_charge_at_immigrate has its own meaning about what type
|
||||
of charges should be moved. See 8.2 for details.
|
||||
Note:
|
||||
of charges should be moved. See :ref:`section 8.2
|
||||
<cgroup-v1-memory-movable-charges>` for details.
|
||||
|
||||
.. note::
|
||||
Charges are moved only when you move mm->owner, in other words,
|
||||
a leader of a thread group.
|
||||
Note:
|
||||
|
||||
.. note::
|
||||
If we cannot find enough space for the task in the destination cgroup, we
|
||||
try to make space by reclaiming memory. Task migration may fail if we
|
||||
cannot make enough space.
|
||||
Note:
|
||||
|
||||
.. note::
|
||||
It can take several seconds if you move charges much.
|
||||
|
||||
And if you want disable it again::
|
||||
|
||||
# echo 0 > memory.move_charge_at_immigrate
|
||||
|
||||
.. _cgroup-v1-memory-movable-charges:
|
||||
|
||||
8.2 Type of charges which can be moved
|
||||
--------------------------------------
|
||||
|
||||
@@ -801,6 +826,8 @@ threshold in any direction.
|
||||
|
||||
It's applicable for root and non-root cgroup.
|
||||
|
||||
.. _cgroup-v1-memory-oom-control:
|
||||
|
||||
10. OOM Control
|
||||
===============
|
||||
|
||||
@@ -956,15 +983,16 @@ commented and discussed quite extensively in the community.
|
||||
References
|
||||
==========
|
||||
|
||||
1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/
|
||||
2. Singh, Balbir. Memory Controller (RSS Control),
|
||||
.. [1] Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/
|
||||
.. [2] Singh, Balbir. Memory Controller (RSS Control),
|
||||
http://lwn.net/Articles/222762/
|
||||
3. Emelianov, Pavel. Resource controllers based on process cgroups
|
||||
.. [3] Emelianov, Pavel. Resource controllers based on process cgroups
|
||||
https://lore.kernel.org/r/45ED7DEC.7010403@sw.ru
|
||||
4. Emelianov, Pavel. RSS controller based on process cgroups (v2)
|
||||
.. [4] Emelianov, Pavel. RSS controller based on process cgroups (v2)
|
||||
https://lore.kernel.org/r/461A3010.90403@sw.ru
|
||||
5. Emelianov, Pavel. RSS controller based on process cgroups (v3)
|
||||
.. [5] Emelianov, Pavel. RSS controller based on process cgroups (v3)
|
||||
https://lore.kernel.org/r/465D9739.8070209@openvz.org
|
||||
|
||||
6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/
|
||||
7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control
|
||||
subsystem (v3), http://lwn.net/Articles/235534/
|
||||
@@ -974,7 +1002,8 @@ References
|
||||
https://lore.kernel.org/r/464D267A.50107@linux.vnet.ibm.com
|
||||
10. Singh, Balbir. Memory controller v6 test results,
|
||||
https://lore.kernel.org/r/20070819094658.654.84837.sendpatchset@balbir-laptop
|
||||
11. Singh, Balbir. Memory controller introduction (v6),
|
||||
https://lore.kernel.org/r/20070817084228.26003.12568.sendpatchset@balbir-laptop
|
||||
12. Corbet, Jonathan, Controlling memory use in cgroups,
|
||||
http://lwn.net/Articles/243795/
|
||||
|
||||
.. [11] Singh, Balbir. Memory controller introduction (v6),
|
||||
https://lore.kernel.org/r/20070817084228.26003.12568.sendpatchset@balbir-laptop
|
||||
.. [12] Corbet, Jonathan, Controlling memory use in cgroups,
|
||||
http://lwn.net/Articles/243795/
|
||||
|
@@ -619,10 +619,12 @@ process migrations.
|
||||
and is an example of this type.
|
||||
|
||||
|
||||
.. _cgroupv2-limits-distributor:
|
||||
|
||||
Limits
|
||||
------
|
||||
|
||||
A child can only consume upto the configured amount of the resource.
|
||||
A child can only consume up to the configured amount of the resource.
|
||||
Limits can be over-committed - the sum of the limits of children can
|
||||
exceed the amount of resource available to the parent.
|
||||
|
||||
@@ -635,15 +637,16 @@ process migrations.
|
||||
"io.max" limits the maximum BPS and/or IOPS that a cgroup can consume
|
||||
on an IO device and is an example of this type.
|
||||
|
||||
.. _cgroupv2-protections-distributor:
|
||||
|
||||
Protections
|
||||
-----------
|
||||
|
||||
A cgroup is protected upto the configured amount of the resource
|
||||
A cgroup is protected up to the configured amount of the resource
|
||||
as long as the usages of all its ancestors are under their
|
||||
protected levels. Protections can be hard guarantees or best effort
|
||||
soft boundaries. Protections can also be over-committed in which case
|
||||
only upto the amount available to the parent is protected among
|
||||
only up to the amount available to the parent is protected among
|
||||
children.
|
||||
|
||||
Protections are in the range [0, max] and defaults to 0, which is
|
||||
@@ -1076,7 +1079,7 @@ All time durations are in microseconds.
|
||||
|
||||
$MAX $PERIOD
|
||||
|
||||
which indicates that the group may consume upto $MAX in each
|
||||
which indicates that the group may consume up to $MAX in each
|
||||
$PERIOD duration. "max" for $MAX indicates no limit. If only
|
||||
one number is written, $MAX is updated.
|
||||
|
||||
@@ -2286,7 +2289,7 @@ Cpuset Interface Files
|
||||
For a valid partition root with the sibling cpu exclusivity
|
||||
rule enabled, changes made to "cpuset.cpus" that violate the
|
||||
exclusivity rule will invalidate the partition as well as its
|
||||
sibiling partitions with conflicting cpuset.cpus values. So
|
||||
sibling partitions with conflicting cpuset.cpus values. So
|
||||
care must be taking in changing "cpuset.cpus".
|
||||
|
||||
A valid non-root parent partition may distribute out all its CPUs
|
||||
|
@@ -399,7 +399,7 @@ A partial list of the supported mount options follows:
|
||||
sep
|
||||
if first mount option (after the -o), overrides
|
||||
the comma as the separator between the mount
|
||||
parms. e.g.::
|
||||
parameters. e.g.::
|
||||
|
||||
-o user=myname,password=mypassword,domain=mydom
|
||||
|
||||
@@ -765,7 +765,7 @@ cifsFYI If set to non-zero value, additional debug information
|
||||
Some debugging statements are not compiled into the
|
||||
cifs kernel unless CONFIG_CIFS_DEBUG2 is enabled in the
|
||||
kernel configuration. cifsFYI may be set to one or
|
||||
nore of the following flags (7 sets them all)::
|
||||
more of the following flags (7 sets them all)::
|
||||
|
||||
+-----------------------------------------------+------+
|
||||
| log cifs informational messages | 0x01 |
|
||||
|
@@ -70,7 +70,7 @@ the entries (each hotspot block covers a larger area than a single
|
||||
cache block).
|
||||
|
||||
All this means smq uses ~25bytes per cache block. Still a lot of
|
||||
memory, but a substantial improvement nontheless.
|
||||
memory, but a substantial improvement nonetheless.
|
||||
|
||||
Level balancing
|
||||
^^^^^^^^^^^^^^^
|
||||
|
@@ -31,7 +31,7 @@ Mandatory parameters:
|
||||
|
||||
Optional parameter:
|
||||
|
||||
<underyling sectors>:
|
||||
<underlying sectors>:
|
||||
Number of sectors defining the logical block size of <dev path>.
|
||||
2^N supported, e.g. 8 = emulate 8 sectors of 512 bytes = 4KiB.
|
||||
If not provided, the logical block size of <dev path> will be used.
|
||||
|
@@ -46,7 +46,7 @@ just like conventional zones.
|
||||
The zones of the device(s) are separated into 2 types:
|
||||
|
||||
1) Metadata zones: these are conventional zones used to store metadata.
|
||||
Metadata zones are not reported as useable capacity to the user.
|
||||
Metadata zones are not reported as usable capacity to the user.
|
||||
|
||||
2) Data zones: all remaining zones, the vast majority of which will be
|
||||
sequential zones used exclusively to store user data. The conventional
|
||||
|
@@ -35,7 +35,7 @@ An example of undoing an existing dm-stripe
|
||||
|
||||
This small bash script will setup 4 loop devices and use the existing
|
||||
striped target to combine the 4 devices into one. It then will use
|
||||
the unstriped target ontop of the striped device to access the
|
||||
the unstriped target on top of the striped device to access the
|
||||
individual backing loop devices. We write data to the newly exposed
|
||||
unstriped devices and verify the data written matches the correct
|
||||
underlying device on the striped array::
|
||||
@@ -110,8 +110,8 @@ to get a 92% reduction in read latency using this device mapper target.
|
||||
Example dmsetup usage
|
||||
=====================
|
||||
|
||||
unstriped ontop of Intel NVMe device that has 2 cores
|
||||
-----------------------------------------------------
|
||||
unstriped on top of Intel NVMe device that has 2 cores
|
||||
------------------------------------------------------
|
||||
|
||||
::
|
||||
|
||||
@@ -124,8 +124,8 @@ respectively::
|
||||
/dev/mapper/nvmset0
|
||||
/dev/mapper/nvmset1
|
||||
|
||||
unstriped ontop of striped with 4 drives using 128K chunk size
|
||||
--------------------------------------------------------------
|
||||
unstriped on top of striped with 4 drives using 128K chunk size
|
||||
---------------------------------------------------------------
|
||||
|
||||
::
|
||||
|
||||
|
@@ -330,7 +330,7 @@ Examples
|
||||
|
||||
// boot-args example, with newlines and comments for readability
|
||||
Kernel command line: ...
|
||||
// see whats going on in dyndbg=value processing
|
||||
// see what's going on in dyndbg=value processing
|
||||
dynamic_debug.verbose=3
|
||||
// enable pr_debugs in the btrfs module (can be builtin or loadable)
|
||||
btrfs.dyndbg="+p"
|
||||
|
@@ -123,7 +123,7 @@ Each simulated GPIO chip creates a separate sysfs group under its device
|
||||
directory for each exposed line
|
||||
(e.g. ``/sys/devices/platform/gpio-sim.X/gpiochipY/``). The name of each group
|
||||
is of the form: ``'sim_gpioX'`` where X is the offset of the line. Inside each
|
||||
group there are two attibutes:
|
||||
group there are two attributes:
|
||||
|
||||
``pull`` - allows to read and set the current simulated pull setting for
|
||||
every line, when writing the value must be one of: ``'pull-up'``,
|
||||
|
91
Documentation/admin-guide/hw-vuln/cross-thread-rsb.rst
Normal file
91
Documentation/admin-guide/hw-vuln/cross-thread-rsb.rst
Normal file
@@ -0,0 +1,91 @@
|
||||
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Cross-Thread Return Address Predictions
|
||||
=======================================
|
||||
|
||||
Certain AMD and Hygon processors are subject to a cross-thread return address
|
||||
predictions vulnerability. When running in SMT mode and one sibling thread
|
||||
transitions out of C0 state, the other sibling thread could use return target
|
||||
predictions from the sibling thread that transitioned out of C0.
|
||||
|
||||
The Spectre v2 mitigations protect the Linux kernel, as it fills the return
|
||||
address prediction entries with safe targets when context switching to the idle
|
||||
thread. However, KVM does allow a VMM to prevent exiting guest mode when
|
||||
transitioning out of C0. This could result in a guest-controlled return target
|
||||
being consumed by the sibling thread.
|
||||
|
||||
Affected processors
|
||||
-------------------
|
||||
|
||||
The following CPUs are vulnerable:
|
||||
|
||||
- AMD Family 17h processors
|
||||
- Hygon Family 18h processors
|
||||
|
||||
Related CVEs
|
||||
------------
|
||||
|
||||
The following CVE entry is related to this issue:
|
||||
|
||||
============== =======================================
|
||||
CVE-2022-27672 Cross-Thread Return Address Predictions
|
||||
============== =======================================
|
||||
|
||||
Problem
|
||||
-------
|
||||
|
||||
Affected SMT-capable processors support 1T and 2T modes of execution when SMT
|
||||
is enabled. In 2T mode, both threads in a core are executing code. For the
|
||||
processor core to enter 1T mode, it is required that one of the threads
|
||||
requests to transition out of the C0 state. This can be communicated with the
|
||||
HLT instruction or with an MWAIT instruction that requests non-C0.
|
||||
When the thread re-enters the C0 state, the processor transitions back
|
||||
to 2T mode, assuming the other thread is also still in C0 state.
|
||||
|
||||
In affected processors, the return address predictor (RAP) is partitioned
|
||||
depending on the SMT mode. For instance, in 2T mode each thread uses a private
|
||||
16-entry RAP, but in 1T mode, the active thread uses a 32-entry RAP. Upon
|
||||
transition between 1T/2T mode, the RAP contents are not modified but the RAP
|
||||
pointers (which control the next return target to use for predictions) may
|
||||
change. This behavior may result in return targets from one SMT thread being
|
||||
used by RET predictions in the sibling thread following a 1T/2T switch. In
|
||||
particular, a RET instruction executed immediately after a transition to 1T may
|
||||
use a return target from the thread that just became idle. In theory, this
|
||||
could lead to information disclosure if the return targets used do not come
|
||||
from trustworthy code.
|
||||
|
||||
Attack scenarios
|
||||
----------------
|
||||
|
||||
An attack can be mounted on affected processors by performing a series of CALL
|
||||
instructions with targeted return locations and then transitioning out of C0
|
||||
state.
|
||||
|
||||
Mitigation mechanism
|
||||
--------------------
|
||||
|
||||
Before entering idle state, the kernel context switches to the idle thread. The
|
||||
context switch fills the RAP entries (referred to as the RSB in Linux) with safe
|
||||
targets by performing a sequence of CALL instructions.
|
||||
|
||||
Prevent a guest VM from directly putting the processor into an idle state by
|
||||
intercepting HLT and MWAIT instructions.
|
||||
|
||||
Both mitigations are required to fully address this issue.
|
||||
|
||||
Mitigation control on the kernel command line
|
||||
---------------------------------------------
|
||||
|
||||
Use existing Spectre v2 mitigations that will fill the RSB on context switch.
|
||||
|
||||
Mitigation control for KVM - module parameter
|
||||
---------------------------------------------
|
||||
|
||||
By default, the KVM hypervisor mitigates this issue by intercepting guest
|
||||
attempts to transition out of C0. A VMM can use the KVM_CAP_X86_DISABLE_EXITS
|
||||
capability to override those interceptions, but since this is not common, the
|
||||
mitigation that covers this path is not enabled by default.
|
||||
|
||||
The mitigation for the KVM_CAP_X86_DISABLE_EXITS capability can be turned on
|
||||
using the boolean module parameter mitigate_smt_rsb, e.g. ``kvm.mitigate_smt_rsb=1``.
|
@@ -18,3 +18,4 @@ are configurable at compile, boot or run time.
|
||||
core-scheduling.rst
|
||||
l1d_flush.rst
|
||||
processor_mmio_stale_data.rst
|
||||
cross-thread-rsb.rst
|
||||
|
@@ -64,8 +64,8 @@ architecture section: :ref:`Documentation/x86/mds.rst <mds>`.
|
||||
Attack scenarios
|
||||
----------------
|
||||
|
||||
Attacks against the MDS vulnerabilities can be mounted from malicious non
|
||||
priviledged user space applications running on hosts or guest. Malicious
|
||||
Attacks against the MDS vulnerabilities can be mounted from malicious non-
|
||||
privileged user space applications running on hosts or guest. Malicious
|
||||
guest OSes can obviously mount attacks as well.
|
||||
|
||||
Contrary to other speculation based vulnerabilities the MDS vulnerability
|
||||
|
@@ -479,8 +479,16 @@ Spectre variant 2
|
||||
On Intel Skylake-era systems the mitigation covers most, but not all,
|
||||
cases. See :ref:`[3] <spec_ref3>` for more details.
|
||||
|
||||
On CPUs with hardware mitigation for Spectre variant 2 (e.g. Enhanced
|
||||
IBRS on x86), retpoline is automatically disabled at run time.
|
||||
On CPUs with hardware mitigation for Spectre variant 2 (e.g. IBRS
|
||||
or enhanced IBRS on x86), retpoline is automatically disabled at run time.
|
||||
|
||||
Systems which support enhanced IBRS (eIBRS) enable IBRS protection once at
|
||||
boot, by setting the IBRS bit, and they're automatically protected against
|
||||
Spectre v2 variant attacks, including cross-thread branch target injections
|
||||
on SMT systems (STIBP). In other words, eIBRS enables STIBP too.
|
||||
|
||||
Legacy IBRS systems clear the IBRS bit on exit to userspace and
|
||||
therefore explicitly enable STIBP for that
|
||||
|
||||
The retpoline mitigation is turned on by default on vulnerable
|
||||
CPUs. It can be forced on or off by the administrator
|
||||
@@ -504,9 +512,12 @@ Spectre variant 2
|
||||
For Spectre variant 2 mitigation, individual user programs
|
||||
can be compiled with return trampolines for indirect branches.
|
||||
This protects them from consuming poisoned entries in the branch
|
||||
target buffer left by malicious software. Alternatively, the
|
||||
programs can disable their indirect branch speculation via prctl()
|
||||
(See :ref:`Documentation/userspace-api/spec_ctrl.rst <set_spec_ctrl>`).
|
||||
target buffer left by malicious software.
|
||||
|
||||
On legacy IBRS systems, at return to userspace, implicit STIBP is disabled
|
||||
because the kernel clears the IBRS bit. In this case, the userspace programs
|
||||
can disable indirect branch speculation via prctl() (See
|
||||
:ref:`Documentation/userspace-api/spec_ctrl.rst <set_spec_ctrl>`).
|
||||
On x86, this will turn on STIBP to guard against attacks from the
|
||||
sibling thread when the user program is running, and use IBPB to
|
||||
flush the branch target buffer when switching to/from the program.
|
||||
@@ -610,9 +621,9 @@ kernel command line.
|
||||
retpoline,generic Retpolines
|
||||
retpoline,lfence LFENCE; indirect branch
|
||||
retpoline,amd alias for retpoline,lfence
|
||||
eibrs enhanced IBRS
|
||||
eibrs,retpoline enhanced IBRS + Retpolines
|
||||
eibrs,lfence enhanced IBRS + LFENCE
|
||||
eibrs Enhanced/Auto IBRS
|
||||
eibrs,retpoline Enhanced/Auto IBRS + Retpolines
|
||||
eibrs,lfence Enhanced/Auto IBRS + LFENCE
|
||||
ibrs use IBRS to protect kernel
|
||||
|
||||
Not specifying this option is equivalent to
|
||||
|
@@ -56,6 +56,17 @@ ABI will be found here.
|
||||
|
||||
sysfs-rules
|
||||
|
||||
This is the beginning of a section with information of interest to
|
||||
application developers and system integrators doing analysis of the
|
||||
Linux kernel for safety critical applications. Documents supporting
|
||||
analysis of kernel interactions with applications, and key kernel
|
||||
subsystems expectations will be found here.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
workload-tracing
|
||||
|
||||
The rest of this manual consists of various unordered guides on how to
|
||||
configure specific aspects of kernel behavior to your liking.
|
||||
|
||||
@@ -116,6 +127,7 @@ configure specific aspects of kernel behavior to your liking.
|
||||
svga
|
||||
syscall-user-dispatch
|
||||
sysrq
|
||||
thermal/index
|
||||
thunderbolt
|
||||
ufs
|
||||
unicode
|
||||
|
@@ -312,10 +312,10 @@ define dmesg
|
||||
set var $prev_flags = $info->flags
|
||||
end
|
||||
|
||||
set var $id = ($id + 1) & $id_mask
|
||||
if ($id == $end_id)
|
||||
loop_break
|
||||
end
|
||||
set var $id = ($id + 1) & $id_mask
|
||||
end
|
||||
end
|
||||
document dmesg
|
||||
|
@@ -142,7 +142,6 @@ parameter is applicable::
|
||||
NFS Appropriate NFS support is enabled.
|
||||
OF Devicetree is enabled.
|
||||
PV_OPS A paravirtualized kernel is enabled.
|
||||
PARIDE The ParIDE (parallel port IDE) subsystem is enabled.
|
||||
PARISC The PA-RISC architecture is enabled.
|
||||
PCI PCI bus support is enabled.
|
||||
PCIE PCI Express support is enabled.
|
||||
|
@@ -378,18 +378,16 @@
|
||||
autoconf= [IPV6]
|
||||
See Documentation/networking/ipv6.rst.
|
||||
|
||||
show_lapic= [APIC,X86] Advanced Programmable Interrupt Controller
|
||||
Limit apic dumping. The parameter defines the maximal
|
||||
number of local apics being dumped. Also it is possible
|
||||
to set it to "all" by meaning -- no limit here.
|
||||
Format: { 1 (default) | 2 | ... | all }.
|
||||
The parameter valid if only apic=debug or
|
||||
apic=verbose is specified.
|
||||
Example: apic=debug show_lapic=all
|
||||
|
||||
apm= [APM] Advanced Power Management
|
||||
See header of arch/x86/kernel/apm_32.c.
|
||||
|
||||
apparmor= [APPARMOR] Disable or enable AppArmor at boot time
|
||||
Format: { "0" | "1" }
|
||||
See security/apparmor/Kconfig help text
|
||||
0 -- disable.
|
||||
1 -- enable.
|
||||
Default value is set via kernel config option.
|
||||
|
||||
arcrimi= [HW,NET] ARCnet - "RIM I" (entirely mem-mapped) cards
|
||||
Format: <io>,<irq>,<nodeID>
|
||||
|
||||
@@ -480,8 +478,10 @@
|
||||
See Documentation/block/cmdline-partition.rst
|
||||
|
||||
boot_delay= Milliseconds to delay each printk during boot.
|
||||
Values larger than 10 seconds (10000) are changed to
|
||||
no delay (0).
|
||||
Only works if CONFIG_BOOT_PRINTK_DELAY is enabled,
|
||||
and you may also have to specify "lpj=". Boot_delay
|
||||
values larger than 10 seconds (10000) are assumed
|
||||
erroneous and ignored.
|
||||
Format: integer
|
||||
|
||||
bootconfig [KNL]
|
||||
@@ -557,6 +557,7 @@
|
||||
Format: <string>
|
||||
nosocket -- Disable socket memory accounting.
|
||||
nokmem -- Disable kernel memory accounting.
|
||||
nobpf -- Disable BPF memory accounting.
|
||||
|
||||
checkreqprot= [SELINUX] Set initial checkreqprot flag value.
|
||||
Format: { "0" | "1" }
|
||||
@@ -672,7 +673,7 @@
|
||||
Sets the size of kernel per-numa memory area for
|
||||
contiguous memory allocations. A value of 0 disables
|
||||
per-numa CMA altogether. And If this option is not
|
||||
specificed, the default value is 0.
|
||||
specified, the default value is 0.
|
||||
With per-numa CMA enabled, DMA users on node nid will
|
||||
first try to allocate buffer from the pernuma area
|
||||
which is located in node nid, if the allocation fails,
|
||||
@@ -944,7 +945,7 @@
|
||||
driver code when a CPU writes to (or reads from) a
|
||||
random memory location. Note that there exists a class
|
||||
of memory corruptions problems caused by buggy H/W or
|
||||
F/W or by drivers badly programing DMA (basically when
|
||||
F/W or by drivers badly programming DMA (basically when
|
||||
memory is written at bus level and the CPU MMU is
|
||||
bypassed) which are not detectable by
|
||||
CONFIG_DEBUG_PAGEALLOC, hence this option will not help
|
||||
@@ -1045,26 +1046,12 @@
|
||||
can be useful when debugging issues that require an SLB
|
||||
miss to occur.
|
||||
|
||||
stress_slb [PPC]
|
||||
Limits the number of kernel SLB entries, and flushes
|
||||
them frequently to increase the rate of SLB faults
|
||||
on kernel addresses.
|
||||
|
||||
stress_hpt [PPC]
|
||||
Limits the number of kernel HPT entries in the hash
|
||||
page table to increase the rate of hash page table
|
||||
faults on kernel addresses.
|
||||
|
||||
disable= [IPV6]
|
||||
See Documentation/networking/ipv6.rst.
|
||||
|
||||
disable_radix [PPC]
|
||||
Disable RADIX MMU mode on POWER9
|
||||
|
||||
radix_hcall_invalidate=on [PPC/PSERIES]
|
||||
Disable RADIX GTSE feature and use hcall for TLB
|
||||
invalidate.
|
||||
|
||||
disable_tlbie [PPC]
|
||||
Disable TLBIE instruction. Currently does not work
|
||||
with KVM, with HASH MMU, or with coherent accelerators.
|
||||
@@ -1166,16 +1153,6 @@
|
||||
Documentation/admin-guide/dynamic-debug-howto.rst
|
||||
for details.
|
||||
|
||||
nopku [X86] Disable Memory Protection Keys CPU feature found
|
||||
in some Intel CPUs.
|
||||
|
||||
<module>.async_probe[=<bool>] [KNL]
|
||||
If no <bool> value is specified or if the value
|
||||
specified is not a valid <bool>, enable asynchronous
|
||||
probe on this module. Otherwise, enable/disable
|
||||
asynchronous probe on this module as indicated by the
|
||||
<bool> value. See also: module.async_probe
|
||||
|
||||
early_ioremap_debug [KNL]
|
||||
Enable debug messages in early_ioremap support. This
|
||||
is useful for tracking down temporary early mappings
|
||||
@@ -1195,10 +1172,10 @@
|
||||
specified, the serial port must already be setup and
|
||||
configured.
|
||||
|
||||
uart[8250],io,<addr>[,options]
|
||||
uart[8250],mmio,<addr>[,options]
|
||||
uart[8250],mmio32,<addr>[,options]
|
||||
uart[8250],mmio32be,<addr>[,options]
|
||||
uart[8250],io,<addr>[,options[,uartclk]]
|
||||
uart[8250],mmio,<addr>[,options[,uartclk]]
|
||||
uart[8250],mmio32,<addr>[,options[,uartclk]]
|
||||
uart[8250],mmio32be,<addr>[,options[,uartclk]]
|
||||
uart[8250],0x<addr>[,options]
|
||||
Start an early, polled-mode console on the 8250/16550
|
||||
UART at the specified I/O port or MMIO address.
|
||||
@@ -1207,7 +1184,9 @@
|
||||
If none of [io|mmio|mmio32|mmio32be], <addr> is assumed
|
||||
to be equivalent to 'mmio'. 'options' are specified
|
||||
in the same format described for "console=ttyS<n>"; if
|
||||
unspecified, the h/w is not initialized.
|
||||
unspecified, the h/w is not initialized. 'uartclk' is
|
||||
the uart clock frequency; if unspecified, it is set
|
||||
to 'BASE_BAUD' * 16.
|
||||
|
||||
pl011,<addr>
|
||||
pl011,mmio32,<addr>
|
||||
@@ -1532,6 +1511,15 @@
|
||||
boot up that is likely to be overridden by user space
|
||||
start up functionality.
|
||||
|
||||
Optionally, the snapshot can also be defined for a tracing
|
||||
instance that was created by the trace_instance= command
|
||||
line parameter.
|
||||
|
||||
trace_instance=foo,sched_switch ftrace_boot_snapshot=foo
|
||||
|
||||
The above will cause the "foo" tracing instance to trigger
|
||||
a snapshot at the end of boot up.
|
||||
|
||||
ftrace_dump_on_oops[=orig_cpu]
|
||||
[FTRACE] will dump the trace buffers on oops.
|
||||
If no parameter is passed, ftrace will dump
|
||||
@@ -1752,7 +1740,7 @@
|
||||
boot-time allocation of gigantic hugepages is skipped.
|
||||
|
||||
hugetlb_free_vmemmap=
|
||||
[KNL] Reguires CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
|
||||
[KNL] Requires CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
|
||||
enabled.
|
||||
Control if HugeTLB Vmemmap Optimization (HVO) is enabled.
|
||||
Allows heavy hugetlb users to free up some more
|
||||
@@ -1791,12 +1779,6 @@
|
||||
which allow the hypervisor to 'idle' the
|
||||
guest on lock contention.
|
||||
|
||||
keep_bootcon [KNL]
|
||||
Do not unregister boot console at start. This is only
|
||||
useful for debugging when something happens in the window
|
||||
between unregistering the boot console and initializing
|
||||
the real console.
|
||||
|
||||
i2c_bus= [HW] Override the default board specific I2C bus speed
|
||||
or register an additional I2C bus that is not
|
||||
registered from board initialization code.
|
||||
@@ -2366,17 +2348,18 @@
|
||||
js= [HW,JOY] Analog joystick
|
||||
See Documentation/input/joydev/joystick.rst.
|
||||
|
||||
nokaslr [KNL]
|
||||
When CONFIG_RANDOMIZE_BASE is set, this disables
|
||||
kernel and module base offset ASLR (Address Space
|
||||
Layout Randomization).
|
||||
|
||||
kasan_multi_shot
|
||||
[KNL] Enforce KASAN (Kernel Address Sanitizer) to print
|
||||
report on every invalid memory access. Without this
|
||||
parameter KASAN will print report only for the first
|
||||
invalid access.
|
||||
|
||||
keep_bootcon [KNL]
|
||||
Do not unregister boot console at start. This is only
|
||||
useful for debugging when something happens in the window
|
||||
between unregistering the boot console and initializing
|
||||
the real console.
|
||||
|
||||
keepinitrd [HW,ARM]
|
||||
|
||||
kernelcore= [KNL,X86,IA-64,PPC]
|
||||
@@ -2553,9 +2536,14 @@
|
||||
protected: nVHE-based mode with support for guests whose
|
||||
state is kept private from the host.
|
||||
|
||||
nested: VHE-based mode with support for nested
|
||||
virtualization. Requires at least ARMv8.3
|
||||
hardware.
|
||||
|
||||
Defaults to VHE/nVHE based on hardware support. Setting
|
||||
mode to "protected" will disable kexec and hibernation
|
||||
for the host.
|
||||
for the host. "nested" is experimental and should be
|
||||
used with extreme caution.
|
||||
|
||||
kvm-arm.vgic_v3_group0_trap=
|
||||
[KVM,ARM] Trap guest accesses to GICv3 group-0
|
||||
@@ -2816,6 +2804,9 @@
|
||||
* [no]setxfer: Indicate if transfer speed mode setting
|
||||
should be skipped.
|
||||
|
||||
* [no]fua: Disable or enable FUA (Force Unit Access)
|
||||
support for devices supporting this feature.
|
||||
|
||||
* dump_id: Dump IDENTIFY data.
|
||||
|
||||
* disable: Disable this device.
|
||||
@@ -3325,6 +3316,13 @@
|
||||
For details see:
|
||||
Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst
|
||||
|
||||
<module>.async_probe[=<bool>] [KNL]
|
||||
If no <bool> value is specified or if the value
|
||||
specified is not a valid <bool>, enable asynchronous
|
||||
probe on this module. Otherwise, enable/disable
|
||||
asynchronous probe on this module as indicated by the
|
||||
<bool> value. See also: module.async_probe
|
||||
|
||||
module.async_probe=<bool>
|
||||
[KNL] When set to true, modules will use async probing
|
||||
by default. To enable/disable async probing for a
|
||||
@@ -3708,7 +3706,7 @@
|
||||
implementation; requires CONFIG_GENERIC_IDLE_POLL_SETUP
|
||||
to be effective. This is useful on platforms where the
|
||||
sleep(SH) or wfi(ARM,ARM64) instructions do not work
|
||||
correctly or when doing power measurements to evalute
|
||||
correctly or when doing power measurements to evaluate
|
||||
the impact of the sleep instructions. This is also
|
||||
useful when using JTAG debugger.
|
||||
|
||||
@@ -3779,6 +3777,11 @@
|
||||
|
||||
nojitter [IA-64] Disables jitter checking for ITC timers.
|
||||
|
||||
nokaslr [KNL]
|
||||
When CONFIG_RANDOMIZE_BASE is set, this disables
|
||||
kernel and module base offset ASLR (Address Space
|
||||
Layout Randomization).
|
||||
|
||||
no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
|
||||
|
||||
no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
|
||||
@@ -3824,6 +3827,19 @@
|
||||
|
||||
nopcid [X86-64] Disable the PCID cpu feature.
|
||||
|
||||
nopku [X86] Disable Memory Protection Keys CPU feature found
|
||||
in some Intel CPUs.
|
||||
|
||||
nopv= [X86,XEN,KVM,HYPER_V,VMWARE]
|
||||
Disables the PV optimizations forcing the guest to run
|
||||
as generic guest with no PV drivers. Currently support
|
||||
XEN HVM, KVM, HYPER_V and VMWARE guest.
|
||||
|
||||
nopvspin [X86,XEN,KVM]
|
||||
Disables the qspinlock slow path using PV optimizations
|
||||
which allow the hypervisor to 'idle' the guest on lock
|
||||
contention.
|
||||
|
||||
norandmaps Don't use address space randomization. Equivalent to
|
||||
echo 0 > /proc/sys/kernel/randomize_va_space
|
||||
|
||||
@@ -4117,10 +4133,6 @@
|
||||
|
||||
pcbit= [HW,ISDN]
|
||||
|
||||
pcd. [PARIDE]
|
||||
See header of drivers/block/paride/pcd.c.
|
||||
See also Documentation/admin-guide/blockdev/paride.rst.
|
||||
|
||||
pci=option[,option...] [PCI] various PCI subsystem options.
|
||||
|
||||
Some options herein operate on a specific device
|
||||
@@ -4296,7 +4308,9 @@
|
||||
specified, e.g., 12@pci:8086:9c22:103c:198f
|
||||
for 4096-byte alignment.
|
||||
ecrc= Enable/disable PCIe ECRC (transaction layer
|
||||
end-to-end CRC checking).
|
||||
end-to-end CRC checking). Only effective if
|
||||
OS has native AER control (either granted by
|
||||
ACPI _OSC or forced via "pcie_ports=native")
|
||||
bios: Use BIOS/firmware settings. This is the
|
||||
the default.
|
||||
off: Turn ECRC off
|
||||
@@ -4383,9 +4397,6 @@
|
||||
for debug and development, but should not be
|
||||
needed on a platform with proper driver support.
|
||||
|
||||
pd. [PARIDE]
|
||||
See Documentation/admin-guide/blockdev/paride.rst.
|
||||
|
||||
pdcchassis= [PARISC,HW] Disable/Enable PDC Chassis Status codes at
|
||||
boot time.
|
||||
Format: { 0 | 1 }
|
||||
@@ -4398,12 +4409,6 @@
|
||||
allocator. This parameter is primarily for debugging
|
||||
and performance comparison.
|
||||
|
||||
pf. [PARIDE]
|
||||
See Documentation/admin-guide/blockdev/paride.rst.
|
||||
|
||||
pg. [PARIDE]
|
||||
See Documentation/admin-guide/blockdev/paride.rst.
|
||||
|
||||
pirq= [SMP,APIC] Manual mp-table setup
|
||||
See Documentation/x86/i386/IO-APIC.rst.
|
||||
|
||||
@@ -4565,9 +4570,6 @@
|
||||
|
||||
pstore.backend= Specify the name of the pstore backend to use
|
||||
|
||||
pt. [PARIDE]
|
||||
See Documentation/admin-guide/blockdev/paride.rst.
|
||||
|
||||
pti= [X86-64] Control Page Table Isolation of user and
|
||||
kernel address spaces. Disabling this feature
|
||||
removes hardening, but improves performance of
|
||||
@@ -4591,6 +4593,10 @@
|
||||
|
||||
r128= [HW,DRM]
|
||||
|
||||
radix_hcall_invalidate=on [PPC/PSERIES]
|
||||
Disable RADIX GTSE feature and use hcall for TLB
|
||||
invalidate.
|
||||
|
||||
raid= [HW,RAID]
|
||||
See Documentation/admin-guide/md.rst.
|
||||
|
||||
@@ -5113,6 +5119,17 @@
|
||||
rcupdate.rcu_cpu_stall_timeout to be used (after
|
||||
conversion from seconds to milliseconds).
|
||||
|
||||
rcupdate.rcu_cpu_stall_cputime= [KNL]
|
||||
Provide statistics on the cputime and count of
|
||||
interrupts and tasks during the sampling period. For
|
||||
multiple continuous RCU stalls, all sampling periods
|
||||
begin at half of the first RCU stall timeout.
|
||||
|
||||
rcupdate.rcu_exp_stall_task_details= [KNL]
|
||||
Print stack dumps of any tasks blocking the
|
||||
current expedited RCU grace period during an
|
||||
expedited RCU CPU stall warning.
|
||||
|
||||
rcupdate.rcu_expedited= [KNL]
|
||||
Use expedited grace-period primitives, for
|
||||
example, synchronize_rcu_expedited() instead
|
||||
@@ -5221,7 +5238,7 @@
|
||||
rdt= [HW,X86,RDT]
|
||||
Turn on/off individual RDT features. List is:
|
||||
cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, l2cdp,
|
||||
mba.
|
||||
mba, smba, bmec.
|
||||
E.g. to turn on cmt and turn off mba use:
|
||||
rdt=cmt,!mba
|
||||
|
||||
@@ -5572,13 +5589,6 @@
|
||||
1 -- enable.
|
||||
Default value is 1.
|
||||
|
||||
apparmor= [APPARMOR] Disable or enable AppArmor at boot time
|
||||
Format: { "0" | "1" }
|
||||
See security/apparmor/Kconfig help text
|
||||
0 -- disable.
|
||||
1 -- enable.
|
||||
Default value is set via kernel config option.
|
||||
|
||||
serialnumber [BUGS=X86-32]
|
||||
|
||||
sev=option[,option...] [X86-64] See Documentation/x86/x86_64/boot-options.rst
|
||||
@@ -5586,6 +5596,15 @@
|
||||
shapers= [NET]
|
||||
Maximal number of shapers.
|
||||
|
||||
show_lapic= [APIC,X86] Advanced Programmable Interrupt Controller
|
||||
Limit apic dumping. The parameter defines the maximal
|
||||
number of local apics being dumped. Also it is possible
|
||||
to set it to "all" by meaning -- no limit here.
|
||||
Format: { 1 (default) | 2 | ... | all }.
|
||||
The parameter valid if only apic=debug or
|
||||
apic=verbose is specified.
|
||||
Example: apic=debug show_lapic=all
|
||||
|
||||
simeth= [IA-64]
|
||||
simscsi=
|
||||
|
||||
@@ -5729,9 +5748,9 @@
|
||||
retpoline,generic - Retpolines
|
||||
retpoline,lfence - LFENCE; indirect branch
|
||||
retpoline,amd - alias for retpoline,lfence
|
||||
eibrs - enhanced IBRS
|
||||
eibrs,retpoline - enhanced IBRS + Retpolines
|
||||
eibrs,lfence - enhanced IBRS + LFENCE
|
||||
eibrs - Enhanced/Auto IBRS
|
||||
eibrs,retpoline - Enhanced/Auto IBRS + Retpolines
|
||||
eibrs,lfence - Enhanced/Auto IBRS + LFENCE
|
||||
ibrs - use IBRS to protect kernel
|
||||
|
||||
Not specifying this option is equivalent to
|
||||
@@ -6025,6 +6044,16 @@
|
||||
be used to filter out binaries which have
|
||||
not yet been made aware of AT_MINSIGSTKSZ.
|
||||
|
||||
stress_hpt [PPC]
|
||||
Limits the number of kernel HPT entries in the hash
|
||||
page table to increase the rate of hash page table
|
||||
faults on kernel addresses.
|
||||
|
||||
stress_slb [PPC]
|
||||
Limits the number of kernel SLB entries, and flushes
|
||||
them frequently to increase the rate of SLB faults
|
||||
on kernel addresses.
|
||||
|
||||
sunrpc.min_resvport=
|
||||
sunrpc.max_resvport=
|
||||
[NFS,SUNRPC]
|
||||
@@ -6272,13 +6301,33 @@
|
||||
comma-separated list of trace events to enable. See
|
||||
also Documentation/trace/events.rst
|
||||
|
||||
trace_instance=[instance-info]
|
||||
[FTRACE] Create a ring buffer instance early in boot up.
|
||||
This will be listed in:
|
||||
|
||||
/sys/kernel/tracing/instances
|
||||
|
||||
Events can be enabled at the time the instance is created
|
||||
via:
|
||||
|
||||
trace_instance=<name>,<system1>:<event1>,<system2>:<event2>
|
||||
|
||||
Note, the "<system*>:" portion is optional if the event is
|
||||
unique.
|
||||
|
||||
trace_instance=foo,sched:sched_switch,irq_handler_entry,initcall
|
||||
|
||||
will enable the "sched_switch" event (note, the "sched:" is optional, and
|
||||
the same thing would happen if it was left off). The irq_handler_entry
|
||||
event, and all events under the "initcall" system.
|
||||
|
||||
trace_options=[option-list]
|
||||
[FTRACE] Enable or disable tracer options at boot.
|
||||
The option-list is a comma delimited list of options
|
||||
that can be enabled or disabled just as if you were
|
||||
to echo the option name into
|
||||
|
||||
/sys/kernel/debug/tracing/trace_options
|
||||
/sys/kernel/tracing/trace_options
|
||||
|
||||
For example, to enable stacktrace option (to dump the
|
||||
stack trace of each event), add to the command line:
|
||||
@@ -6311,7 +6360,7 @@
|
||||
[FTRACE] enable this option to disable tracing when a
|
||||
warning is hit. This turns off "tracing_on". Tracing can
|
||||
be enabled again by echoing '1' into the "tracing_on"
|
||||
file located in /sys/kernel/debug/tracing/
|
||||
file located in /sys/kernel/tracing/
|
||||
|
||||
This option is useful, as it disables the trace before
|
||||
the WARNING dump is called, which prevents the trace to
|
||||
@@ -6369,6 +6418,16 @@
|
||||
in situations with strict latency requirements (where
|
||||
interruptions from clocksource watchdog are not
|
||||
acceptable).
|
||||
[x86] recalibrate: force recalibration against a HW timer
|
||||
(HPET or PM timer) on systems whose TSC frequency was
|
||||
obtained from HW or FW using either an MSR or CPUID(0x15).
|
||||
Warn if the difference is more than 500 ppm.
|
||||
[x86] watchdog: Use TSC as the watchdog clocksource with
|
||||
which to check other HW timers (HPET or PM timer), but
|
||||
only on systems where TSC has been deemed trustworthy.
|
||||
This will be suppressed by an earlier tsc=nowatchdog and
|
||||
can be overridden by a later tsc=nowatchdog. A console
|
||||
message will flag any such suppression or overriding.
|
||||
|
||||
tsc_early_khz= [X86] Skip early TSC calibration and use the given
|
||||
value instead. Useful when the early TSC frequency discovery
|
||||
@@ -6756,11 +6815,11 @@
|
||||
functions are at fixed addresses, they make nice
|
||||
targets for exploits that can control RIP.
|
||||
|
||||
emulate [default] Vsyscalls turn into traps and are
|
||||
emulated reasonably safely. The vsyscall
|
||||
page is readable.
|
||||
emulate Vsyscalls turn into traps and are emulated
|
||||
reasonably safely. The vsyscall page is
|
||||
readable.
|
||||
|
||||
xonly Vsyscalls turn into traps and are
|
||||
xonly [default] Vsyscalls turn into traps and are
|
||||
emulated reasonably safely. The vsyscall
|
||||
page is not readable.
|
||||
|
||||
@@ -6957,16 +7016,6 @@
|
||||
fairer and the number of possible event channels is
|
||||
much higher. Default is on (use fifo events).
|
||||
|
||||
nopv= [X86,XEN,KVM,HYPER_V,VMWARE]
|
||||
Disables the PV optimizations forcing the guest to run
|
||||
as generic guest with no PV drivers. Currently support
|
||||
XEN HVM, KVM, HYPER_V and VMWARE guest.
|
||||
|
||||
nopvspin [X86,XEN,KVM]
|
||||
Disables the qspinlock slow path using PV optimizations
|
||||
which allow the hypervisor to 'idle' the guest on lock
|
||||
contention.
|
||||
|
||||
xirc2ps_cs= [NET,PCMCIA]
|
||||
Format:
|
||||
<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
|
||||
@@ -7020,3 +7069,10 @@
|
||||
management firmware translates the requests into actual
|
||||
hardware states (core frequency, data fabric and memory
|
||||
clocks etc.)
|
||||
active
|
||||
Use amd_pstate_epp driver instance as the scaling driver,
|
||||
driver provides a hint to the hardware if software wants
|
||||
to bias toward performance (0x0) or energy efficiency (0xff)
|
||||
to the CPPC firmware. then CPPC power algorithm will
|
||||
calculate the runtime workload and adjust the realtime cores
|
||||
frequency.
|
||||
|
@@ -25,7 +25,7 @@ References
|
||||
|
||||
- In order to locate kernel-generated OS jitter on CPU N:
|
||||
|
||||
cd /sys/kernel/debug/tracing
|
||||
cd /sys/kernel/tracing
|
||||
echo 1 > max_graph_depth # Increase the "1" for more detail
|
||||
echo function_graph > current_tracer
|
||||
# run workload
|
||||
|
@@ -1488,7 +1488,7 @@ Example of command to set keyboard language is mentioned below::
|
||||
Text corresponding to keyboard layout to be set in sysfs are: be(Belgian),
|
||||
cz(Czech), da(Danish), de(German), en(English), es(Spain), et(Estonian),
|
||||
fr(French), fr-ch(French(Switzerland)), hu(Hungarian), it(Italy), jp (Japan),
|
||||
nl(Dutch), nn(Norway), pl(Polish), pt(portugese), sl(Slovenian), sv(Sweden),
|
||||
nl(Dutch), nn(Norway), pl(Polish), pt(portuguese), sl(Slovenian), sv(Sweden),
|
||||
tr(Turkey)
|
||||
|
||||
WWAN Antenna type
|
||||
|
@@ -317,7 +317,7 @@ All md devices contain:
|
||||
suspended (not supported yet)
|
||||
All IO requests will block. The array can be reconfigured.
|
||||
|
||||
Writing this, if accepted, will block until array is quiessent
|
||||
Writing this, if accepted, will block until array is quiescent
|
||||
|
||||
readonly
|
||||
no resync can happen. no superblocks get written.
|
||||
|
@@ -909,7 +909,7 @@ DE hat diverse Treiber fuer diese Modelle (Stand 09/2002):
|
||||
- TVPhone98 (Bt878)
|
||||
- AVerTV und TVCapture98 w/VCR (Bt 878)
|
||||
- AVerTVStudio und TVPhone98 w/VCR (Bt878)
|
||||
- AVerTV GO Serie (Kein SVideo Input)
|
||||
- AVerTV GO Series (Kein SVideo Input)
|
||||
- AVerTV98 (BT-878 chip)
|
||||
- AVerTV98 mit Fernbedienung (BT-878 chip)
|
||||
- AVerTV/FM98 (BT-878 chip)
|
||||
|
@@ -137,7 +137,7 @@ The ``LIRC user interface`` option adds enhanced functionality when using the
|
||||
from remote controllers.
|
||||
|
||||
The ``Support for eBPF programs attached to lirc devices`` option allows
|
||||
the usage of special programs (called eBPF) that would allow aplications
|
||||
the usage of special programs (called eBPF) that would allow applications
|
||||
to add extra remote controller decoding functionality to the Linux Kernel.
|
||||
|
||||
The ``Remote controller decoders`` option allows selecting the
|
||||
|
@@ -55,6 +55,15 @@ Miscellaneous:
|
||||
you can control the CEC line through this driver. This supports error
|
||||
injection as well.
|
||||
|
||||
- cec-gpio and Allwinner A10 (or any other driver that uses the CEC pin
|
||||
framework to drive the CEC pin directly): the CEC pin framework uses
|
||||
high-resolution timers. These timers are affected by NTP daemons that
|
||||
speed up or slow down the clock to sync with the official time. The
|
||||
chronyd server will by default increase or decrease the clock by
|
||||
1/12th. This will cause the CEC timings to go out of spec. To fix this,
|
||||
add a 'maxslewrate 40000' line to chronyd.conf. This limits the clock
|
||||
frequency change to 1/25th, which keeps the CEC timings within spec.
|
||||
|
||||
|
||||
Utilities
|
||||
=========
|
||||
@@ -296,69 +305,71 @@ broadcast messages twice to reduce the chance of them being lost. Specifically
|
||||
Making a CEC debugger
|
||||
=====================
|
||||
|
||||
By using a Raspberry Pi 2B/3/4 and some cheap components you can make
|
||||
By using a Raspberry Pi 4B and some cheap components you can make
|
||||
your own low-level CEC debugger.
|
||||
|
||||
Here is a picture of my setup:
|
||||
|
||||
https://hverkuil.home.xs4all.nl/rpi3-cec.jpg
|
||||
|
||||
It's a Raspberry Pi 3 together with a breadboard and some breadboard wires:
|
||||
|
||||
http://www.dx.com/p/diy-40p-male-to-female-male-to-male-female-to-female-dupont-line-wire-3pcs-356089#.WYLOOXWGN7I
|
||||
|
||||
Finally on of these HDMI female-female passthrough connectors (full soldering type 1):
|
||||
The critical component is one of these HDMI female-female passthrough connectors
|
||||
(full soldering type 1):
|
||||
|
||||
https://elabbay.myshopify.com/collections/camera/products/hdmi-af-af-v1a-hdmi-type-a-female-to-hdmi-type-a-female-pass-through-adapter-breakout-board?variant=45533926147
|
||||
|
||||
We've tested this and it works up to 4kp30 (297 MHz). The quality is not high
|
||||
enough to pass-through 4kp60 (594 MHz).
|
||||
The video quality is variable and certainly not enough to pass-through 4kp60
|
||||
(594 MHz) video. You might be able to support 4kp30, but more likely you will
|
||||
be limited to 1080p60 (148.5 MHz). But for CEC testing that is fine.
|
||||
|
||||
I also added an RTC and a breakout shield:
|
||||
You need a breadboard and some breadboard wires:
|
||||
|
||||
https://www.amazon.com/Makerfire%C2%AE-Raspberry-Module-DS1307-Battery/dp/B00ZOXWHK4
|
||||
http://www.dx.com/p/diy-40p-male-to-female-male-to-male-female-to-female-dupont-line-wire-3pcs-356089#.WYLOOXWGN7I
|
||||
|
||||
https://www.dx.com/p/raspberry-pi-gpio-expansion-board-breadboard-easy-multiplexing-board-one-to-three-with-screw-for-raspberry-pi-2-3-b-b-2729992.html#.YGRCG0MzZ7I
|
||||
|
||||
These two are not needed but they make life a bit easier.
|
||||
|
||||
If you want to monitor the HPD line as well, then you need one of these
|
||||
level shifters:
|
||||
If you want to monitor the HPD and/or 5V lines as well, then you need one of
|
||||
these 5V to 3.3V level shifters:
|
||||
|
||||
https://www.adafruit.com/product/757
|
||||
|
||||
(This is just where I got these components, there are many other places you
|
||||
can get similar things).
|
||||
|
||||
The CEC pin of the HDMI connector needs to be connected to these pins:
|
||||
CE0/IO8 and CE1/IO7 (pull-up GPIOs). The (optional) HPD pin of the HDMI
|
||||
connector should be connected (via a level shifter to convert the 5V
|
||||
to 3.3V) to these pins: IO17 and IO27. The (optional) 5V pin of the HDMI
|
||||
connector should be connected (via a level shifter) to these pins: IO22
|
||||
and IO24. Monitoring the HPD an 5V lines is not necessary, but it is helpful.
|
||||
The ground pin of the HDMI connector needs to be connected to a ground
|
||||
pin of the Raspberry Pi, of course.
|
||||
|
||||
This kernel patch will hook up the cec-gpio driver correctly to
|
||||
e.g. ``arch/arm/boot/dts/bcm2837-rpi-3-b-plus.dts``::
|
||||
The CEC pin of the HDMI connector needs to be connected to these pins:
|
||||
GPIO 6 and GPIO 7. The optional HPD pin of the HDMI connector should
|
||||
be connected via the level shifter to these pins: GPIO 23 and GPIO 12.
|
||||
The optional 5V pin of the HDMI connector should be connected via the
|
||||
level shifter to these pins: GPIO 25 and GPIO 22. Monitoring the HPD and
|
||||
5V lines is not necessary, but it is helpful.
|
||||
|
||||
This device tree addition in ``arch/arm/boot/dts/bcm2711-rpi-4-b.dts``
|
||||
will hook up the cec-gpio driver correctly::
|
||||
|
||||
cec@6 {
|
||||
compatible = "cec-gpio";
|
||||
cec-gpios = <&gpio 6 (GPIO_ACTIVE_HIGH|GPIO_OPEN_DRAIN)>;
|
||||
hpd-gpios = <&gpio 23 GPIO_ACTIVE_HIGH>;
|
||||
v5-gpios = <&gpio 25 GPIO_ACTIVE_HIGH>;
|
||||
};
|
||||
|
||||
cec@7 {
|
||||
compatible = "cec-gpio";
|
||||
cec-gpios = <&gpio 7 (GPIO_ACTIVE_HIGH|GPIO_OPEN_DRAIN)>;
|
||||
hpd-gpios = <&gpio 17 GPIO_ACTIVE_HIGH>;
|
||||
hpd-gpios = <&gpio 12 GPIO_ACTIVE_HIGH>;
|
||||
v5-gpios = <&gpio 22 GPIO_ACTIVE_HIGH>;
|
||||
};
|
||||
|
||||
cec@8 {
|
||||
compatible = "cec-gpio";
|
||||
cec-gpios = <&gpio 8 (GPIO_ACTIVE_HIGH|GPIO_OPEN_DRAIN)>;
|
||||
hpd-gpios = <&gpio 27 GPIO_ACTIVE_HIGH>;
|
||||
v5-gpios = <&gpio 24 GPIO_ACTIVE_HIGH>;
|
||||
};
|
||||
If you haven't hooked up the HPD and/or 5V lines, then just delete those
|
||||
lines.
|
||||
|
||||
This dts change will enable two cec GPIO devices: I typically use one to
|
||||
send/receive CEC commands and the other to monitor. If you monitor using
|
||||
an unconfigured CEC adapter then it will use GPIO interrupts which makes
|
||||
monitoring very accurate.
|
||||
|
||||
If you just want to monitor traffic, then a single instance is sufficient.
|
||||
The minimum configuration is one HDMI female-female passthrough connector
|
||||
and two female-female breadboard wires: one for connecting the HDMI ground
|
||||
pin to a ground pin on the Raspberry Pi, and the other to connect the HDMI
|
||||
CEC pin to GPIO 6 on the Raspberry Pi.
|
||||
|
||||
The documentation on how to use the error injection is here: :ref:`cec_pin_error_inj`.
|
||||
|
||||
``cec-ctl --monitor-pin`` will do low-level CEC bus sniffing and analysis.
|
||||
|
@@ -1,65 +0,0 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
The VPBE V4L2 driver design
|
||||
===========================
|
||||
|
||||
Functional partitioning
|
||||
-----------------------
|
||||
|
||||
Consists of the following:
|
||||
|
||||
1. V4L2 display driver
|
||||
|
||||
Implements creation of video2 and video3 device nodes and
|
||||
provides v4l2 device interface to manage VID0 and VID1 layers.
|
||||
|
||||
2. Display controller
|
||||
|
||||
Loads up VENC, OSD and external encoders such as ths8200. It provides
|
||||
a set of API calls to V4L2 drivers to set the output/standards
|
||||
in the VENC or external sub devices. It also provides
|
||||
a device object to access the services from OSD subdevice
|
||||
using sub device ops. The connection of external encoders to VENC LCD
|
||||
controller port is done at init time based on default output and standard
|
||||
selection or at run time when application change the output through
|
||||
V4L2 IOCTLs.
|
||||
|
||||
When connected to an external encoder, vpbe controller is also responsible
|
||||
for setting up the interface between VENC and external encoders based on
|
||||
board specific settings (specified in board-xxx-evm.c). This allows
|
||||
interfacing external encoders such as ths8200. The setup_if_config()
|
||||
is implemented for this as well as configure_venc() (part of the next patch)
|
||||
API to set timings in VENC for a specific display resolution. As of this
|
||||
patch series, the interconnection and enabling and setting of the external
|
||||
encoders is not present, and would be a part of the next patch series.
|
||||
|
||||
3. VENC subdevice module
|
||||
|
||||
Responsible for setting outputs provided through internal DACs and also
|
||||
setting timings at LCD controller port when external encoders are connected
|
||||
at the port or LCD panel timings required. When external encoder/LCD panel
|
||||
is connected, the timings for a specific standard/preset is retrieved from
|
||||
the board specific table and the values are used to set the timings in
|
||||
venc using non-standard timing mode.
|
||||
|
||||
Support LCD Panel displays using the VENC. For example to support a Logic
|
||||
PD display, it requires setting up the LCD controller port with a set of
|
||||
timings for the resolution supported and setting the dot clock. So we could
|
||||
add the available outputs as a board specific entry (i.e add the "LogicPD"
|
||||
output name to board-xxx-evm.c). A table of timings for various LCDs
|
||||
supported can be maintained in the board specific setup file to support
|
||||
various LCD displays.As of this patch a basic driver is present, and this
|
||||
support for external encoders and displays forms a part of the next
|
||||
patch series.
|
||||
|
||||
4. OSD module
|
||||
|
||||
OSD module implements all OSD layer management and hardware specific
|
||||
features. The VPBE module interacts with the OSD for enabling and
|
||||
disabling appropriate features of the OSD.
|
||||
|
||||
Current status
|
||||
--------------
|
||||
|
||||
A fully functional working version of the V4L2 driver is available. This
|
||||
driver has been tested with NTSC and PAL standards and buffer streaming.
|
@@ -72,17 +72,13 @@ imx319 Sony IMX319 sensor
|
||||
imx334 Sony IMX334 sensor
|
||||
imx355 Sony IMX355 sensor
|
||||
imx412 Sony IMX412 sensor
|
||||
m5mols Fujitsu M-5MOLS 8MP sensor
|
||||
mt9m001 mt9m001
|
||||
mt9m032 MT9M032 camera sensor
|
||||
mt9m111 mt9m111, mt9m112 and mt9m131
|
||||
mt9p031 Aptina MT9P031
|
||||
mt9t001 Aptina MT9T001
|
||||
mt9t112 Aptina MT9T111/MT9T112
|
||||
mt9v011 Micron mt9v011 sensor
|
||||
mt9v032 Micron MT9V032 sensor
|
||||
mt9v111 Aptina MT9V111 sensor
|
||||
noon010pc30 Siliconfile NOON010PC30 sensor
|
||||
ov13858 OmniVision OV13858 sensor
|
||||
ov13b10 OmniVision OV13B10 sensor
|
||||
ov2640 OmniVision OV2640 sensor
|
||||
@@ -109,9 +105,6 @@ s5c73m3 Samsung S5C73M3 sensor
|
||||
s5k4ecgx Samsung S5K4ECGX sensor
|
||||
s5k5baf Samsung S5K5BAF sensor
|
||||
s5k6a3 Samsung S5K6A3 sensor
|
||||
s5k6aa Samsung S5K6AAFX sensor
|
||||
sr030pc30 Siliconfile SR030PC30 sensor
|
||||
vs6624 ST VS6624 sensor
|
||||
============ ==========================================================
|
||||
|
||||
Flash devices
|
||||
@@ -222,7 +215,6 @@ Video encoders
|
||||
============ ==========================================================
|
||||
Driver Name
|
||||
============ ==========================================================
|
||||
ad9389b Analog Devices AD9389B encoder
|
||||
adv7170 Analog Devices ADV7170 video encoder
|
||||
adv7175 Analog Devices ADV7175 video encoder
|
||||
adv7343 ADV7343 video encoder
|
||||
|
@@ -72,7 +72,6 @@ via-camera VIAFB camera controller
|
||||
video-mux Video Multiplexer
|
||||
vpif_display TI DaVinci VPIF V4L2-Display
|
||||
vpif_capture TI DaVinci VPIF video capture
|
||||
vpss TI DaVinci VPBE V4L2-Display
|
||||
vsp1 Renesas VSP1 Video Processing Engine
|
||||
xilinx-tpg Xilinx Video Test Pattern Generator
|
||||
xilinx-video Xilinx Video IP (EXPERIMENTAL)
|
||||
|
@@ -142,7 +142,7 @@ The drivers exposes following files:
|
||||
indicator
|
||||
0x18 lassi Signed Low side adjacent Channel
|
||||
Strength indicator
|
||||
0x19 hassi ditto fpr High side
|
||||
0x19 hassi ditto for High side
|
||||
0x20 mult Multipath indicator
|
||||
0x21 dev Frequency deviation
|
||||
0x24 assi Adjacent channel SSI
|
||||
|
@@ -12,7 +12,6 @@ Video4Linux (V4L) driver-specific documentation
|
||||
bttv
|
||||
cafe_ccic
|
||||
cx88
|
||||
davinci-vpbe
|
||||
fimc
|
||||
imx
|
||||
imx7
|
||||
|
@@ -580,7 +580,7 @@ Metadata Capture
|
||||
----------------
|
||||
|
||||
The Metadata capture generates UVC format metadata. The PTS and SCR are
|
||||
transmitted based on the values set in vivid contols.
|
||||
transmitted based on the values set in vivid controls.
|
||||
|
||||
The Metadata device will only work for the Webcam input, it will give
|
||||
back an error for all other inputs.
|
||||
|
@@ -1,5 +1,3 @@
|
||||
.. _mm_concepts:
|
||||
|
||||
=================
|
||||
Concepts overview
|
||||
=================
|
||||
@@ -86,16 +84,15 @@ memory with the huge pages. The first one is `HugeTLB filesystem`, or
|
||||
hugetlbfs. It is a pseudo filesystem that uses RAM as its backing
|
||||
store. For the files created in this filesystem the data resides in
|
||||
the memory and mapped using huge pages. The hugetlbfs is described at
|
||||
:ref:`Documentation/admin-guide/mm/hugetlbpage.rst <hugetlbpage>`.
|
||||
Documentation/admin-guide/mm/hugetlbpage.rst.
|
||||
|
||||
Another, more recent, mechanism that enables use of the huge pages is
|
||||
called `Transparent HugePages`, or THP. Unlike the hugetlbfs that
|
||||
requires users and/or system administrators to configure what parts of
|
||||
the system memory should and can be mapped by the huge pages, THP
|
||||
manages such mappings transparently to the user and hence the
|
||||
name. See
|
||||
:ref:`Documentation/admin-guide/mm/transhuge.rst <admin_guide_transhuge>`
|
||||
for more details about THP.
|
||||
name. See Documentation/admin-guide/mm/transhuge.rst for more details
|
||||
about THP.
|
||||
|
||||
Zones
|
||||
=====
|
||||
@@ -125,8 +122,8 @@ processor. Each bank is referred to as a `node` and for each node Linux
|
||||
constructs an independent memory management subsystem. A node has its
|
||||
own set of zones, lists of free and used pages and various statistics
|
||||
counters. You can find more details about NUMA in
|
||||
:ref:`Documentation/mm/numa.rst <numa>` and in
|
||||
:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`.
|
||||
Documentation/mm/numa.rst` and in
|
||||
Documentation/admin-guide/mm/numa_memory_policy.rst.
|
||||
|
||||
Page cache
|
||||
==========
|
||||
|
@@ -54,7 +54,7 @@ that is built with ``CONFIG_DAMON_LRU_SORT=y``.
|
||||
To let sysadmins enable or disable it and tune for the given system,
|
||||
DAMON_LRU_SORT utilizes module parameters. That is, you can put
|
||||
``damon_lru_sort.<parameter>=<value>`` on the kernel boot command line or write
|
||||
proper values to ``/sys/modules/damon_lru_sort/parameters/<parameter>`` files.
|
||||
proper values to ``/sys/module/damon_lru_sort/parameters/<parameter>`` files.
|
||||
|
||||
Below are the description of each parameter.
|
||||
|
||||
@@ -283,7 +283,7 @@ doesn't make progress and therefore the free memory rate becomes lower than
|
||||
20%, it asks DAMON_LRU_SORT to do nothing again, so that we can fall back to
|
||||
the LRU-list based page granularity reclamation. ::
|
||||
|
||||
# cd /sys/modules/damon_lru_sort/parameters
|
||||
# cd /sys/module/damon_lru_sort/parameters
|
||||
# echo 500 > hot_thres_access_freq
|
||||
# echo 120000000 > cold_min_age
|
||||
# echo 10 > quota_ms
|
||||
|
@@ -46,7 +46,7 @@ that is built with ``CONFIG_DAMON_RECLAIM=y``.
|
||||
To let sysadmins enable or disable it and tune for the given system,
|
||||
DAMON_RECLAIM utilizes module parameters. That is, you can put
|
||||
``damon_reclaim.<parameter>=<value>`` on the kernel boot command line or write
|
||||
proper values to ``/sys/modules/damon_reclaim/parameters/<parameter>`` files.
|
||||
proper values to ``/sys/module/damon_reclaim/parameters/<parameter>`` files.
|
||||
|
||||
Below are the description of each parameter.
|
||||
|
||||
@@ -205,6 +205,15 @@ The end physical address of memory region that DAMON_RECLAIM will do work
|
||||
against. That is, DAMON_RECLAIM will find cold memory regions in this region
|
||||
and reclaims. By default, biggest System RAM is used as the region.
|
||||
|
||||
skip_anon
|
||||
---------
|
||||
|
||||
Skip anonymous pages reclamation.
|
||||
|
||||
If this parameter is set as ``Y``, DAMON_RECLAIM does not reclaim anonymous
|
||||
pages. By default, ``N``.
|
||||
|
||||
|
||||
kdamond_pid
|
||||
-----------
|
||||
|
||||
@@ -251,7 +260,7 @@ therefore the free memory rate becomes lower than 20%, it asks DAMON_RECLAIM to
|
||||
do nothing again, so that we can fall back to the LRU-list based page
|
||||
granularity reclamation. ::
|
||||
|
||||
# cd /sys/modules/damon_reclaim/parameters
|
||||
# cd /sys/module/damon_reclaim/parameters
|
||||
# echo 30000000 > min_age
|
||||
# echo $((1 * 1024 * 1024 * 1024)) > quota_sz
|
||||
# echo 1000 > quota_reset_interval_ms
|
||||
|
@@ -25,10 +25,12 @@ DAMON provides below interfaces for different users.
|
||||
interface provides only simple :ref:`statistics <damos_stats>` for the
|
||||
monitoring results. For detailed monitoring results, DAMON provides a
|
||||
:ref:`tracepoint <tracepoint>`.
|
||||
- *debugfs interface.*
|
||||
- *debugfs interface. (DEPRECATED!)*
|
||||
:ref:`This <debugfs_interface>` is almost identical to :ref:`sysfs interface
|
||||
<sysfs_interface>`. This will be removed after next LTS kernel is released,
|
||||
so users should move to the :ref:`sysfs interface <sysfs_interface>`.
|
||||
<sysfs_interface>`. This is deprecated, so users should move to the
|
||||
:ref:`sysfs interface <sysfs_interface>`. If you depend on this and cannot
|
||||
move, please report your usecase to damon@lists.linux.dev and
|
||||
linux-mm@kvack.org.
|
||||
- *Kernel Space Programming Interface.*
|
||||
:doc:`This </mm/damon/api>` is for kernel space programmers. Using this,
|
||||
users can utilize every feature of DAMON most flexibly and efficiently by
|
||||
@@ -87,6 +89,8 @@ comma (","). ::
|
||||
│ │ │ │ │ │ │ quotas/ms,bytes,reset_interval_ms
|
||||
│ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil
|
||||
│ │ │ │ │ │ │ watermarks/metric,interval_us,high,mid,low
|
||||
│ │ │ │ │ │ │ filters/nr_filters
|
||||
│ │ │ │ │ │ │ │ 0/type,matching,memcg_id
|
||||
│ │ │ │ │ │ │ stats/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds
|
||||
│ │ │ │ │ │ │ tried_regions/
|
||||
│ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age
|
||||
@@ -151,6 +155,8 @@ number (``N``) to the file creates the number of child directories named as
|
||||
moment, only one context per kdamond is supported, so only ``0`` or ``1`` can
|
||||
be written to the file.
|
||||
|
||||
.. _sysfs_contexts:
|
||||
|
||||
contexts/<N>/
|
||||
-------------
|
||||
|
||||
@@ -268,21 +274,32 @@ schemes/<N>/
|
||||
------------
|
||||
|
||||
In each scheme directory, five directories (``access_pattern``, ``quotas``,
|
||||
``watermarks``, ``stats``, and ``tried_regions``) and one file (``action``)
|
||||
exist.
|
||||
``watermarks``, ``filters``, ``stats``, and ``tried_regions``) and one file
|
||||
(``action``) exist.
|
||||
|
||||
The ``action`` file is for setting and getting what action you want to apply to
|
||||
memory regions having specific access pattern of the interest. The keywords
|
||||
that can be written to and read from the file and their meaning are as below.
|
||||
|
||||
- ``willneed``: Call ``madvise()`` for the region with ``MADV_WILLNEED``
|
||||
- ``cold``: Call ``madvise()`` for the region with ``MADV_COLD``
|
||||
- ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT``
|
||||
- ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``
|
||||
- ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``
|
||||
Note that support of each action depends on the running DAMON operations set
|
||||
`implementation <sysfs_contexts>`.
|
||||
|
||||
- ``willneed``: Call ``madvise()`` for the region with ``MADV_WILLNEED``.
|
||||
Supported by ``vaddr`` and ``fvaddr`` operations set.
|
||||
- ``cold``: Call ``madvise()`` for the region with ``MADV_COLD``.
|
||||
Supported by ``vaddr`` and ``fvaddr`` operations set.
|
||||
- ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT``.
|
||||
Supported by ``vaddr``, ``fvaddr`` and ``paddr`` operations set.
|
||||
- ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``.
|
||||
Supported by ``vaddr`` and ``fvaddr`` operations set.
|
||||
- ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``.
|
||||
Supported by ``vaddr`` and ``fvaddr`` operations set.
|
||||
- ``lru_prio``: Prioritize the region on its LRU lists.
|
||||
Supported by ``paddr`` operations set.
|
||||
- ``lru_deprio``: Deprioritize the region on its LRU lists.
|
||||
- ``stat``: Do nothing but count the statistics
|
||||
Supported by ``paddr`` operations set.
|
||||
- ``stat``: Do nothing but count the statistics.
|
||||
Supported by all operations sets.
|
||||
|
||||
schemes/<N>/access_pattern/
|
||||
---------------------------
|
||||
@@ -347,6 +364,46 @@ as below.
|
||||
|
||||
The ``interval`` should written in microseconds unit.
|
||||
|
||||
schemes/<N>/filters/
|
||||
--------------------
|
||||
|
||||
Users could know something more than the kernel for specific types of memory.
|
||||
In the case, users could do their own management for the memory and hence
|
||||
doesn't want DAMOS bothers that. Users could limit DAMOS by setting the access
|
||||
pattern of the scheme and/or the monitoring regions for the purpose, but that
|
||||
can be inefficient in some cases. In such cases, users could set non-access
|
||||
pattern driven filters using files in this directory.
|
||||
|
||||
In the beginning, this directory has only one file, ``nr_filters``. Writing a
|
||||
number (``N``) to the file creates the number of child directories named ``0``
|
||||
to ``N-1``. Each directory represents each filter. The filters are evaluated
|
||||
in the numeric order.
|
||||
|
||||
Each filter directory contains three files, namely ``type``, ``matcing``, and
|
||||
``memcg_path``. You can write one of two special keywords, ``anon`` for
|
||||
anonymous pages, or ``memcg`` for specific memory cgroup filtering. In case of
|
||||
the memory cgroup filtering, you can specify the memory cgroup of the interest
|
||||
by writing the path of the memory cgroup from the cgroups mount point to
|
||||
``memcg_path`` file. You can write ``Y`` or ``N`` to ``matching`` file to
|
||||
filter out pages that does or does not match to the type, respectively. Then,
|
||||
the scheme's action will not be applied to the pages that specified to be
|
||||
filtered out.
|
||||
|
||||
For example, below restricts a DAMOS action to be applied to only non-anonymous
|
||||
pages of all memory cgroups except ``/having_care_already``.::
|
||||
|
||||
# echo 2 > nr_filters
|
||||
# # filter out anonymous pages
|
||||
echo anon > 0/type
|
||||
echo Y > 0/matching
|
||||
# # further filter out all cgroups except one at '/having_care_already'
|
||||
echo memcg > 1/type
|
||||
echo /having_care_already > 1/memcg_path
|
||||
echo N > 1/matching
|
||||
|
||||
Note that filters are currently supported only when ``paddr``
|
||||
`implementation <sysfs_contexts>` is being used.
|
||||
|
||||
.. _sysfs_schemes_stats:
|
||||
|
||||
schemes/<N>/stats/
|
||||
@@ -432,13 +489,17 @@ the files as above. Above is only for an example.
|
||||
|
||||
.. _debugfs_interface:
|
||||
|
||||
debugfs Interface
|
||||
=================
|
||||
debugfs Interface (DEPRECATED!)
|
||||
===============================
|
||||
|
||||
.. note::
|
||||
|
||||
DAMON debugfs interface will be removed after next LTS kernel is released, so
|
||||
users should move to the :ref:`sysfs interface <sysfs_interface>`.
|
||||
THIS IS DEPRECATED!
|
||||
|
||||
DAMON debugfs interface is deprecated, so users should move to the
|
||||
:ref:`sysfs interface <sysfs_interface>`. If you depend on this and cannot
|
||||
move, please report your usecase to damon@lists.linux.dev and
|
||||
linux-mm@kvack.org.
|
||||
|
||||
DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``,
|
||||
``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and
|
||||
@@ -574,11 +635,15 @@ The ``<action>`` is a predefined integer for memory management actions, which
|
||||
DAMON will apply to the regions having the target access pattern. The
|
||||
supported numbers and their meanings are as below.
|
||||
|
||||
- 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``
|
||||
- 1: Call ``madvise()`` for the region with ``MADV_COLD``
|
||||
- 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT``
|
||||
- 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``
|
||||
- 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``
|
||||
- 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``. Ignored if
|
||||
``target`` is ``paddr``.
|
||||
- 1: Call ``madvise()`` for the region with ``MADV_COLD``. Ignored if
|
||||
``target`` is ``paddr``.
|
||||
- 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT``.
|
||||
- 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``. Ignored if
|
||||
``target`` is ``paddr``.
|
||||
- 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``. Ignored if
|
||||
``target`` is ``paddr``.
|
||||
- 5: Do nothing but count the statistics
|
||||
|
||||
Quota
|
||||
|
@@ -1,5 +1,3 @@
|
||||
.. _hugetlbpage:
|
||||
|
||||
=============
|
||||
HugeTLB Pages
|
||||
=============
|
||||
@@ -86,7 +84,7 @@ by increasing or decreasing the value of ``nr_hugepages``.
|
||||
|
||||
Note: When the feature of freeing unused vmemmap pages associated with each
|
||||
hugetlb page is enabled, we can fail to free the huge pages triggered by
|
||||
the user when ths system is under memory pressure. Please try again later.
|
||||
the user when the system is under memory pressure. Please try again later.
|
||||
|
||||
Pages that are used as huge pages are reserved inside the kernel and cannot
|
||||
be used for other purposes. Huge pages cannot be swapped out under
|
||||
@@ -313,7 +311,7 @@ memory policy mode--bind, preferred, local or interleave--may be used. The
|
||||
resulting effect on persistent huge page allocation is as follows:
|
||||
|
||||
#. Regardless of mempolicy mode [see
|
||||
:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`],
|
||||
Documentation/admin-guide/mm/numa_memory_policy.rst],
|
||||
persistent huge pages will be distributed across the node or nodes
|
||||
specified in the mempolicy as if "interleave" had been specified.
|
||||
However, if a node in the policy does not contain sufficient contiguous
|
||||
@@ -461,13 +459,13 @@ Examples
|
||||
.. _map_hugetlb:
|
||||
|
||||
``map_hugetlb``
|
||||
see tools/testing/selftests/vm/map_hugetlb.c
|
||||
see tools/testing/selftests/mm/map_hugetlb.c
|
||||
|
||||
``hugepage-shm``
|
||||
see tools/testing/selftests/vm/hugepage-shm.c
|
||||
see tools/testing/selftests/mm/hugepage-shm.c
|
||||
|
||||
``hugepage-mmap``
|
||||
see tools/testing/selftests/vm/hugepage-mmap.c
|
||||
see tools/testing/selftests/mm/hugepage-mmap.c
|
||||
|
||||
The `libhugetlbfs`_ library provides a wide range of userspace tools
|
||||
to help with huge page usability, environment setup, and control.
|
||||
|
@@ -1,5 +1,3 @@
|
||||
.. _idle_page_tracking:
|
||||
|
||||
==================
|
||||
Idle Page Tracking
|
||||
==================
|
||||
@@ -65,14 +63,13 @@ workload one should:
|
||||
are not reclaimable, he or she can filter them out using
|
||||
``/proc/kpageflags``.
|
||||
|
||||
The page-types tool in the tools/vm directory can be used to assist in this.
|
||||
The page-types tool in the tools/mm directory can be used to assist in this.
|
||||
If the tool is run initially with the appropriate option, it will mark all the
|
||||
queried pages as idle. Subsequent runs of the tool can then show which pages have
|
||||
their idle flag cleared in the interim.
|
||||
|
||||
See :ref:`Documentation/admin-guide/mm/pagemap.rst <pagemap>` for more
|
||||
information about ``/proc/pid/pagemap``, ``/proc/kpageflags``, and
|
||||
``/proc/kpagecgroup``.
|
||||
See Documentation/admin-guide/mm/pagemap.rst for more information about
|
||||
``/proc/pid/pagemap``, ``/proc/kpageflags``, and ``/proc/kpagecgroup``.
|
||||
|
||||
.. _impl_details:
|
||||
|
||||
|
@@ -16,8 +16,7 @@ are described in Documentation/admin-guide/sysctl/vm.rst and in `man 5 proc`_.
|
||||
.. _man 5 proc: http://man7.org/linux/man-pages/man5/proc.5.html
|
||||
|
||||
Linux memory management has its own jargon and if you are not yet
|
||||
familiar with it, consider reading
|
||||
:ref:`Documentation/admin-guide/mm/concepts.rst <mm_concepts>`.
|
||||
familiar with it, consider reading Documentation/admin-guide/mm/concepts.rst.
|
||||
|
||||
Here we document in detail how to interact with various mechanisms in
|
||||
the Linux memory management.
|
||||
|
@@ -1,5 +1,3 @@
|
||||
.. _admin_guide_ksm:
|
||||
|
||||
=======================
|
||||
Kernel Samepage Merging
|
||||
=======================
|
||||
|
@@ -1,5 +1,3 @@
|
||||
.. _admin_guide_memory_hotplug:
|
||||
|
||||
==================
|
||||
Memory Hot(Un)Plug
|
||||
==================
|
||||
|
@@ -1,5 +1,3 @@
|
||||
.. _numa_memory_policy:
|
||||
|
||||
==================
|
||||
NUMA Memory Policy
|
||||
==================
|
||||
@@ -246,7 +244,7 @@ MPOL_INTERLEAVED
|
||||
interleaved system default policy works in this mode.
|
||||
|
||||
MPOL_PREFERRED_MANY
|
||||
This mode specifices that the allocation should be preferrably
|
||||
This mode specifies that the allocation should be preferably
|
||||
satisfied from the nodemask specified in the policy. If there is
|
||||
a memory pressure on all nodes in the nodemask, the allocation
|
||||
can fall back to all existing numa nodes. This is effectively
|
||||
@@ -360,7 +358,7 @@ and NUMA nodes. "Usage" here means one of the following:
|
||||
2) examination of the policy to determine the policy mode and associated node
|
||||
or node lists, if any, for page allocation. This is considered a "hot
|
||||
path". Note that for MPOL_BIND, the "usage" extends across the entire
|
||||
allocation process, which may sleep during page reclaimation, because the
|
||||
allocation process, which may sleep during page reclamation, because the
|
||||
BIND policy nodemask is used, by reference, to filter ineligible nodes.
|
||||
|
||||
We can avoid taking an extra reference during the usages listed above as
|
||||
|
@@ -1,6 +1,7 @@
|
||||
.. _numaperf:
|
||||
=======================
|
||||
NUMA Memory Performance
|
||||
=======================
|
||||
|
||||
=============
|
||||
NUMA Locality
|
||||
=============
|
||||
|
||||
@@ -61,7 +62,6 @@ that are CPUs and hence suitable for generic task scheduling, and
|
||||
IO initiators such as GPUs and NICs. Unlike access class 0, only
|
||||
nodes containing CPUs are considered.
|
||||
|
||||
================
|
||||
NUMA Performance
|
||||
================
|
||||
|
||||
@@ -96,7 +96,6 @@ for the platform.
|
||||
Access class 1 takes the same form but only includes values for CPU to
|
||||
memory activity.
|
||||
|
||||
==========
|
||||
NUMA Cache
|
||||
==========
|
||||
|
||||
@@ -170,7 +169,6 @@ The "size" is the number of bytes provided by this cache level.
|
||||
The "write_policy" will be 0 for write-back, and non-zero for
|
||||
write-through caching.
|
||||
|
||||
========
|
||||
See Also
|
||||
========
|
||||
|
||||
|
@@ -1,5 +1,3 @@
|
||||
.. _pagemap:
|
||||
|
||||
=============================
|
||||
Examining Process Page Tables
|
||||
=============================
|
||||
@@ -19,10 +17,10 @@ There are four components to pagemap:
|
||||
* Bits 0-4 swap type if swapped
|
||||
* Bits 5-54 swap offset if swapped
|
||||
* Bit 55 pte is soft-dirty (see
|
||||
:ref:`Documentation/admin-guide/mm/soft-dirty.rst <soft_dirty>`)
|
||||
Documentation/admin-guide/mm/soft-dirty.rst)
|
||||
* Bit 56 page exclusively mapped (since 4.2)
|
||||
* Bit 57 pte is uffd-wp write-protected (since 5.13) (see
|
||||
:ref:`Documentation/admin-guide/mm/userfaultfd.rst <userfaultfd>`)
|
||||
Documentation/admin-guide/mm/userfaultfd.rst)
|
||||
* Bits 58-60 zero
|
||||
* Bit 61 page is file-page or shared-anon (since 3.5)
|
||||
* Bit 62 page swapped
|
||||
@@ -46,7 +44,7 @@ There are four components to pagemap:
|
||||
* ``/proc/kpagecount``. This file contains a 64-bit count of the number of
|
||||
times each page is mapped, indexed by PFN.
|
||||
|
||||
The page-types tool in the tools/vm directory can be used to query the
|
||||
The page-types tool in the tools/mm directory can be used to query the
|
||||
number of times a page is mapped.
|
||||
|
||||
* ``/proc/kpageflags``. This file contains a 64-bit set of flags for each
|
||||
@@ -105,8 +103,7 @@ Short descriptions to the page flags
|
||||
A compound page with order N consists of 2^N physically contiguous pages.
|
||||
A compound page with order 2 takes the form of "HTTT", where H donates its
|
||||
head page and T donates its tail page(s). The major consumers of compound
|
||||
pages are hugeTLB pages
|
||||
(:ref:`Documentation/admin-guide/mm/hugetlbpage.rst <hugetlbpage>`),
|
||||
pages are hugeTLB pages (Documentation/admin-guide/mm/hugetlbpage.rst),
|
||||
the SLUB etc. memory allocators and various device drivers.
|
||||
However in this interface, only huge/giga pages are made visible
|
||||
to end users.
|
||||
@@ -128,7 +125,7 @@ Short descriptions to the page flags
|
||||
Zero page for pfn_zero or huge_zero page.
|
||||
25 - IDLE
|
||||
The page has not been accessed since it was marked idle (see
|
||||
:ref:`Documentation/admin-guide/mm/idle_page_tracking.rst <idle_page_tracking>`).
|
||||
Documentation/admin-guide/mm/idle_page_tracking.rst).
|
||||
Note that this flag may be stale in case the page was accessed via
|
||||
a PTE. To make sure the flag is up-to-date one has to read
|
||||
``/sys/kernel/mm/page_idle/bitmap`` first.
|
||||
@@ -173,7 +170,7 @@ LRU related page flags
|
||||
14 - SWAPBACKED
|
||||
The page is backed by swap/RAM.
|
||||
|
||||
The page-types tool in the tools/vm directory can be used to query the
|
||||
The page-types tool in the tools/mm directory can be used to query the
|
||||
above flags.
|
||||
|
||||
Using pagemap to do something useful
|
||||
|
@@ -1,5 +1,3 @@
|
||||
.. _shrinker_debugfs:
|
||||
|
||||
==========================
|
||||
Shrinker Debugfs Interface
|
||||
==========================
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user