mirror of
https://github.com/tbsdtv/linux_media.git
synced 2025-07-23 12:43:29 +02:00
Merge branch 'tbsdtv_linux_media/master' into tbsdtv_linux_media/latest
Conflicts: drivers/media/dvb-core/dvb_ca_en50221.c drivers/media/usb/cx231xx/Kconfig drivers/media/usb/cx231xx/cx231xx-cards.c drivers/media/usb/cx231xx/cx231xx-dvb.c drivers/media/usb/cx231xx/cx231xx.h
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -33,6 +33,7 @@
|
|||||||
*.lzo
|
*.lzo
|
||||||
*.patch
|
*.patch
|
||||||
*.gcno
|
*.gcno
|
||||||
|
*.ll
|
||||||
modules.builtin
|
modules.builtin
|
||||||
Module.symvers
|
Module.symvers
|
||||||
*.dwo
|
*.dwo
|
||||||
|
6
.mailmap
6
.mailmap
@@ -99,6 +99,8 @@ Linas Vepstas <linas@austin.ibm.com>
|
|||||||
Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@web.de>
|
Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@web.de>
|
||||||
Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@ascom.ch>
|
Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@ascom.ch>
|
||||||
Mark Brown <broonie@sirena.org.uk>
|
Mark Brown <broonie@sirena.org.uk>
|
||||||
|
Martin Kepplinger <martink@posteo.de> <martin.kepplinger@theobroma-systems.com>
|
||||||
|
Martin Kepplinger <martink@posteo.de> <martin.kepplinger@ginzinger.com>
|
||||||
Matthieu CASTET <castet.matthieu@free.fr>
|
Matthieu CASTET <castet.matthieu@free.fr>
|
||||||
Mauro Carvalho Chehab <mchehab@kernel.org> <mchehab@brturbo.com.br>
|
Mauro Carvalho Chehab <mchehab@kernel.org> <mchehab@brturbo.com.br>
|
||||||
Mauro Carvalho Chehab <mchehab@kernel.org> <maurochehab@gmail.com>
|
Mauro Carvalho Chehab <mchehab@kernel.org> <maurochehab@gmail.com>
|
||||||
@@ -109,6 +111,7 @@ Mauro Carvalho Chehab <mchehab@kernel.org> <mchehab@osg.samsung.com>
|
|||||||
Mauro Carvalho Chehab <mchehab@kernel.org> <mchehab@s-opensource.com>
|
Mauro Carvalho Chehab <mchehab@kernel.org> <mchehab@s-opensource.com>
|
||||||
Matt Ranostay <mranostay@gmail.com> Matthew Ranostay <mranostay@embeddedalley.com>
|
Matt Ranostay <mranostay@gmail.com> Matthew Ranostay <mranostay@embeddedalley.com>
|
||||||
Matt Ranostay <mranostay@gmail.com> <matt.ranostay@intel.com>
|
Matt Ranostay <mranostay@gmail.com> <matt.ranostay@intel.com>
|
||||||
|
Matt Ranostay <matt.ranostay@konsulko.com> <matt@ranostay.consulting>
|
||||||
Mayuresh Janorkar <mayur@ti.com>
|
Mayuresh Janorkar <mayur@ti.com>
|
||||||
Michael Buesch <m@bues.ch>
|
Michael Buesch <m@bues.ch>
|
||||||
Michel Dänzer <michel@tungstengraphics.com>
|
Michel Dänzer <michel@tungstengraphics.com>
|
||||||
@@ -143,6 +146,8 @@ Santosh Shilimkar <ssantosh@kernel.org>
|
|||||||
Santosh Shilimkar <santosh.shilimkar@oracle.org>
|
Santosh Shilimkar <santosh.shilimkar@oracle.org>
|
||||||
Sascha Hauer <s.hauer@pengutronix.de>
|
Sascha Hauer <s.hauer@pengutronix.de>
|
||||||
S.Çağlar Onur <caglar@pardus.org.tr>
|
S.Çağlar Onur <caglar@pardus.org.tr>
|
||||||
|
Sebastian Reichel <sre@kernel.org> <sre@debian.org>
|
||||||
|
Sebastian Reichel <sre@kernel.org> <sebastian.reichel@collabora.co.uk>
|
||||||
Shiraz Hashim <shiraz.linux.kernel@gmail.com> <shiraz.hashim@st.com>
|
Shiraz Hashim <shiraz.linux.kernel@gmail.com> <shiraz.hashim@st.com>
|
||||||
Shuah Khan <shuah@kernel.org> <shuahkhan@gmail.com>
|
Shuah Khan <shuah@kernel.org> <shuahkhan@gmail.com>
|
||||||
Shuah Khan <shuah@kernel.org> <shuah.khan@hp.com>
|
Shuah Khan <shuah@kernel.org> <shuah.khan@hp.com>
|
||||||
@@ -171,6 +176,7 @@ Vlad Dogaru <ddvlad@gmail.com> <vlad.dogaru@intel.com>
|
|||||||
Vladimir Davydov <vdavydov.dev@gmail.com> <vdavydov@virtuozzo.com>
|
Vladimir Davydov <vdavydov.dev@gmail.com> <vdavydov@virtuozzo.com>
|
||||||
Vladimir Davydov <vdavydov.dev@gmail.com> <vdavydov@parallels.com>
|
Vladimir Davydov <vdavydov.dev@gmail.com> <vdavydov@parallels.com>
|
||||||
Takashi YOSHII <takashi.yoshii.zj@renesas.com>
|
Takashi YOSHII <takashi.yoshii.zj@renesas.com>
|
||||||
|
Yakir Yang <kuankuan.y@gmail.com> <ykk@rock-chips.com>
|
||||||
Yusuke Goda <goda.yusuke@renesas.com>
|
Yusuke Goda <goda.yusuke@renesas.com>
|
||||||
Gustavo Padovan <gustavo@las.ic.unicamp.br>
|
Gustavo Padovan <gustavo@las.ic.unicamp.br>
|
||||||
Gustavo Padovan <padovan@profusion.mobi>
|
Gustavo Padovan <padovan@profusion.mobi>
|
||||||
|
8
CREDITS
8
CREDITS
@@ -1034,6 +1034,10 @@ S: 2037 Walnut #6
|
|||||||
S: Boulder, Colorado 80302
|
S: Boulder, Colorado 80302
|
||||||
S: USA
|
S: USA
|
||||||
|
|
||||||
|
N: Hans-Christian Noren Egtvedt
|
||||||
|
E: egtvedt@samfundet.no
|
||||||
|
D: AVR32 architecture maintainer.
|
||||||
|
|
||||||
N: Heiko Eißfeldt
|
N: Heiko Eißfeldt
|
||||||
E: heiko@colossus.escape.de heiko@unifix.de
|
E: heiko@colossus.escape.de heiko@unifix.de
|
||||||
D: verify_area stuff, generic SCSI fixes
|
D: verify_area stuff, generic SCSI fixes
|
||||||
@@ -3398,6 +3402,10 @@ S: Suite 101
|
|||||||
S: Markham, Ontario L3R 2Z6
|
S: Markham, Ontario L3R 2Z6
|
||||||
S: Canada
|
S: Canada
|
||||||
|
|
||||||
|
N: Haavard Skinnemoen
|
||||||
|
M: Haavard Skinnemoen <hskinnemoen@gmail.com>
|
||||||
|
D: AVR32 architecture port to Linux and maintainer.
|
||||||
|
|
||||||
N: Rick Sladkey
|
N: Rick Sladkey
|
||||||
E: jrs@world.std.com
|
E: jrs@world.std.com
|
||||||
D: utility hacker: Emacs, NFS server, mount, kmem-ps, UPS debugger, strace, GDB
|
D: utility hacker: Emacs, NFS server, mount, kmem-ps, UPS debugger, strace, GDB
|
||||||
|
@@ -412,6 +412,8 @@ sysctl/
|
|||||||
- directory with info on the /proc/sys/* files.
|
- directory with info on the /proc/sys/* files.
|
||||||
target/
|
target/
|
||||||
- directory with info on generating TCM v4 fabric .ko modules
|
- directory with info on generating TCM v4 fabric .ko modules
|
||||||
|
tee.txt
|
||||||
|
- info on the TEE subsystem and drivers
|
||||||
this_cpu_ops.txt
|
this_cpu_ops.txt
|
||||||
- List rationale behind and the way to use this_cpu operations.
|
- List rationale behind and the way to use this_cpu operations.
|
||||||
thermal/
|
thermal/
|
||||||
|
8
Documentation/ABI/obsolete/sysfs-firmware-acpi
Normal file
8
Documentation/ABI/obsolete/sysfs-firmware-acpi
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
What: /sys/firmware/acpi/hotplug/force_remove
|
||||||
|
Date: Mar 2017
|
||||||
|
Contact: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||||
|
Description:
|
||||||
|
Since the force_remove is inherently broken and dangerous to
|
||||||
|
use for some hotpluggable resources like memory (because ignoring
|
||||||
|
the offline failure might lead to memory corruption and crashes)
|
||||||
|
enabling this knob is not safe and thus unsupported.
|
@@ -9,7 +9,7 @@ Description:
|
|||||||
hubs this facility is always enabled and their device
|
hubs this facility is always enabled and their device
|
||||||
directories will not contain this file.
|
directories will not contain this file.
|
||||||
|
|
||||||
For more information, see Documentation/usb/persist.txt.
|
For more information, see Documentation/driver-api/usb/persist.rst.
|
||||||
|
|
||||||
What: /sys/bus/usb/devices/.../power/autosuspend
|
What: /sys/bus/usb/devices/.../power/autosuspend
|
||||||
Date: March 2007
|
Date: March 2007
|
||||||
|
@@ -16,7 +16,8 @@ The vDSO uses symbol versioning; whenever you request a symbol from the
|
|||||||
vDSO, specify the version you are expecting.
|
vDSO, specify the version you are expecting.
|
||||||
|
|
||||||
Programs that dynamically link to glibc will use the vDSO automatically.
|
Programs that dynamically link to glibc will use the vDSO automatically.
|
||||||
Otherwise, you can use the reference parser in Documentation/vDSO/parse_vdso.c.
|
Otherwise, you can use the reference parser in
|
||||||
|
tools/testing/selftests/vDSO/parse_vdso.c.
|
||||||
|
|
||||||
Unless otherwise noted, the set of symbols with any given version and the
|
Unless otherwise noted, the set of symbols with any given version and the
|
||||||
ABI of those symbols is considered stable. It may vary across architectures,
|
ABI of those symbols is considered stable. It may vary across architectures,
|
||||||
|
@@ -213,14 +213,8 @@ What: /sys/block/<disk>/queue/discard_zeroes_data
|
|||||||
Date: May 2011
|
Date: May 2011
|
||||||
Contact: Martin K. Petersen <martin.petersen@oracle.com>
|
Contact: Martin K. Petersen <martin.petersen@oracle.com>
|
||||||
Description:
|
Description:
|
||||||
Devices that support discard functionality may return
|
Will always return 0. Don't rely on any specific behavior
|
||||||
stale or random data when a previously discarded block
|
for discards, and don't read this file.
|
||||||
is read back. This can cause problems if the filesystem
|
|
||||||
expects discarded blocks to be explicitly cleared. If a
|
|
||||||
device reports that it deterministically returns zeroes
|
|
||||||
when a discarded area is read the discard_zeroes_data
|
|
||||||
parameter will be set to one. Otherwise it will be 0 and
|
|
||||||
the result of reading a discarded area is undefined.
|
|
||||||
|
|
||||||
What: /sys/block/<disk>/queue/write_same_max_bytes
|
What: /sys/block/<disk>/queue/write_same_max_bytes
|
||||||
Date: January 2012
|
Date: January 2012
|
||||||
|
@@ -55,6 +55,7 @@ Description:
|
|||||||
then it is to be found in the base device directory.
|
then it is to be found in the base device directory.
|
||||||
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/sampling_frequency_available
|
What: /sys/bus/iio/devices/iio:deviceX/sampling_frequency_available
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_proximity_sampling_frequency_available
|
||||||
What: /sys/.../iio:deviceX/buffer/sampling_frequency_available
|
What: /sys/.../iio:deviceX/buffer/sampling_frequency_available
|
||||||
What: /sys/bus/iio/devices/triggerX/sampling_frequency_available
|
What: /sys/bus/iio/devices/triggerX/sampling_frequency_available
|
||||||
KernelVersion: 2.6.35
|
KernelVersion: 2.6.35
|
||||||
@@ -1593,7 +1594,7 @@ Description:
|
|||||||
can be processed to siemens per meter.
|
can be processed to siemens per meter.
|
||||||
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_countY_raw
|
What: /sys/bus/iio/devices/iio:deviceX/in_countY_raw
|
||||||
KernelVersion: 4.9
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
Raw counter device counts from channel Y. For quadrature
|
Raw counter device counts from channel Y. For quadrature
|
||||||
@@ -1601,10 +1602,24 @@ Description:
|
|||||||
the counts of a single quadrature signal phase from channel Y.
|
the counts of a single quadrature signal phase from channel Y.
|
||||||
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_indexY_raw
|
What: /sys/bus/iio/devices/iio:deviceX/in_indexY_raw
|
||||||
KernelVersion: 4.9
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
Raw counter device index value from channel Y. This attribute
|
Raw counter device index value from channel Y. This attribute
|
||||||
provides an absolute positional reference (e.g. a pulse once per
|
provides an absolute positional reference (e.g. a pulse once per
|
||||||
revolution) which may be used to home positional systems as
|
revolution) which may be used to home positional systems as
|
||||||
required.
|
required.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_count_count_direction_available
|
||||||
|
KernelVersion: 4.12
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
A list of possible counting directions which are:
|
||||||
|
- "up" : counter device is increasing.
|
||||||
|
- "down": counter device is decreasing.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_countY_count_direction
|
||||||
|
KernelVersion: 4.12
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Raw counter device counters direction for channel Y.
|
||||||
|
17
Documentation/ABI/testing/sysfs-bus-iio-adc-max9611
Normal file
17
Documentation/ABI/testing/sysfs-bus-iio-adc-max9611
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_power_shunt_resistor
|
||||||
|
Date: March 2017
|
||||||
|
KernelVersion: 4.12
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description: The value of the shunt resistor used to compute power drain on
|
||||||
|
common input voltage pin (RS+). In Ohms.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_current_shunt_resistor
|
||||||
|
Date: March 2017
|
||||||
|
KernelVersion: 4.12
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description: The value of the shunt resistor used to compute current flowing
|
||||||
|
between RS+ and RS- voltage sense inputs. In Ohms.
|
||||||
|
|
||||||
|
These attributes describe a single physical component, exposed as two distinct
|
||||||
|
attributes as it is used to calculate two different values: power load and
|
||||||
|
current flowing between RS+ and RS- inputs.
|
@@ -1,24 +1,16 @@
|
|||||||
What: /sys/bus/iio/devices/iio:deviceX/in_count_count_direction_available
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_count_count_mode_available
|
What: /sys/bus/iio/devices/iio:deviceX/in_count_count_mode_available
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_count_noise_error_available
|
What: /sys/bus/iio/devices/iio:deviceX/in_count_noise_error_available
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_count_quadrature_mode_available
|
What: /sys/bus/iio/devices/iio:deviceX/in_count_quadrature_mode_available
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_index_index_polarity_available
|
What: /sys/bus/iio/devices/iio:deviceX/in_index_index_polarity_available
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_index_synchronous_mode_available
|
What: /sys/bus/iio/devices/iio:deviceX/in_index_synchronous_mode_available
|
||||||
KernelVersion: 4.9
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
Discrete set of available values for the respective counter
|
Discrete set of available values for the respective counter
|
||||||
configuration are listed in this file.
|
configuration are listed in this file.
|
||||||
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_countY_count_direction
|
|
||||||
KernelVersion: 4.9
|
|
||||||
Contact: linux-iio@vger.kernel.org
|
|
||||||
Description:
|
|
||||||
Read-only attribute that indicates whether the counter for
|
|
||||||
channel Y is counting up or down.
|
|
||||||
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_countY_count_mode
|
What: /sys/bus/iio/devices/iio:deviceX/in_countY_count_mode
|
||||||
KernelVersion: 4.9
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
Count mode for channel Y. Four count modes are available:
|
Count mode for channel Y. Four count modes are available:
|
||||||
@@ -52,7 +44,7 @@ Description:
|
|||||||
continuously throughout.
|
continuously throughout.
|
||||||
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_countY_noise_error
|
What: /sys/bus/iio/devices/iio:deviceX/in_countY_noise_error
|
||||||
KernelVersion: 4.9
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
Read-only attribute that indicates whether excessive noise is
|
Read-only attribute that indicates whether excessive noise is
|
||||||
@@ -60,14 +52,14 @@ Description:
|
|||||||
irrelevant in non-quadrature clock mode.
|
irrelevant in non-quadrature clock mode.
|
||||||
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_countY_preset
|
What: /sys/bus/iio/devices/iio:deviceX/in_countY_preset
|
||||||
KernelVersion: 4.9
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
If the counter device supports preset registers, the preset
|
If the counter device supports preset registers, the preset
|
||||||
count for channel Y is provided by this attribute.
|
count for channel Y is provided by this attribute.
|
||||||
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_countY_quadrature_mode
|
What: /sys/bus/iio/devices/iio:deviceX/in_countY_quadrature_mode
|
||||||
KernelVersion: 4.9
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
Configure channel Y counter for non-quadrature or quadrature
|
Configure channel Y counter for non-quadrature or quadrature
|
||||||
@@ -88,7 +80,7 @@ Description:
|
|||||||
decoded for UP/DN clock.
|
decoded for UP/DN clock.
|
||||||
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_countY_set_to_preset_on_index
|
What: /sys/bus/iio/devices/iio:deviceX/in_countY_set_to_preset_on_index
|
||||||
KernelVersion: 4.9
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
Whether to set channel Y counter with channel Y preset value
|
Whether to set channel Y counter with channel Y preset value
|
||||||
@@ -96,14 +88,14 @@ Description:
|
|||||||
Valid attribute values are boolean.
|
Valid attribute values are boolean.
|
||||||
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_indexY_index_polarity
|
What: /sys/bus/iio/devices/iio:deviceX/in_indexY_index_polarity
|
||||||
KernelVersion: 4.9
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
Active level of channel Y index input; irrelevant in
|
Active level of channel Y index input; irrelevant in
|
||||||
non-synchronous load mode.
|
non-synchronous load mode.
|
||||||
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_indexY_synchronous_mode
|
What: /sys/bus/iio/devices/iio:deviceX/in_indexY_synchronous_mode
|
||||||
KernelVersion: 4.9
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
Configure channel Y counter for non-synchronous or synchronous
|
Configure channel Y counter for non-synchronous or synchronous
|
||||||
|
@@ -3,11 +3,15 @@ KernelVersion: 4.11
|
|||||||
Contact: benjamin.gaignard@st.com
|
Contact: benjamin.gaignard@st.com
|
||||||
Description:
|
Description:
|
||||||
Reading returns the list of possible master modes, which are:
|
Reading returns the list of possible master modes, which are:
|
||||||
- "reset" : The UG bit from the TIMx_EGR register is used as trigger output (TRGO).
|
- "reset" : The UG bit from the TIMx_EGR register is
|
||||||
- "enable" : The Counter Enable signal CNT_EN is used as trigger output.
|
used as trigger output (TRGO).
|
||||||
|
- "enable" : The Counter Enable signal CNT_EN is used
|
||||||
|
as trigger output.
|
||||||
- "update" : The update event is selected as trigger output.
|
- "update" : The update event is selected as trigger output.
|
||||||
For instance a master timer can then be used as a prescaler for a slave timer.
|
For instance a master timer can then be used
|
||||||
- "compare_pulse" : The trigger output send a positive pulse when the CC1IF flag is to be set.
|
as a prescaler for a slave timer.
|
||||||
|
- "compare_pulse" : The trigger output sends a positive pulse
|
||||||
|
when the CC1IF flag is to be set.
|
||||||
- "OC1REF" : OC1REF signal is used as trigger output.
|
- "OC1REF" : OC1REF signal is used as trigger output.
|
||||||
- "OC2REF" : OC2REF signal is used as trigger output.
|
- "OC2REF" : OC2REF signal is used as trigger output.
|
||||||
- "OC3REF" : OC3REF signal is used as trigger output.
|
- "OC3REF" : OC3REF signal is used as trigger output.
|
||||||
@@ -27,3 +31,62 @@ Description:
|
|||||||
Reading returns the current sampling frequency.
|
Reading returns the current sampling frequency.
|
||||||
Writing a value other than 0 sets and starts sampling.
|
Writing a value other than 0 sets and starts sampling.
|
||||||
Writing 0 stops sampling.
|
Writing 0 stops sampling.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_count0_preset
|
||||||
|
KernelVersion: 4.12
|
||||||
|
Contact: benjamin.gaignard@st.com
|
||||||
|
Description:
|
||||||
|
Reading returns the current preset value.
|
||||||
|
Writing sets the preset value.
|
||||||
|
When counting up the counter starts from 0 and fires an
|
||||||
|
event when it reaches the preset value.
|
||||||
|
When counting down the counter starts from the preset value
|
||||||
|
and fires an event when it reaches 0.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_count_quadrature_mode_available
|
||||||
|
KernelVersion: 4.12
|
||||||
|
Contact: benjamin.gaignard@st.com
|
||||||
|
Description:
|
||||||
|
Reading returns the list of possible quadrature modes.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_count0_quadrature_mode
|
||||||
|
KernelVersion: 4.12
|
||||||
|
Contact: benjamin.gaignard@st.com
|
||||||
|
Description:
|
||||||
|
Configure the device counter quadrature modes:
|
||||||
|
channel_A:
|
||||||
|
Encoder A input serves as the count input and B as
|
||||||
|
the UP/DOWN direction control input.
|
||||||
|
|
||||||
|
channel_B:
|
||||||
|
Encoder B input serves as the count input and A as
|
||||||
|
the UP/DOWN direction control input.
|
||||||
|
|
||||||
|
quadrature:
|
||||||
|
Encoder A and B inputs are mixed to get direction
|
||||||
|
and count with a scale of 0.25.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_count_enable_mode_available
|
||||||
|
KernelVersion: 4.12
|
||||||
|
Contact: benjamin.gaignard@st.com
|
||||||
|
Description:
|
||||||
|
Reading returns the list of possible enable modes.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_count0_enable_mode
|
||||||
|
KernelVersion: 4.12
|
||||||
|
Contact: benjamin.gaignard@st.com
|
||||||
|
Description:
|
||||||
|
Configure the device counter enable modes, in all cases
|
||||||
|
counting direction is set by in_count0_count_direction
|
||||||
|
attribute and the counter is clocked by the internal clock.
|
||||||
|
always:
|
||||||
|
Counter is always ON.
|
||||||
|
|
||||||
|
gated:
|
||||||
|
Counting is enabled when connected trigger signal
|
||||||
|
level is high else counting is disabled.
|
||||||
|
|
||||||
|
triggered:
|
||||||
|
Counting is enabled on rising edge of the connected
|
||||||
|
trigger, and remains enabled for the duration of this
|
||||||
|
selected mode.
|
||||||
|
@@ -299,5 +299,27 @@ What: /sys/bus/pci/devices/.../revision
|
|||||||
Date: November 2016
|
Date: November 2016
|
||||||
Contact: Emil Velikov <emil.l.velikov@gmail.com>
|
Contact: Emil Velikov <emil.l.velikov@gmail.com>
|
||||||
Description:
|
Description:
|
||||||
This file contains the revision field of the the PCI device.
|
This file contains the revision field of the PCI device.
|
||||||
The value comes from device config space. The file is read only.
|
The value comes from device config space. The file is read only.
|
||||||
|
|
||||||
|
What: /sys/bus/pci/devices/.../sriov_drivers_autoprobe
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Bodong Wang <bodong@mellanox.com>
|
||||||
|
Description:
|
||||||
|
This file is associated with the PF of a device that
|
||||||
|
supports SR-IOV. It determines whether newly-enabled VFs
|
||||||
|
are immediately bound to a driver. It initially contains
|
||||||
|
1, which means the kernel automatically binds VFs to a
|
||||||
|
compatible driver immediately after they are enabled. If
|
||||||
|
an application writes 0 to the file before enabling VFs,
|
||||||
|
the kernel will not bind VFs to a driver.
|
||||||
|
|
||||||
|
A typical use case is to write 0 to this file, then enable
|
||||||
|
VFs, then assign the newly-created VFs to virtual machines.
|
||||||
|
Note that changing this file does not affect already-
|
||||||
|
enabled VFs. In this scenario, the user must first disable
|
||||||
|
the VFs, write 0 to sriov_drivers_autoprobe, then re-enable
|
||||||
|
the VFs.
|
||||||
|
|
||||||
|
This is similar to /sys/bus/pci/drivers_autoprobe, but
|
||||||
|
affects only the VFs associated with a specific PF.
|
||||||
|
@@ -21,3 +21,30 @@ Description:
|
|||||||
is responsible for coordination of driver and firmware
|
is responsible for coordination of driver and firmware
|
||||||
link framing mode, changing this setting to 'Y' if the
|
link framing mode, changing this setting to 'Y' if the
|
||||||
firmware is configured for 'raw-ip' mode.
|
firmware is configured for 'raw-ip' mode.
|
||||||
|
|
||||||
|
What: /sys/class/net/<iface>/qmi/add_mux
|
||||||
|
Date: March 2017
|
||||||
|
KernelVersion: 4.11
|
||||||
|
Contact: Bjørn Mork <bjorn@mork.no>
|
||||||
|
Description:
|
||||||
|
Unsigned integer.
|
||||||
|
|
||||||
|
Write a number ranging from 1 to 127 to add a qmap mux
|
||||||
|
based network device, supported by recent Qualcomm based
|
||||||
|
modems.
|
||||||
|
|
||||||
|
The network device will be called qmimux.
|
||||||
|
|
||||||
|
Userspace is in charge of managing the qmux network device
|
||||||
|
activation and data stream setup on the modem side by
|
||||||
|
using the proper QMI protocol requests.
|
||||||
|
|
||||||
|
What: /sys/class/net/<iface>/qmi/del_mux
|
||||||
|
Date: March 2017
|
||||||
|
KernelVersion: 4.11
|
||||||
|
Contact: Bjørn Mork <bjorn@mork.no>
|
||||||
|
Description:
|
||||||
|
Unsigned integer.
|
||||||
|
|
||||||
|
Write a number ranging from 1 to 127 to delete a previously
|
||||||
|
created qmap mux based network device.
|
||||||
|
96
Documentation/ABI/testing/sysfs-class-switchtec
Normal file
96
Documentation/ABI/testing/sysfs-class-switchtec
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
switchtec - Microsemi Switchtec PCI Switch Management Endpoint
|
||||||
|
|
||||||
|
For details on this subsystem look at Documentation/switchtec.txt.
|
||||||
|
|
||||||
|
What: /sys/class/switchtec
|
||||||
|
Date: 05-Jan-2017
|
||||||
|
KernelVersion: v4.11
|
||||||
|
Contact: Logan Gunthorpe <logang@deltatee.com>
|
||||||
|
Description: The switchtec class subsystem folder.
|
||||||
|
Each registered switchtec driver is represented by a switchtecX
|
||||||
|
subfolder (X being an integer >= 0).
|
||||||
|
|
||||||
|
|
||||||
|
What: /sys/class/switchtec/switchtec[0-9]+/component_id
|
||||||
|
Date: 05-Jan-2017
|
||||||
|
KernelVersion: v4.11
|
||||||
|
Contact: Logan Gunthorpe <logang@deltatee.com>
|
||||||
|
Description: Component identifier as stored in the hardware (eg. PM8543)
|
||||||
|
(read only)
|
||||||
|
Values: arbitrary string.
|
||||||
|
|
||||||
|
|
||||||
|
What: /sys/class/switchtec/switchtec[0-9]+/component_revision
|
||||||
|
Date: 05-Jan-2017
|
||||||
|
KernelVersion: v4.11
|
||||||
|
Contact: Logan Gunthorpe <logang@deltatee.com>
|
||||||
|
Description: Component revision stored in the hardware (read only)
|
||||||
|
Values: integer.
|
||||||
|
|
||||||
|
|
||||||
|
What: /sys/class/switchtec/switchtec[0-9]+/component_vendor
|
||||||
|
Date: 05-Jan-2017
|
||||||
|
KernelVersion: v4.11
|
||||||
|
Contact: Logan Gunthorpe <logang@deltatee.com>
|
||||||
|
Description: Component vendor as stored in the hardware (eg. MICROSEM)
|
||||||
|
(read only)
|
||||||
|
Values: arbitrary string.
|
||||||
|
|
||||||
|
|
||||||
|
What: /sys/class/switchtec/switchtec[0-9]+/device_version
|
||||||
|
Date: 05-Jan-2017
|
||||||
|
KernelVersion: v4.11
|
||||||
|
Contact: Logan Gunthorpe <logang@deltatee.com>
|
||||||
|
Description: Device version as stored in the hardware (read only)
|
||||||
|
Values: integer.
|
||||||
|
|
||||||
|
|
||||||
|
What: /sys/class/switchtec/switchtec[0-9]+/fw_version
|
||||||
|
Date: 05-Jan-2017
|
||||||
|
KernelVersion: v4.11
|
||||||
|
Contact: Logan Gunthorpe <logang@deltatee.com>
|
||||||
|
Description: Currently running firmware version (read only)
|
||||||
|
Values: integer (in hexadecimal).
|
||||||
|
|
||||||
|
|
||||||
|
What: /sys/class/switchtec/switchtec[0-9]+/partition
|
||||||
|
Date: 05-Jan-2017
|
||||||
|
KernelVersion: v4.11
|
||||||
|
Contact: Logan Gunthorpe <logang@deltatee.com>
|
||||||
|
Description: Partition number for this device in the switch (read only)
|
||||||
|
Values: integer.
|
||||||
|
|
||||||
|
|
||||||
|
What: /sys/class/switchtec/switchtec[0-9]+/partition_count
|
||||||
|
Date: 05-Jan-2017
|
||||||
|
KernelVersion: v4.11
|
||||||
|
Contact: Logan Gunthorpe <logang@deltatee.com>
|
||||||
|
Description: Total number of partitions in the switch (read only)
|
||||||
|
Values: integer.
|
||||||
|
|
||||||
|
|
||||||
|
What: /sys/class/switchtec/switchtec[0-9]+/product_id
|
||||||
|
Date: 05-Jan-2017
|
||||||
|
KernelVersion: v4.11
|
||||||
|
Contact: Logan Gunthorpe <logang@deltatee.com>
|
||||||
|
Description: Product identifier as stored in the hardware (eg. PSX 48XG3)
|
||||||
|
(read only)
|
||||||
|
Values: arbitrary string.
|
||||||
|
|
||||||
|
|
||||||
|
What: /sys/class/switchtec/switchtec[0-9]+/product_revision
|
||||||
|
Date: 05-Jan-2017
|
||||||
|
KernelVersion: v4.11
|
||||||
|
Contact: Logan Gunthorpe <logang@deltatee.com>
|
||||||
|
Description: Product revision stored in the hardware (eg. RevB)
|
||||||
|
(read only)
|
||||||
|
Values: arbitrary string.
|
||||||
|
|
||||||
|
|
||||||
|
What: /sys/class/switchtec/switchtec[0-9]+/product_vendor
|
||||||
|
Date: 05-Jan-2017
|
||||||
|
KernelVersion: v4.11
|
||||||
|
Contact: Logan Gunthorpe <logang@deltatee.com>
|
||||||
|
Description: Product vendor as stored in the hardware (eg. MICROSEM)
|
||||||
|
(read only)
|
||||||
|
Values: arbitrary string.
|
276
Documentation/ABI/testing/sysfs-class-typec
Normal file
276
Documentation/ABI/testing/sysfs-class-typec
Normal file
@@ -0,0 +1,276 @@
|
|||||||
|
USB Type-C port devices (eg. /sys/class/typec/port0/)
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>/data_role
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
The supported USB data roles. This attribute can be used for
|
||||||
|
requesting data role swapping on the port. Swapping is supported
|
||||||
|
as synchronous operation, so write(2) to the attribute will not
|
||||||
|
return until the operation has finished. The attribute is
|
||||||
|
notified about role changes so that poll(2) on the attribute
|
||||||
|
wakes up. Change on the role will also generate uevent
|
||||||
|
KOBJ_CHANGE on the port. The current role is shown in brackets,
|
||||||
|
for example "[host] device" when DRP port is in host mode.
|
||||||
|
|
||||||
|
Valid values: host, device
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>/power_role
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
The supported power roles. This attribute can be used to request
|
||||||
|
power role swap on the port when the port supports USB Power
|
||||||
|
Delivery. Swapping is supported as synchronous operation, so
|
||||||
|
write(2) to the attribute will not return until the operation
|
||||||
|
has finished. The attribute is notified about role changes so
|
||||||
|
that poll(2) on the attribute wakes up. Change on the role will
|
||||||
|
also generate uevent KOBJ_CHANGE. The current role is shown in
|
||||||
|
brackets, for example "[source] sink" when in source mode.
|
||||||
|
|
||||||
|
Valid values: source, sink
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>/vconn_source
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
Shows if the port is the VCONN Source. This attribute can be used to
|
||||||
|
request VCONN swap to change the VCONN Source during connection
|
||||||
|
when both the port and the partner support USB Power Delivery.
|
||||||
|
Swapping is supported as synchronous operation, so write(2) to
|
||||||
|
the attribute will not return until the operation has finished.
|
||||||
|
The attribute is notified about VCONN source changes so that
|
||||||
|
poll(2) on the attribute wakes up. Change on VCONN source also
|
||||||
|
generates uevent KOBJ_CHANGE.
|
||||||
|
|
||||||
|
Valid values:
|
||||||
|
- "no" when the port is not the VCONN Source
|
||||||
|
- "yes" when the port is the VCONN Source
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>/power_operation_mode
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
Shows the current power operational mode the port is in. The
|
||||||
|
power operation mode means current level for VBUS. In case USB
|
||||||
|
Power Delivery communication is used for negotiating the levels,
|
||||||
|
power operation mode should show "usb_power_delivery".
|
||||||
|
|
||||||
|
Valid values:
|
||||||
|
- default
|
||||||
|
- 1.5A
|
||||||
|
- 3.0A
|
||||||
|
- usb_power_delivery
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>/preferred_role
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
The user space can notify the driver about the preferred role.
|
||||||
|
It should be handled as enabling of Try.SRC or Try.SNK, as
|
||||||
|
defined in USB Type-C specification, in the port drivers. By
|
||||||
|
default the preferred role should come from the platform.
|
||||||
|
|
||||||
|
Valid values: source, sink, none (to remove preference)
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>/supported_accessory_modes
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
Space separated list of accessory modes, defined in the USB
|
||||||
|
Type-C specification, the port supports.
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>/usb_power_delivery_revision
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
Revision number of the supported USB Power Delivery
|
||||||
|
specification, or 0 when USB Power Delivery is not supported.
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>/usb_typec_revision
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
Revision number of the supported USB Type-C specification.
|
||||||
|
|
||||||
|
|
||||||
|
USB Type-C partner devices (eg. /sys/class/typec/port0-partner/)
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>-partner/accessory_mode
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
Shows the Accessory Mode name when the partner is an Accessory.
|
||||||
|
The Accessory Modes are defined in USB Type-C Specification.
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>-partner/supports_usb_power_delivery
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
Shows if the partner supports USB Power Delivery communication:
|
||||||
|
Valid values: yes, no
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>-partner/identity/
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
This directory appears only if the port device driver is capable
|
||||||
|
of showing the result of Discover Identity USB power delivery
|
||||||
|
command. That will not always be possible even when USB power
|
||||||
|
delivery is supported, for example when USB power delivery
|
||||||
|
communication for the port is mostly handled in firmware. If the
|
||||||
|
directory exists, it will have an attribute file for every VDO
|
||||||
|
in Discover Identity command result.
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>-partner/identity/id_header
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
ID Header VDO part of Discover Identity command result. The
|
||||||
|
value will show 0 until Discover Identity command result becomes
|
||||||
|
available. The value can be polled.
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>-partner/identity/cert_stat
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
Cert Stat VDO part of Discover Identity command result. The
|
||||||
|
value will show 0 until Discover Identity command result becomes
|
||||||
|
available. The value can be polled.
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>-partner/identity/product
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
Product VDO part of Discover Identity command result. The value
|
||||||
|
will show 0 until Discover Identity command result becomes
|
||||||
|
available. The value can be polled.
|
||||||
|
|
||||||
|
|
||||||
|
USB Type-C cable devices (eg. /sys/class/typec/port0-cable/)
|
||||||
|
|
||||||
|
Note: Electronically Marked Cables will have a device also for one cable plug
|
||||||
|
(eg. /sys/class/typec/port0-plug0). If the cable is active and has also SOP
|
||||||
|
Double Prime controller (USB Power Delivery specification ch. 2.4) it will have
|
||||||
|
second device also for the other plug. Both plugs may have alternate modes as
|
||||||
|
described in USB Type-C and USB Power Delivery specifications.
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>-cable/type
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
Shows if the cable is active.
|
||||||
|
Valid values: active, passive
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>-cable/plug_type
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
Shows type of the plug on the cable:
|
||||||
|
- type-a - Standard A
|
||||||
|
- type-b - Standard B
|
||||||
|
- type-c
|
||||||
|
- captive
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>-cable/identity/
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
This directory appears only if the port device driver is capable
|
||||||
|
of showing the result of Discover Identity USB power delivery
|
||||||
|
command. That will not always be possible even when USB power
|
||||||
|
delivery is supported. If the directory exists, it will have an
|
||||||
|
attribute for every VDO returned by Discover Identity command.
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>-cable/identity/id_header
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
ID Header VDO part of Discover Identity command result. The
|
||||||
|
value will show 0 until Discover Identity command result becomes
|
||||||
|
available. The value can be polled.
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>-cable/identity/cert_stat
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
Cert Stat VDO part of Discover Identity command result. The
|
||||||
|
value will show 0 until Discover Identity command result becomes
|
||||||
|
available. The value can be polled.
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>-cable/identity/product
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
Product VDO part of Discover Identity command result. The value
|
||||||
|
will show 0 until Discover Identity command result becomes
|
||||||
|
available. The value can be polled.
|
||||||
|
|
||||||
|
|
||||||
|
Alternate Mode devices.
|
||||||
|
|
||||||
|
The alternate modes will have Standard or Vendor ID (SVID) assigned by USB-IF.
|
||||||
|
The ports, partners and cable plugs can have alternate modes. A supported SVID
|
||||||
|
will consist of a set of modes. Every SVID a port/partner/plug supports will
|
||||||
|
have a device created for it, and every supported mode for a supported SVID will
|
||||||
|
have its own directory under that device. Below <dev> refers to the device for
|
||||||
|
the alternate mode.
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port|partner|cable>/<dev>/svid
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
The SVID (Standard or Vendor ID) assigned by USB-IF for this
|
||||||
|
alternate mode.
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port|partner|cable>/<dev>/mode<index>/
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
Every supported mode will have its own directory. The name of
|
||||||
|
a mode will be "mode<index>" (for example mode1), where <index>
|
||||||
|
is the actual index to the mode VDO returned by Discover Modes
|
||||||
|
USB power delivery command.
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port|partner|cable>/<dev>/mode<index>/description
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
Shows description of the mode. The description is optional for
|
||||||
|
the drivers, just like with the Billboard Devices.
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port|partner|cable>/<dev>/mode<index>/vdo
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
Shows the VDO in hexadecimal returned by Discover Modes command
|
||||||
|
for this mode.
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port|partner|cable>/<dev>/mode<index>/active
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
Shows if the mode is active or not. The attribute can be used
|
||||||
|
for entering/exiting the mode with partners and cable plugs, and
|
||||||
|
with the port alternate modes it can be used for disabling
|
||||||
|
support for specific alternate modes. Entering/exiting modes is
|
||||||
|
supported as synchronous operation so write(2) to the attribute
|
||||||
|
does not return until the enter/exit mode operation has
|
||||||
|
finished. The attribute is notified when the mode is
|
||||||
|
entered/exited so poll(2) on the attribute wakes up.
|
||||||
|
Entering/exiting a mode will also generate uevent KOBJ_CHANGE.
|
||||||
|
|
||||||
|
Valid values: yes, no
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>/<dev>/mode<index>/supported_roles
|
||||||
|
Date: April 2017
|
||||||
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
Space separated list of the supported roles.
|
||||||
|
|
||||||
|
This attribute is available for the devices describing the
|
||||||
|
alternate modes a port supports, and it will not be exposed with
|
||||||
|
the devices presenting the alternate modes the partners or cable
|
||||||
|
plugs support.
|
||||||
|
|
||||||
|
Valid values: source, sink
|
@@ -366,3 +366,10 @@ Contact: Linux ARM Kernel Mailing list <linux-arm-kernel@lists.infradead.org>
|
|||||||
Description: AArch64 CPU registers
|
Description: AArch64 CPU registers
|
||||||
'identification' directory exposes the CPU ID registers for
|
'identification' directory exposes the CPU ID registers for
|
||||||
identifying model and revision of the CPU.
|
identifying model and revision of the CPU.
|
||||||
|
|
||||||
|
What: /sys/devices/system/cpu/cpu#/cpu_capacity
|
||||||
|
Date: December 2016
|
||||||
|
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
|
||||||
|
Description: information about CPUs heterogeneity.
|
||||||
|
|
||||||
|
cpu_capacity: capacity of cpu#.
|
||||||
|
@@ -44,16 +44,6 @@ Description:
|
|||||||
or 0 (unset). Attempts to write any other values to it will
|
or 0 (unset). Attempts to write any other values to it will
|
||||||
cause -EINVAL to be returned.
|
cause -EINVAL to be returned.
|
||||||
|
|
||||||
What: /sys/firmware/acpi/hotplug/force_remove
|
|
||||||
Date: May 2013
|
|
||||||
Contact: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
||||||
Description:
|
|
||||||
The number in this file (0 or 1) determines whether (1) or not
|
|
||||||
(0) the ACPI subsystem will allow devices to be hot-removed even
|
|
||||||
if they cannot be put offline gracefully (from the kernel's
|
|
||||||
viewpoint). That number can be changed by writing a boolean
|
|
||||||
value to this file.
|
|
||||||
|
|
||||||
What: /sys/firmware/acpi/interrupts/
|
What: /sys/firmware/acpi/interrupts/
|
||||||
Date: February 2008
|
Date: February 2008
|
||||||
Contact: Len Brown <lenb@kernel.org>
|
Contact: Len Brown <lenb@kernel.org>
|
||||||
|
@@ -25,6 +25,14 @@ Description:
|
|||||||
code is currently applied. Writing 0 will disable the patch
|
code is currently applied. Writing 0 will disable the patch
|
||||||
while writing 1 will re-enable the patch.
|
while writing 1 will re-enable the patch.
|
||||||
|
|
||||||
|
What: /sys/kernel/livepatch/<patch>/transition
|
||||||
|
Date: Feb 2017
|
||||||
|
KernelVersion: 4.12.0
|
||||||
|
Contact: live-patching@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
An attribute which indicates whether the patch is currently in
|
||||||
|
transition.
|
||||||
|
|
||||||
What: /sys/kernel/livepatch/<patch>/<object>
|
What: /sys/kernel/livepatch/<patch>/<object>
|
||||||
Date: Nov 2014
|
Date: Nov 2014
|
||||||
KernelVersion: 3.19.0
|
KernelVersion: 3.19.0
|
||||||
|
9
Documentation/ABI/testing/sysfs-platform-chipidea-usb2
Normal file
9
Documentation/ABI/testing/sysfs-platform-chipidea-usb2
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
What: /sys/bus/platform/devices/ci_hdrc.0/role
|
||||||
|
Date: Mar 2017
|
||||||
|
Contact: Peter Chen <peter.chen@nxp.com>
|
||||||
|
Description:
|
||||||
|
Reading this file returns the string "gadget" or "host", indicating the
|
||||||
|
current controller role.
|
||||||
|
|
||||||
|
Writing "gadget" or "host" to this file performs a role switch.
|
||||||
|
Only a controller in dual-role configuration supports writing.
|
15
Documentation/ABI/testing/sysfs-platform-renesas_usb3
Normal file
15
Documentation/ABI/testing/sysfs-platform-renesas_usb3
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
What: /sys/devices/platform/<renesas_usb3's name>/role
|
||||||
|
Date: March 2017
|
||||||
|
KernelVersion: 4.13
|
||||||
|
Contact: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
|
||||||
|
Description:
|
||||||
|
This file can be read and written.
|
||||||
|
The file can show/change the drd mode of usb.
|
||||||
|
|
||||||
|
Write the following string to change the mode:
|
||||||
|
"host" - switching mode from peripheral to host.
|
||||||
|
"peripheral" - switching mode from host to peripheral.
|
||||||
|
|
||||||
|
Read the file, then it shows the following strings:
|
||||||
|
"host" - The mode is host now.
|
||||||
|
"peripheral" - The mode is peripheral now.
|
@@ -8,12 +8,11 @@
|
|||||||
|
|
||||||
DOCBOOKS := z8530book.xml \
|
DOCBOOKS := z8530book.xml \
|
||||||
kernel-hacking.xml kernel-locking.xml \
|
kernel-hacking.xml kernel-locking.xml \
|
||||||
writing_usb_driver.xml networking.xml \
|
networking.xml \
|
||||||
kernel-api.xml filesystems.xml lsm.xml kgdb.xml \
|
filesystems.xml lsm.xml kgdb.xml \
|
||||||
gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \
|
libata.xml mtdnand.xml librs.xml rapidio.xml \
|
||||||
genericirq.xml s390-drivers.xml scsi.xml \
|
s390-drivers.xml scsi.xml \
|
||||||
sh.xml w1.xml \
|
sh.xml w1.xml
|
||||||
writing_musb_glue_layer.xml
|
|
||||||
|
|
||||||
ifeq ($(DOCBOOKS),)
|
ifeq ($(DOCBOOKS),)
|
||||||
|
|
||||||
@@ -62,11 +61,14 @@ MAN := $(patsubst %.xml, %.9, $(BOOKS))
|
|||||||
mandocs: $(MAN)
|
mandocs: $(MAN)
|
||||||
find $(obj)/man -name '*.9' | xargs gzip -nf
|
find $(obj)/man -name '*.9' | xargs gzip -nf
|
||||||
|
|
||||||
|
# Default location for installed man pages
|
||||||
|
export INSTALL_MAN_PATH = $(objtree)/usr
|
||||||
|
|
||||||
installmandocs: mandocs
|
installmandocs: mandocs
|
||||||
mkdir -p /usr/local/man/man9/
|
mkdir -p $(INSTALL_MAN_PATH)/man/man9/
|
||||||
find $(obj)/man -name '*.9.gz' -printf '%h %f\n' | \
|
find $(obj)/man -name '*.9.gz' -printf '%h %f\n' | \
|
||||||
sort -k 2 -k 1 | uniq -f 1 | sed -e 's: :/:' | \
|
sort -k 2 -k 1 | uniq -f 1 | sed -e 's: :/:' | \
|
||||||
xargs install -m 644 -t /usr/local/man/man9/
|
xargs install -m 644 -t $(INSTALL_MAN_PATH)/man/man9/
|
||||||
|
|
||||||
# no-op for the DocBook toolchain
|
# no-op for the DocBook toolchain
|
||||||
epubdocs:
|
epubdocs:
|
||||||
@@ -238,7 +240,9 @@ dochelp:
|
|||||||
@echo ' psdocs - Postscript'
|
@echo ' psdocs - Postscript'
|
||||||
@echo ' xmldocs - XML DocBook'
|
@echo ' xmldocs - XML DocBook'
|
||||||
@echo ' mandocs - man pages'
|
@echo ' mandocs - man pages'
|
||||||
@echo ' installmandocs - install man pages generated by mandocs'
|
@echo ' installmandocs - install man pages generated by mandocs to INSTALL_MAN_PATH'; \
|
||||||
|
echo ' (default: $(INSTALL_MAN_PATH))'; \
|
||||||
|
echo ''
|
||||||
@echo ' cleandocs - clean all generated DocBook files'
|
@echo ' cleandocs - clean all generated DocBook files'
|
||||||
@echo
|
@echo
|
||||||
@echo ' make DOCBOOKS="s1.xml s2.xml" [target] Generate only docs s1.xml s2.xml'
|
@echo ' make DOCBOOKS="s1.xml s2.xml" [target] Generate only docs s1.xml s2.xml'
|
||||||
|
@@ -1,793 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
|
|
||||||
"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
|
|
||||||
|
|
||||||
<book id="USB-Gadget-API">
|
|
||||||
<bookinfo>
|
|
||||||
<title>USB Gadget API for Linux</title>
|
|
||||||
<date>20 August 2004</date>
|
|
||||||
<edition>20 August 2004</edition>
|
|
||||||
|
|
||||||
<legalnotice>
|
|
||||||
<para>
|
|
||||||
This documentation is free software; you can redistribute
|
|
||||||
it and/or modify it under the terms of the GNU General Public
|
|
||||||
License as published by the Free Software Foundation; either
|
|
||||||
version 2 of the License, or (at your option) any later
|
|
||||||
version.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
This program is distributed in the hope that it will be
|
|
||||||
useful, but WITHOUT ANY WARRANTY; without even the implied
|
|
||||||
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
||||||
See the GNU General Public License for more details.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
You should have received a copy of the GNU General Public
|
|
||||||
License along with this program; if not, write to the Free
|
|
||||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
|
|
||||||
MA 02111-1307 USA
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
For more details see the file COPYING in the source
|
|
||||||
distribution of Linux.
|
|
||||||
</para>
|
|
||||||
</legalnotice>
|
|
||||||
<copyright>
|
|
||||||
<year>2003-2004</year>
|
|
||||||
<holder>David Brownell</holder>
|
|
||||||
</copyright>
|
|
||||||
|
|
||||||
<author>
|
|
||||||
<firstname>David</firstname>
|
|
||||||
<surname>Brownell</surname>
|
|
||||||
<affiliation>
|
|
||||||
<address><email>dbrownell@users.sourceforge.net</email></address>
|
|
||||||
</affiliation>
|
|
||||||
</author>
|
|
||||||
</bookinfo>
|
|
||||||
|
|
||||||
<toc></toc>
|
|
||||||
|
|
||||||
<chapter id="intro"><title>Introduction</title>
|
|
||||||
|
|
||||||
<para>This document presents a Linux-USB "Gadget"
|
|
||||||
kernel mode
|
|
||||||
API, for use within peripherals and other USB devices
|
|
||||||
that embed Linux.
|
|
||||||
It provides an overview of the API structure,
|
|
||||||
and shows how that fits into a system development project.
|
|
||||||
This is the first such API released on Linux to address
|
|
||||||
a number of important problems, including: </para>
|
|
||||||
|
|
||||||
<itemizedlist>
|
|
||||||
<listitem><para>Supports USB 2.0, for high speed devices which
|
|
||||||
can stream data at several dozen megabytes per second.
|
|
||||||
</para></listitem>
|
|
||||||
<listitem><para>Handles devices with dozens of endpoints just as
|
|
||||||
well as ones with just two fixed-function ones. Gadget drivers
|
|
||||||
can be written so they're easy to port to new hardware.
|
|
||||||
</para></listitem>
|
|
||||||
<listitem><para>Flexible enough to expose more complex USB device
|
|
||||||
capabilities such as multiple configurations, multiple interfaces,
|
|
||||||
composite devices,
|
|
||||||
and alternate interface settings.
|
|
||||||
</para></listitem>
|
|
||||||
<listitem><para>USB "On-The-Go" (OTG) support, in conjunction
|
|
||||||
with updates to the Linux-USB host side.
|
|
||||||
</para></listitem>
|
|
||||||
<listitem><para>Sharing data structures and API models with the
|
|
||||||
Linux-USB host side API. This helps the OTG support, and
|
|
||||||
looks forward to more-symmetric frameworks (where the same
|
|
||||||
I/O model is used by both host and device side drivers).
|
|
||||||
</para></listitem>
|
|
||||||
<listitem><para>Minimalist, so it's easier to support new device
|
|
||||||
controller hardware. I/O processing doesn't imply large
|
|
||||||
demands for memory or CPU resources.
|
|
||||||
</para></listitem>
|
|
||||||
</itemizedlist>
|
|
||||||
|
|
||||||
|
|
||||||
<para>Most Linux developers will not be able to use this API, since they
|
|
||||||
have USB "host" hardware in a PC, workstation, or server.
|
|
||||||
Linux users with embedded systems are more likely to
|
|
||||||
have USB peripheral hardware.
|
|
||||||
To distinguish drivers running inside such hardware from the
|
|
||||||
more familiar Linux "USB device drivers",
|
|
||||||
which are host side proxies for the real USB devices,
|
|
||||||
a different term is used:
|
|
||||||
the drivers inside the peripherals are "USB gadget drivers".
|
|
||||||
In USB protocol interactions, the device driver is the master
|
|
||||||
(or "client driver")
|
|
||||||
and the gadget driver is the slave (or "function driver").
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>The gadget API resembles the host side Linux-USB API in that both
|
|
||||||
use queues of request objects to package I/O buffers, and those requests
|
|
||||||
may be submitted or canceled.
|
|
||||||
They share common definitions for the standard USB
|
|
||||||
<emphasis>Chapter 9</emphasis> messages, structures, and constants.
|
|
||||||
Also, both APIs bind and unbind drivers to devices.
|
|
||||||
The APIs differ in detail, since the host side's current
|
|
||||||
URB framework exposes a number of implementation details
|
|
||||||
and assumptions that are inappropriate for a gadget API.
|
|
||||||
While the model for control transfers and configuration
|
|
||||||
management is necessarily different (one side is a hardware-neutral master,
|
|
||||||
the other is a hardware-aware slave), the endpoint I/O API used here
|
|
||||||
should also be usable for an overhead-reduced host side API.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="structure"><title>Structure of Gadget Drivers</title>
|
|
||||||
|
|
||||||
<para>A system running inside a USB peripheral
|
|
||||||
normally has at least three layers inside the kernel to handle
|
|
||||||
USB protocol processing, and may have additional layers in
|
|
||||||
user space code.
|
|
||||||
The "gadget" API is used by the middle layer to interact
|
|
||||||
with the lowest level (which directly handles hardware).
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>In Linux, from the bottom up, these layers are:
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<variablelist>
|
|
||||||
|
|
||||||
<varlistentry>
|
|
||||||
<term><emphasis>USB Controller Driver</emphasis></term>
|
|
||||||
|
|
||||||
<listitem>
|
|
||||||
<para>This is the lowest software level.
|
|
||||||
It is the only layer that talks to hardware,
|
|
||||||
through registers, fifos, dma, irqs, and the like.
|
|
||||||
The <filename>&lt;linux/usb/gadget.h&gt;</filename> API abstracts
|
|
||||||
the peripheral controller endpoint hardware.
|
|
||||||
That hardware is exposed through endpoint objects, which accept
|
|
||||||
streams of IN/OUT buffers, and through callbacks that interact
|
|
||||||
with gadget drivers.
|
|
||||||
Since normal USB devices only have one upstream
|
|
||||||
port, they only have one of these drivers.
|
|
||||||
The controller driver can support any number of different
|
|
||||||
gadget drivers, but only one of them can be used at a time.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>Examples of such controller hardware include
|
|
||||||
the PCI-based NetChip 2280 USB 2.0 high speed controller,
|
|
||||||
the SA-11x0 or PXA-25x UDC (found within many PDAs),
|
|
||||||
and a variety of other products.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
</listitem></varlistentry>
|
|
||||||
|
|
||||||
<varlistentry>
|
|
||||||
<term><emphasis>Gadget Driver</emphasis></term>
|
|
||||||
|
|
||||||
<listitem>
|
|
||||||
<para>The lower boundary of this driver implements hardware-neutral
|
|
||||||
USB functions, using calls to the controller driver.
|
|
||||||
Because such hardware varies widely in capabilities and restrictions,
|
|
||||||
and is used in embedded environments where space is at a premium,
|
|
||||||
the gadget driver is often configured at compile time
|
|
||||||
to work with endpoints supported by one particular controller.
|
|
||||||
Gadget drivers may be portable to several different controllers,
|
|
||||||
using conditional compilation.
|
|
||||||
(Recent kernels substantially simplify the work involved in
|
|
||||||
supporting new hardware, by <emphasis>autoconfiguring</emphasis>
|
|
||||||
endpoints automatically for many bulk-oriented drivers.)
|
|
||||||
Gadget driver responsibilities include:
|
|
||||||
</para>
|
|
||||||
<itemizedlist>
|
|
||||||
<listitem><para>handling setup requests (ep0 protocol responses)
|
|
||||||
possibly including class-specific functionality
|
|
||||||
</para></listitem>
|
|
||||||
<listitem><para>returning configuration and string descriptors
|
|
||||||
</para></listitem>
|
|
||||||
<listitem><para>(re)setting configurations and interface
|
|
||||||
altsettings, including enabling and configuring endpoints
|
|
||||||
</para></listitem>
|
|
||||||
<listitem><para>handling life cycle events, such as managing
|
|
||||||
bindings to hardware,
|
|
||||||
USB suspend/resume, remote wakeup,
|
|
||||||
and disconnection from the USB host.
|
|
||||||
</para></listitem>
|
|
||||||
<listitem><para>managing IN and OUT transfers on all currently
|
|
||||||
enabled endpoints
|
|
||||||
</para></listitem>
|
|
||||||
</itemizedlist>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
Such drivers may be modules of proprietary code, although
|
|
||||||
that approach is discouraged in the Linux community.
|
|
||||||
</para>
|
|
||||||
</listitem></varlistentry>
|
|
||||||
|
|
||||||
<varlistentry>
|
|
||||||
<term><emphasis>Upper Level</emphasis></term>
|
|
||||||
|
|
||||||
<listitem>
|
|
||||||
<para>Most gadget drivers have an upper boundary that connects
|
|
||||||
to some other driver or framework in Linux.
|
|
||||||
Through that boundary flows the data which the gadget driver
|
|
||||||
produces and/or consumes through protocol transfers over USB.
|
|
||||||
Examples include:
|
|
||||||
</para>
|
|
||||||
<itemizedlist>
|
|
||||||
<listitem><para>user mode code, using generic (gadgetfs)
|
|
||||||
or application specific files in
|
|
||||||
<filename>/dev</filename>
|
|
||||||
</para></listitem>
|
|
||||||
<listitem><para>networking subsystem (for network gadgets,
|
|
||||||
like the CDC Ethernet Model gadget driver)
|
|
||||||
</para></listitem>
|
|
||||||
<listitem><para>data capture drivers, perhaps video4Linux or
|
|
||||||
a scanner driver; or test and measurement hardware.
|
|
||||||
</para></listitem>
|
|
||||||
<listitem><para>input subsystem (for HID gadgets)
|
|
||||||
</para></listitem>
|
|
||||||
<listitem><para>sound subsystem (for audio gadgets)
|
|
||||||
</para></listitem>
|
|
||||||
<listitem><para>file system (for PTP gadgets)
|
|
||||||
</para></listitem>
|
|
||||||
<listitem><para>block i/o subsystem (for usb-storage gadgets)
|
|
||||||
</para></listitem>
|
|
||||||
<listitem><para>... and more </para></listitem>
|
|
||||||
</itemizedlist>
|
|
||||||
</listitem></varlistentry>
|
|
||||||
|
|
||||||
<varlistentry>
|
|
||||||
<term><emphasis>Additional Layers</emphasis></term>
|
|
||||||
|
|
||||||
<listitem>
|
|
||||||
<para>Other layers may exist.
|
|
||||||
These could include kernel layers, such as network protocol stacks,
|
|
||||||
as well as user mode applications building on standard POSIX
|
|
||||||
system call APIs such as
|
|
||||||
<emphasis>open()</emphasis>, <emphasis>close()</emphasis>,
|
|
||||||
<emphasis>read()</emphasis> and <emphasis>write()</emphasis>.
|
|
||||||
On newer systems, POSIX Async I/O calls may be an option.
|
|
||||||
Such user mode code will not necessarily be subject to
|
|
||||||
the GNU General Public License (GPL).
|
|
||||||
</para>
|
|
||||||
</listitem></varlistentry>
|
|
||||||
|
|
||||||
|
|
||||||
</variablelist>
|
|
||||||
|
|
||||||
<para>OTG-capable systems will also need to include a standard Linux-USB
|
|
||||||
host side stack,
|
|
||||||
with <emphasis>usbcore</emphasis>,
|
|
||||||
one or more <emphasis>Host Controller Drivers</emphasis> (HCDs),
|
|
||||||
<emphasis>USB Device Drivers</emphasis> to support
|
|
||||||
the OTG "Targeted Peripheral List",
|
|
||||||
and so forth.
|
|
||||||
There will also be an <emphasis>OTG Controller Driver</emphasis>,
|
|
||||||
which is visible to gadget and device driver developers only indirectly.
|
|
||||||
That helps the host and device side USB controllers implement the
|
|
||||||
two new OTG protocols (HNP and SRP).
|
|
||||||
Roles switch (host to peripheral, or vice versa) using HNP
|
|
||||||
during USB suspend processing, and SRP can be viewed as a
|
|
||||||
more battery-friendly kind of device wakeup protocol.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>Over time, reusable utilities are evolving to help make some
|
|
||||||
gadget driver tasks simpler.
|
|
||||||
For example, building configuration descriptors from vectors of
|
|
||||||
descriptors for the configuration's interfaces and endpoints is
|
|
||||||
now automated, and many drivers now use autoconfiguration to
|
|
||||||
choose hardware endpoints and initialize their descriptors.
|
|
||||||
|
|
||||||
A potential example of particular interest
|
|
||||||
is code implementing standard USB-IF protocols for
|
|
||||||
HID, networking, storage, or audio classes.
|
|
||||||
Some developers are interested in KDB or KGDB hooks, to let
|
|
||||||
target hardware be remotely debugged.
|
|
||||||
Most such USB protocol code doesn't need to be hardware-specific,
|
|
||||||
any more than network protocols like X11, HTTP, or NFS are.
|
|
||||||
Such gadget-side interface drivers should eventually be combined,
|
|
||||||
to implement composite devices.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
|
|
||||||
<chapter id="api"><title>Kernel Mode Gadget API</title>
|
|
||||||
|
|
||||||
<para>Gadget drivers declare themselves through a
|
|
||||||
<emphasis>struct usb_gadget_driver</emphasis>, which is responsible for
|
|
||||||
most parts of enumeration for a <emphasis>struct usb_gadget</emphasis>.
|
|
||||||
The response to a set_configuration usually involves
|
|
||||||
enabling one or more of the <emphasis>struct usb_ep</emphasis> objects
|
|
||||||
exposed by the gadget, and submitting one or more
|
|
||||||
<emphasis>struct usb_request</emphasis> buffers to transfer data.
|
|
||||||
Understand those four data types, and their operations, and
|
|
||||||
you will understand how this API works.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<note><title>Incomplete Data Type Descriptions</title>
|
|
||||||
|
|
||||||
<para>This documentation was prepared using the standard Linux
|
|
||||||
kernel <filename>docproc</filename> tool, which turns text
|
|
||||||
and in-code comments into SGML DocBook and then into usable
|
|
||||||
formats such as HTML or PDF.
|
|
||||||
Other than the "Chapter 9" data types, most of the significant
|
|
||||||
data types and functions are described here.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>However, docproc does not understand all the C constructs
|
|
||||||
that are used, so some relevant information is likely omitted from
|
|
||||||
what you are reading.
|
|
||||||
One example of such information is endpoint autoconfiguration.
|
|
||||||
You'll have to read the header file, and use example source
|
|
||||||
code (such as that for "Gadget Zero"), to fully understand the API.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>The part of the API implementing some basic
|
|
||||||
driver capabilities is specific to the version of the
|
|
||||||
Linux kernel that's in use.
|
|
||||||
The 2.6 kernel includes a <emphasis>driver model</emphasis>
|
|
||||||
framework that has no analogue on earlier kernels;
|
|
||||||
so those parts of the gadget API are not fully portable.
|
|
||||||
(They are implemented on 2.4 kernels, but in a different way.)
|
|
||||||
The driver model state is another part of this API that is
|
|
||||||
ignored by the kerneldoc tools.
|
|
||||||
</para>
|
|
||||||
</note>
|
|
||||||
|
|
||||||
<para>The core API does not expose
|
|
||||||
every possible hardware feature, only the most widely available ones.
|
|
||||||
There are significant hardware features, such as device-to-device DMA
|
|
||||||
(without temporary storage in a memory buffer)
|
|
||||||
that would be added using hardware-specific APIs.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>This API allows drivers to use conditional compilation to handle
|
|
||||||
endpoint capabilities of different hardware, but doesn't require that.
|
|
||||||
Hardware tends to have arbitrary restrictions, relating to
|
|
||||||
transfer types, addressing, packet sizes, buffering, and availability.
|
|
||||||
As a rule, such differences only matter for "endpoint zero" logic
|
|
||||||
that handles device configuration and management.
|
|
||||||
The API supports limited run-time
|
|
||||||
detection of capabilities, through naming conventions for endpoints.
|
|
||||||
Many drivers will be able to at least partially autoconfigure
|
|
||||||
themselves.
|
|
||||||
In particular, driver init sections will often have endpoint
|
|
||||||
autoconfiguration logic that scans the hardware's list of endpoints
|
|
||||||
to find ones matching the driver requirements
|
|
||||||
(relying on those conventions), to eliminate some of the most
|
|
||||||
common reasons for conditional compilation.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>Like the Linux-USB host side API, this API exposes
|
|
||||||
the "chunky" nature of USB messages: I/O requests are in terms
|
|
||||||
of one or more "packets", and packet boundaries are visible to drivers.
|
|
||||||
Compared to RS-232 serial protocols, USB resembles
|
|
||||||
synchronous protocols like HDLC
|
|
||||||
(N bytes per frame, multipoint addressing, host as the primary
|
|
||||||
station and devices as secondary stations)
|
|
||||||
more than asynchronous ones
|
|
||||||
(tty style: 8 data bits per frame, no parity, one stop bit).
|
|
||||||
So for example the controller drivers won't buffer
|
|
||||||
two single byte writes into a single two-byte USB IN packet,
|
|
||||||
although gadget drivers may do so when they implement
|
|
||||||
protocols where packet boundaries (and "short packets")
|
|
||||||
are not significant.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<sect1 id="lifecycle"><title>Driver Life Cycle</title>
|
|
||||||
|
|
||||||
<para>Gadget drivers make endpoint I/O requests to hardware without
|
|
||||||
needing to know many details of the hardware, but driver
|
|
||||||
setup/configuration code needs to handle some differences.
|
|
||||||
Use the API like this:
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<orderedlist numeration='arabic'>
|
|
||||||
|
|
||||||
<listitem><para>Register a driver for the particular device side
|
|
||||||
usb controller hardware,
|
|
||||||
such as the net2280 on PCI (USB 2.0),
|
|
||||||
sa11x0 or pxa25x as found in Linux PDAs,
|
|
||||||
and so on.
|
|
||||||
At this point the device is logically in the USB ch9 initial state
|
|
||||||
("attached"), drawing no power and not usable
|
|
||||||
(since it does not yet support enumeration).
|
|
||||||
No host should see the device yet, since it has not
|
|
||||||
activated the data line pullup used by the host to
|
|
||||||
detect a device, even if VBUS power is available.
|
|
||||||
</para></listitem>
|
|
||||||
|
|
||||||
<listitem><para>Register a gadget driver that implements some higher level
|
|
||||||
device function. That will then bind() to a usb_gadget, which
|
|
||||||
activates the data line pullup sometime after detecting VBUS.
|
|
||||||
</para></listitem>
|
|
||||||
|
|
||||||
<listitem><para>The hardware driver can now start enumerating.
|
|
||||||
The steps it handles are to accept USB power and set_address requests.
|
|
||||||
Other steps are handled by the gadget driver.
|
|
||||||
If the gadget driver module is unloaded before the host starts to
|
|
||||||
enumerate, steps before step 7 are skipped.
|
|
||||||
</para></listitem>
|
|
||||||
|
|
||||||
<listitem><para>The gadget driver's setup() call returns usb descriptors,
|
|
||||||
based both on what the bus interface hardware provides and on the
|
|
||||||
functionality being implemented.
|
|
||||||
That can involve alternate settings or configurations,
|
|
||||||
unless the hardware prevents such operation.
|
|
||||||
For OTG devices, each configuration descriptor includes
|
|
||||||
an OTG descriptor.
|
|
||||||
</para></listitem>
|
|
||||||
|
|
||||||
<listitem><para>The gadget driver handles the last step of enumeration,
|
|
||||||
when the USB host issues a set_configuration call.
|
|
||||||
It enables all endpoints used in that configuration,
|
|
||||||
with all interfaces in their default settings.
|
|
||||||
That involves using a list of the hardware's endpoints, enabling each
|
|
||||||
endpoint according to its descriptor.
|
|
||||||
It may also involve using <function>usb_gadget_vbus_draw</function>
|
|
||||||
to let more power be drawn from VBUS, as allowed by that configuration.
|
|
||||||
For OTG devices, setting a configuration may also involve reporting
|
|
||||||
HNP capabilities through a user interface.
|
|
||||||
</para></listitem>
|
|
||||||
|
|
||||||
<listitem><para>Do real work and perform data transfers, possibly involving
|
|
||||||
changes to interface settings or switching to new configurations, until the
|
|
||||||
device is disconnect()ed from the host.
|
|
||||||
Queue any number of transfer requests to each endpoint.
|
|
||||||
It may be suspended and resumed several times before being disconnected.
|
|
||||||
On disconnect, the drivers go back to step 3 (above).
|
|
||||||
</para></listitem>
|
|
||||||
|
|
||||||
<listitem><para>When the gadget driver module is being unloaded,
|
|
||||||
the driver unbind() callback is issued. That lets the controller
|
|
||||||
driver be unloaded.
|
|
||||||
</para></listitem>
|
|
||||||
|
|
||||||
</orderedlist>
|
|
||||||
|
|
||||||
<para>Drivers will normally be arranged so that just loading the
|
|
||||||
gadget driver module (or statically linking it into a Linux kernel)
|
|
||||||
allows the peripheral device to be enumerated, but some drivers
|
|
||||||
will defer enumeration until some higher level component (like
|
|
||||||
a user mode daemon) enables it.
|
|
||||||
Note that at this lowest level there are no policies about how
|
|
||||||
ep0 configuration logic is implemented,
|
|
||||||
except that it should obey USB specifications.
|
|
||||||
Such issues are in the domain of gadget drivers,
|
|
||||||
including knowing about implementation constraints
|
|
||||||
imposed by some USB controllers
|
|
||||||
or understanding that composite devices might happen to
|
|
||||||
be built by integrating reusable components.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>Note that the lifecycle above can be slightly different
|
|
||||||
for OTG devices.
|
|
||||||
Other than providing an additional OTG descriptor in each
|
|
||||||
configuration, only the HNP-related differences are particularly
|
|
||||||
visible to driver code.
|
|
||||||
They involve reporting requirements during the SET_CONFIGURATION
|
|
||||||
request, and the option to invoke HNP during some suspend callbacks.
|
|
||||||
Also, SRP changes the semantics of
|
|
||||||
<function>usb_gadget_wakeup</function>
|
|
||||||
slightly.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
</sect1>
|
|
||||||
|
|
||||||
<sect1 id="ch9"><title>USB 2.0 Chapter 9 Types and Constants</title>
|
|
||||||
|
|
||||||
<para>Gadget drivers
|
|
||||||
rely on common USB structures and constants
|
|
||||||
defined in the
|
|
||||||
<filename>&lt;linux/usb/ch9.h&gt;</filename>
|
|
||||||
header file, which is standard in Linux 2.6 kernels.
|
|
||||||
These are the same types and constants used by host
|
|
||||||
side drivers (and usbcore).
|
|
||||||
</para>
|
|
||||||
|
|
||||||
!Iinclude/linux/usb/ch9.h
|
|
||||||
</sect1>
|
|
||||||
|
|
||||||
<sect1 id="core"><title>Core Objects and Methods</title>
|
|
||||||
|
|
||||||
<para>These are declared in
|
|
||||||
<filename>&lt;linux/usb/gadget.h&gt;</filename>,
|
|
||||||
and are used by gadget drivers to interact with
|
|
||||||
USB peripheral controller drivers.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<!-- yeech, this is ugly in nsgmls PDF output.
|
|
||||||
|
|
||||||
the PDF bookmark and refentry output nesting is wrong,
|
|
||||||
and the member/argument documentation indents ugly.
|
|
||||||
|
|
||||||
plus something (docproc?) adds whitespace before the
|
|
||||||
descriptive paragraph text, so it can't line up right
|
|
||||||
unless the explanations are trivial.
|
|
||||||
-->
|
|
||||||
|
|
||||||
!Iinclude/linux/usb/gadget.h
|
|
||||||
</sect1>
|
|
||||||
|
|
||||||
<sect1 id="utils"><title>Optional Utilities</title>
|
|
||||||
|
|
||||||
<para>The core API is sufficient for writing a USB Gadget Driver,
|
|
||||||
but some optional utilities are provided to simplify common tasks.
|
|
||||||
These utilities include endpoint autoconfiguration.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
!Edrivers/usb/gadget/usbstring.c
|
|
||||||
!Edrivers/usb/gadget/config.c
|
|
||||||
<!-- !Edrivers/usb/gadget/epautoconf.c -->
|
|
||||||
</sect1>
|
|
||||||
|
|
||||||
<sect1 id="composite"><title>Composite Device Framework</title>
|
|
||||||
|
|
||||||
<para>The core API is sufficient for writing drivers for composite
|
|
||||||
USB devices (with more than one function in a given configuration),
|
|
||||||
and also multi-configuration devices (also more than one function,
|
|
||||||
but not necessarily sharing a given configuration).
|
|
||||||
There is however an optional framework which makes it easier to
|
|
||||||
reuse and combine functions.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>Devices using this framework provide a <emphasis>struct
|
|
||||||
usb_composite_driver</emphasis>, which in turn provides one or
|
|
||||||
more <emphasis>struct usb_configuration</emphasis> instances.
|
|
||||||
Each such configuration includes at least one
|
|
||||||
<emphasis>struct usb_function</emphasis>, which packages a user
|
|
||||||
visible role such as "network link" or "mass storage device".
|
|
||||||
Management functions may also exist, such as "Device Firmware
|
|
||||||
Upgrade".
|
|
||||||
</para>
|
|
||||||
|
|
||||||
!Iinclude/linux/usb/composite.h
|
|
||||||
!Edrivers/usb/gadget/composite.c
|
|
||||||
|
|
||||||
</sect1>
|
|
||||||
|
|
||||||
<sect1 id="functions"><title>Composite Device Functions</title>
|
|
||||||
|
|
||||||
<para>At this writing, a few of the current gadget drivers have
|
|
||||||
been converted to this framework.
|
|
||||||
Near-term plans include converting all of them, except for "gadgetfs".
|
|
||||||
</para>
|
|
||||||
|
|
||||||
!Edrivers/usb/gadget/function/f_acm.c
|
|
||||||
!Edrivers/usb/gadget/function/f_ecm.c
|
|
||||||
!Edrivers/usb/gadget/function/f_subset.c
|
|
||||||
!Edrivers/usb/gadget/function/f_obex.c
|
|
||||||
!Edrivers/usb/gadget/function/f_serial.c
|
|
||||||
|
|
||||||
</sect1>
|
|
||||||
|
|
||||||
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="controllers"><title>Peripheral Controller Drivers</title>
|
|
||||||
|
|
||||||
<para>The first hardware supporting this API was the NetChip 2280
|
|
||||||
controller, which supports USB 2.0 high speed and is based on PCI.
|
|
||||||
This is the <filename>net2280</filename> driver module.
|
|
||||||
The driver supports Linux kernel versions 2.4 and 2.6;
|
|
||||||
contact NetChip Technologies for development boards and product
|
|
||||||
information.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>Other hardware working in the "gadget" framework includes:
|
|
||||||
Intel's PXA 25x and IXP42x series processors
|
|
||||||
(<filename>pxa2xx_udc</filename>),
|
|
||||||
Toshiba TC86c001 "Goku-S" (<filename>goku_udc</filename>),
|
|
||||||
Renesas SH7705/7727 (<filename>sh_udc</filename>),
|
|
||||||
MediaQ 11xx (<filename>mq11xx_udc</filename>),
|
|
||||||
Hynix HMS30C7202 (<filename>h7202_udc</filename>),
|
|
||||||
National 9603/4 (<filename>n9604_udc</filename>),
|
|
||||||
Texas Instruments OMAP (<filename>omap_udc</filename>),
|
|
||||||
Sharp LH7A40x (<filename>lh7a40x_udc</filename>),
|
|
||||||
and more.
|
|
||||||
Most of those are full speed controllers.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>At this writing, there are people at work on drivers in
|
|
||||||
this framework for several other USB device controllers,
|
|
||||||
with plans to make many of them be widely available.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<!-- !Edrivers/usb/gadget/net2280.c -->
|
|
||||||
|
|
||||||
<para>A partial USB simulator,
|
|
||||||
the <filename>dummy_hcd</filename> driver, is available.
|
|
||||||
It can act like a net2280, a pxa25x, or an sa11x0 in terms
|
|
||||||
of available endpoints and device speeds; and it simulates
|
|
||||||
control, bulk, and to some extent interrupt transfers.
|
|
||||||
That lets you develop some parts of a gadget driver on a normal PC,
|
|
||||||
without any special hardware, and perhaps with the assistance
|
|
||||||
of tools such as GDB running with User Mode Linux.
|
|
||||||
At least one person has expressed interest in adapting that
|
|
||||||
approach, hooking it up to a simulator for a microcontroller.
|
|
||||||
Such simulators can help debug subsystems where the runtime hardware
|
|
||||||
is unfriendly to software development, or is not yet available.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>Support for other controllers is expected to be developed
|
|
||||||
and contributed
|
|
||||||
over time, as this driver framework evolves.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="gadget"><title>Gadget Drivers</title>
|
|
||||||
|
|
||||||
<para>In addition to <emphasis>Gadget Zero</emphasis>
|
|
||||||
(used primarily for testing and development with drivers
|
|
||||||
for usb controller hardware), other gadget drivers exist.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>There's an <emphasis>ethernet</emphasis> gadget
|
|
||||||
driver, which implements one of the most useful
|
|
||||||
<emphasis>Communications Device Class</emphasis> (CDC) models.
|
|
||||||
One of the standards for cable modem interoperability even
|
|
||||||
specifies the use of this ethernet model as one of two
|
|
||||||
mandatory options.
|
|
||||||
Gadgets using this code look to a USB host as if they're
|
|
||||||
an Ethernet adapter.
|
|
||||||
It provides access to a network where the gadget's CPU is one host,
|
|
||||||
which could easily be bridging, routing, or firewalling
|
|
||||||
access to other networks.
|
|
||||||
Since some hardware can't fully implement the CDC Ethernet
|
|
||||||
requirements, this driver also implements a "good parts only"
|
|
||||||
subset of CDC Ethernet.
|
|
||||||
(That subset doesn't advertise itself as CDC Ethernet,
|
|
||||||
to avoid creating problems.)
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>Support for Microsoft's <emphasis>RNDIS</emphasis>
|
|
||||||
protocol has been contributed by Pengutronix and Auerswald GmbH.
|
|
||||||
This is like CDC Ethernet, but it runs on slightly more USB hardware
|
|
||||||
(but less than the CDC subset).
|
|
||||||
However, its main claim to fame is being able to connect directly to
|
|
||||||
recent versions of Windows, using drivers that Microsoft bundles
|
|
||||||
and supports, making it much simpler to network with Windows.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>There is also support for user mode gadget drivers,
|
|
||||||
using <emphasis>gadgetfs</emphasis>.
|
|
||||||
This provides a <emphasis>User Mode API</emphasis> that presents
|
|
||||||
each endpoint as a single file descriptor. I/O is done using
|
|
||||||
normal <emphasis>read()</emphasis> and <emphasis>write()</emphasis> calls.
|
|
||||||
Familiar tools like GDB and pthreads can be used to
|
|
||||||
develop and debug user mode drivers, so that once a robust
|
|
||||||
controller driver is available many applications for it
|
|
||||||
won't require new kernel mode software.
|
|
||||||
Linux 2.6 <emphasis>Async I/O (AIO)</emphasis>
|
|
||||||
support is available, so that user mode software
|
|
||||||
can stream data with only slightly more overhead
|
|
||||||
than a kernel driver.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>There's a USB Mass Storage class driver, which provides
|
|
||||||
a different solution for interoperability with systems such
|
|
||||||
as MS-Windows and MacOS.
|
|
||||||
That <emphasis>Mass Storage</emphasis> driver uses a
|
|
||||||
file or block device as backing store for a drive,
|
|
||||||
like the <filename>loop</filename> driver.
|
|
||||||
The USB host uses the BBB, CB, or CBI versions of the mass
|
|
||||||
storage class specification, using transparent SCSI commands
|
|
||||||
to access the data from the backing store.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>There's a "serial line" driver, useful for TTY style
|
|
||||||
operation over USB.
|
|
||||||
The latest version of that driver supports CDC ACM style
|
|
||||||
operation, like a USB modem, and so on most hardware it can
|
|
||||||
interoperate easily with MS-Windows.
|
|
||||||
One interesting use of that driver is in boot firmware (like a BIOS),
|
|
||||||
which can sometimes use that model with very small systems without
|
|
||||||
real serial lines.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>Support for other kinds of gadget is expected to
|
|
||||||
be developed and contributed
|
|
||||||
over time, as this driver framework evolves.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="otg"><title>USB On-The-Go (OTG)</title>
|
|
||||||
|
|
||||||
<para>USB OTG support on Linux 2.6 was initially developed
|
|
||||||
by Texas Instruments for
|
|
||||||
<ulink url="http://www.omap.com">OMAP</ulink> 16xx and 17xx
|
|
||||||
series processors.
|
|
||||||
Other OTG systems should work in similar ways, but the
|
|
||||||
hardware level details could be very different.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>Systems need specialized hardware support to implement OTG,
|
|
||||||
notably including a special <emphasis>Mini-AB</emphasis> jack
|
|
||||||
and associated transceiver to support <emphasis>Dual-Role</emphasis>
|
|
||||||
operation:
|
|
||||||
they can act either as a host, using the standard
|
|
||||||
Linux-USB host side driver stack,
|
|
||||||
or as a peripheral, using this "gadget" framework.
|
|
||||||
To do that, the system software relies on small additions
|
|
||||||
to those programming interfaces,
|
|
||||||
and on a new internal component (here called an "OTG Controller")
|
|
||||||
affecting which driver stack connects to the OTG port.
|
|
||||||
In each role, the system can re-use the existing pool of
|
|
||||||
hardware-neutral drivers, layered on top of the controller
|
|
||||||
driver interfaces (<emphasis>usb_bus</emphasis> or
|
|
||||||
<emphasis>usb_gadget</emphasis>).
|
|
||||||
Such drivers need at most minor changes, and most of the calls
|
|
||||||
added to support OTG can also benefit non-OTG products.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<itemizedlist>
|
|
||||||
<listitem><para>Gadget drivers test the <emphasis>is_otg</emphasis>
|
|
||||||
flag, and use it to determine whether or not to include
|
|
||||||
an OTG descriptor in each of their configurations.
|
|
||||||
</para></listitem>
|
|
||||||
<listitem><para>Gadget drivers may need changes to support the
|
|
||||||
two new OTG protocols, exposed in new gadget attributes
|
|
||||||
such as the <emphasis>b_hnp_enable</emphasis> flag.
|
|
||||||
HNP support should be reported through a user interface
|
|
||||||
(two LEDs could suffice), and is triggered in some cases
|
|
||||||
when the host suspends the peripheral.
|
|
||||||
SRP support can be user-initiated just like remote wakeup,
|
|
||||||
probably by pressing the same button.
|
|
||||||
</para></listitem>
|
|
||||||
<listitem><para>On the host side, USB device drivers need
|
|
||||||
to be taught to trigger HNP at appropriate moments, using
|
|
||||||
<function>usb_suspend_device()</function>.
|
|
||||||
That also conserves battery power, which is useful even
|
|
||||||
for non-OTG configurations.
|
|
||||||
</para></listitem>
|
|
||||||
<listitem><para>Also on the host side, a driver must support the
|
|
||||||
OTG "Targeted Peripheral List". That's just a whitelist,
|
|
||||||
used to reject peripherals not supported with a given
|
|
||||||
Linux OTG host.
|
|
||||||
<emphasis>This whitelist is product-specific;
|
|
||||||
each product must modify <filename>otg_whitelist.h</filename>
|
|
||||||
to match its interoperability specification.
|
|
||||||
</emphasis>
|
|
||||||
</para>
|
|
||||||
<para>Non-OTG Linux hosts, like PCs and workstations,
|
|
||||||
normally have some solution for adding drivers, so that
|
|
||||||
peripherals that aren't recognized can eventually be supported.
|
|
||||||
That approach is unreasonable for consumer products that may
|
|
||||||
never have their firmware upgraded, and where it's usually
|
|
||||||
unrealistic to expect traditional PC/workstation/server kinds
|
|
||||||
of support model to work.
|
|
||||||
For example, it's often impractical to change device firmware
|
|
||||||
once the product has been distributed, so driver bugs can't
|
|
||||||
normally be fixed if they're found after shipment.
|
|
||||||
</para></listitem>
|
|
||||||
</itemizedlist>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
Additional changes are needed below those hardware-neutral
|
|
||||||
<emphasis>usb_bus</emphasis> and <emphasis>usb_gadget</emphasis>
|
|
||||||
driver interfaces; those aren't discussed here in any detail.
|
|
||||||
Those affect the hardware-specific code for each USB Host or Peripheral
|
|
||||||
controller, and how the HCD initializes (since OTG can be active only
|
|
||||||
on a single port).
|
|
||||||
They also involve what may be called an <emphasis>OTG Controller
|
|
||||||
Driver</emphasis>, managing the OTG transceiver and the OTG state
|
|
||||||
machine logic as well as much of the root hub behavior for the
|
|
||||||
OTG port.
|
|
||||||
The OTG controller driver needs to activate and deactivate USB
|
|
||||||
controllers depending on the relevant device role.
|
|
||||||
Some related changes were needed inside usbcore, so that it
|
|
||||||
can identify OTG-capable devices and respond appropriately
|
|
||||||
to HNP or SRP protocols.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
</book>
|
|
||||||
<!--
|
|
||||||
vim:syntax=sgml:sw=4
|
|
||||||
-->
|
|
@@ -1,520 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
|
|
||||||
"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
|
|
||||||
|
|
||||||
<book id="Generic-IRQ-Guide">
|
|
||||||
<bookinfo>
|
|
||||||
<title>Linux generic IRQ handling</title>
|
|
||||||
|
|
||||||
<authorgroup>
|
|
||||||
<author>
|
|
||||||
<firstname>Thomas</firstname>
|
|
||||||
<surname>Gleixner</surname>
|
|
||||||
<affiliation>
|
|
||||||
<address>
|
|
||||||
<email>tglx@linutronix.de</email>
|
|
||||||
</address>
|
|
||||||
</affiliation>
|
|
||||||
</author>
|
|
||||||
<author>
|
|
||||||
<firstname>Ingo</firstname>
|
|
||||||
<surname>Molnar</surname>
|
|
||||||
<affiliation>
|
|
||||||
<address>
|
|
||||||
<email>mingo@elte.hu</email>
|
|
||||||
</address>
|
|
||||||
</affiliation>
|
|
||||||
</author>
|
|
||||||
</authorgroup>
|
|
||||||
|
|
||||||
<copyright>
|
|
||||||
<year>2005-2010</year>
|
|
||||||
<holder>Thomas Gleixner</holder>
|
|
||||||
</copyright>
|
|
||||||
<copyright>
|
|
||||||
<year>2005-2006</year>
|
|
||||||
<holder>Ingo Molnar</holder>
|
|
||||||
</copyright>
|
|
||||||
|
|
||||||
<legalnotice>
|
|
||||||
<para>
|
|
||||||
This documentation is free software; you can redistribute
|
|
||||||
it and/or modify it under the terms of the GNU General Public
|
|
||||||
License version 2 as published by the Free Software Foundation.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
This program is distributed in the hope that it will be
|
|
||||||
useful, but WITHOUT ANY WARRANTY; without even the implied
|
|
||||||
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
||||||
See the GNU General Public License for more details.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
You should have received a copy of the GNU General Public
|
|
||||||
License along with this program; if not, write to the Free
|
|
||||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
|
|
||||||
MA 02111-1307 USA
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
For more details see the file COPYING in the source
|
|
||||||
distribution of Linux.
|
|
||||||
</para>
|
|
||||||
</legalnotice>
|
|
||||||
</bookinfo>
|
|
||||||
|
|
||||||
<toc></toc>
|
|
||||||
|
|
||||||
<chapter id="intro">
|
|
||||||
<title>Introduction</title>
|
|
||||||
<para>
|
|
||||||
The generic interrupt handling layer is designed to provide a
|
|
||||||
complete abstraction of interrupt handling for device drivers.
|
|
||||||
It is able to handle all the different types of interrupt controller
|
|
||||||
hardware. Device drivers use generic API functions to request, enable,
|
|
||||||
disable and free interrupts. The drivers do not have to know anything
|
|
||||||
about interrupt hardware details, so they can be used on different
|
|
||||||
platforms without code changes.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
This documentation is provided to developers who want to implement
|
|
||||||
an interrupt subsystem for their architecture, with the help
|
|
||||||
of the generic IRQ handling layer.
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="rationale">
|
|
||||||
<title>Rationale</title>
|
|
||||||
<para>
|
|
||||||
The original implementation of interrupt handling in Linux uses
|
|
||||||
the __do_IRQ() super-handler, which is able to deal with every
|
|
||||||
type of interrupt logic.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Originally, Russell King identified different types of handlers to
|
|
||||||
build a quite universal set for the ARM interrupt handler
|
|
||||||
implementation in Linux 2.5/2.6. He distinguished between:
|
|
||||||
<itemizedlist>
|
|
||||||
<listitem><para>Level type</para></listitem>
|
|
||||||
<listitem><para>Edge type</para></listitem>
|
|
||||||
<listitem><para>Simple type</para></listitem>
|
|
||||||
</itemizedlist>
|
|
||||||
During the implementation we identified another type:
|
|
||||||
<itemizedlist>
|
|
||||||
<listitem><para>Fast EOI type</para></listitem>
|
|
||||||
</itemizedlist>
|
|
||||||
In the SMP world of the __do_IRQ() super-handler another type
|
|
||||||
was identified:
|
|
||||||
<itemizedlist>
|
|
||||||
<listitem><para>Per CPU type</para></listitem>
|
|
||||||
</itemizedlist>
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
This split implementation of high-level IRQ handlers allows us to
|
|
||||||
optimize the flow of the interrupt handling for each specific
|
|
||||||
interrupt type. This reduces complexity in that particular code path
|
|
||||||
and allows the optimized handling of a given type.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The original general IRQ implementation used hw_interrupt_type
|
|
||||||
structures and their ->ack(), ->end() [etc.] callbacks to
|
|
||||||
differentiate the flow control in the super-handler. This leads to
|
|
||||||
a mix of flow logic and low-level hardware logic, and it also leads
|
|
||||||
to unnecessary code duplication: for example in i386, there is an
|
|
||||||
ioapic_level_irq and an ioapic_edge_irq IRQ-type which share many
|
|
||||||
of the low-level details but have different flow handling.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
A more natural abstraction is the clean separation of the
|
|
||||||
'irq flow' and the 'chip details'.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Analysing a couple of architectures' IRQ subsystem implementations
|
|
||||||
reveals that most of them can use a generic set of 'irq flow'
|
|
||||||
methods and only need to add the chip-level specific code.
|
|
||||||
The separation is also valuable for (sub)architectures
|
|
||||||
which need specific quirks in the IRQ flow itself but not in the
|
|
||||||
chip details - and thus provides a more transparent IRQ subsystem
|
|
||||||
design.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Each interrupt descriptor is assigned its own high-level flow
|
|
||||||
handler, which is normally one of the generic
|
|
||||||
implementations. (This high-level flow handler implementation also
|
|
||||||
makes it simple to provide demultiplexing handlers which can be
|
|
||||||
found in embedded platforms on various architectures.)
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The separation makes the generic interrupt handling layer more
|
|
||||||
flexible and extensible. For example, an (sub)architecture can
|
|
||||||
use a generic IRQ-flow implementation for 'level type' interrupts
|
|
||||||
and add a (sub)architecture specific 'edge type' implementation.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
To make the transition to the new model easier and prevent the
|
|
||||||
breakage of existing implementations, the __do_IRQ() super-handler
|
|
||||||
is still available. This leads to a kind of duality for the time
|
|
||||||
being. Over time the new model should be used in more and more
|
|
||||||
architectures, as it enables smaller and cleaner IRQ subsystems.
|
|
||||||
The __do_IRQ() super-handler has been deprecated for three years now and is about to be removed.
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
<chapter id="bugs">
|
|
||||||
<title>Known Bugs And Assumptions</title>
|
|
||||||
<para>
|
|
||||||
None (knock on wood).
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="Abstraction">
|
|
||||||
<title>Abstraction layers</title>
|
|
||||||
<para>
|
|
||||||
There are three main levels of abstraction in the interrupt code:
|
|
||||||
<orderedlist>
|
|
||||||
<listitem><para>High-level driver API</para></listitem>
|
|
||||||
<listitem><para>High-level IRQ flow handlers</para></listitem>
|
|
||||||
<listitem><para>Chip-level hardware encapsulation</para></listitem>
|
|
||||||
</orderedlist>
|
|
||||||
</para>
|
|
||||||
<sect1 id="Interrupt_control_flow">
|
|
||||||
<title>Interrupt control flow</title>
|
|
||||||
<para>
|
|
||||||
Each interrupt is described by an interrupt descriptor structure
|
|
||||||
irq_desc. The interrupt is referenced by an 'unsigned int' numeric
|
|
||||||
value which selects the corresponding interrupt description structure
|
|
||||||
in the descriptor structures array.
|
|
||||||
The descriptor structure contains status information and pointers
|
|
||||||
to the interrupt flow method and the interrupt chip structure
|
|
||||||
which are assigned to this interrupt.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Whenever an interrupt triggers, the low-level architecture code calls
|
|
||||||
into the generic interrupt code by calling desc->handle_irq().
|
|
||||||
This high-level IRQ handling function only uses desc->irq_data.chip
|
|
||||||
primitives referenced by the assigned chip descriptor structure.
|
|
||||||
</para>
|
|
||||||
</sect1>
|
|
||||||
<sect1 id="Highlevel_Driver_API">
|
|
||||||
<title>High-level Driver API</title>
|
|
||||||
<para>
|
|
||||||
The high-level Driver API consists of the following functions:
|
|
||||||
<itemizedlist>
|
|
||||||
<listitem><para>request_irq()</para></listitem>
|
|
||||||
<listitem><para>free_irq()</para></listitem>
|
|
||||||
<listitem><para>disable_irq()</para></listitem>
|
|
||||||
<listitem><para>enable_irq()</para></listitem>
|
|
||||||
<listitem><para>disable_irq_nosync() (SMP only)</para></listitem>
|
|
||||||
<listitem><para>synchronize_irq() (SMP only)</para></listitem>
|
|
||||||
<listitem><para>irq_set_irq_type()</para></listitem>
|
|
||||||
<listitem><para>irq_set_irq_wake()</para></listitem>
|
|
||||||
<listitem><para>irq_set_handler_data()</para></listitem>
|
|
||||||
<listitem><para>irq_set_chip()</para></listitem>
|
|
||||||
<listitem><para>irq_set_chip_data()</para></listitem>
|
|
||||||
</itemizedlist>
|
|
||||||
See the autogenerated function documentation for details.
|
|
||||||
</para>
|
|
||||||
</sect1>
|
|
||||||
<sect1 id="Highlevel_IRQ_flow_handlers">
|
|
||||||
<title>High-level IRQ flow handlers</title>
|
|
||||||
<para>
|
|
||||||
The generic layer provides a set of pre-defined irq-flow methods:
|
|
||||||
<itemizedlist>
|
|
||||||
<listitem><para>handle_level_irq</para></listitem>
|
|
||||||
<listitem><para>handle_edge_irq</para></listitem>
|
|
||||||
<listitem><para>handle_fasteoi_irq</para></listitem>
|
|
||||||
<listitem><para>handle_simple_irq</para></listitem>
|
|
||||||
<listitem><para>handle_percpu_irq</para></listitem>
|
|
||||||
<listitem><para>handle_edge_eoi_irq</para></listitem>
|
|
||||||
<listitem><para>handle_bad_irq</para></listitem>
|
|
||||||
</itemizedlist>
|
|
||||||
The interrupt flow handlers (either pre-defined or architecture
|
|
||||||
specific) are assigned to specific interrupts by the architecture
|
|
||||||
either during bootup or during device initialization.
|
|
||||||
</para>
|
|
||||||
<sect2 id="Default_flow_implementations">
|
|
||||||
<title>Default flow implementations</title>
|
|
||||||
<sect3 id="Helper_functions">
|
|
||||||
<title>Helper functions</title>
|
|
||||||
<para>
|
|
||||||
The helper functions call the chip primitives and
|
|
||||||
are used by the default flow implementations.
|
|
||||||
The following helper functions are implemented (simplified excerpt):
|
|
||||||
<programlisting>
|
|
||||||
default_enable(struct irq_data *data)
|
|
||||||
{
|
|
||||||
desc->irq_data.chip->irq_unmask(data);
|
|
||||||
}
|
|
||||||
|
|
||||||
default_disable(struct irq_data *data)
|
|
||||||
{
|
|
||||||
if (!delay_disable(data))
|
|
||||||
desc->irq_data.chip->irq_mask(data);
|
|
||||||
}
|
|
||||||
|
|
||||||
default_ack(struct irq_data *data)
|
|
||||||
{
|
|
||||||
chip->irq_ack(data);
|
|
||||||
}
|
|
||||||
|
|
||||||
default_mask_ack(struct irq_data *data)
|
|
||||||
{
|
|
||||||
if (chip->irq_mask_ack) {
|
|
||||||
chip->irq_mask_ack(data);
|
|
||||||
} else {
|
|
||||||
chip->irq_mask(data);
|
|
||||||
chip->irq_ack(data);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
noop(struct irq_data *data)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
</programlisting>
|
|
||||||
</para>
|
|
||||||
</sect3>
|
|
||||||
</sect2>
|
|
||||||
<sect2 id="Default_flow_handler_implementations">
|
|
||||||
<title>Default flow handler implementations</title>
|
|
||||||
<sect3 id="Default_Level_IRQ_flow_handler">
|
|
||||||
<title>Default Level IRQ flow handler</title>
|
|
||||||
<para>
|
|
||||||
handle_level_irq provides a generic implementation
|
|
||||||
for level-triggered interrupts.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The following control flow is implemented (simplified excerpt):
|
|
||||||
<programlisting>
|
|
||||||
desc->irq_data.chip->irq_mask_ack();
|
|
||||||
handle_irq_event(desc->action);
|
|
||||||
desc->irq_data.chip->irq_unmask();
|
|
||||||
</programlisting>
|
|
||||||
</para>
|
|
||||||
</sect3>
|
|
||||||
<sect3 id="Default_FASTEOI_IRQ_flow_handler">
|
|
||||||
<title>Default Fast EOI IRQ flow handler</title>
|
|
||||||
<para>
|
|
||||||
handle_fasteoi_irq provides a generic implementation
|
|
||||||
for interrupts, which only need an EOI at the end of
|
|
||||||
the handler.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The following control flow is implemented (simplified excerpt):
|
|
||||||
<programlisting>
|
|
||||||
handle_irq_event(desc->action);
|
|
||||||
desc->irq_data.chip->irq_eoi();
|
|
||||||
</programlisting>
|
|
||||||
</para>
|
|
||||||
</sect3>
|
|
||||||
<sect3 id="Default_Edge_IRQ_flow_handler">
|
|
||||||
<title>Default Edge IRQ flow handler</title>
|
|
||||||
<para>
|
|
||||||
handle_edge_irq provides a generic implementation
|
|
||||||
for edge-triggered interrupts.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The following control flow is implemented (simplified excerpt):
|
|
||||||
<programlisting>
|
|
||||||
if (desc->status & running) {
|
|
||||||
desc->irq_data.chip->irq_mask_ack();
|
|
||||||
desc->status |= pending | masked;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
desc->irq_data.chip->irq_ack();
|
|
||||||
desc->status |= running;
|
|
||||||
do {
|
|
||||||
if (desc->status & masked)
|
|
||||||
desc->irq_data.chip->irq_unmask();
|
|
||||||
desc->status &= ~pending;
|
|
||||||
handle_irq_event(desc->action);
|
|
||||||
} while (desc->status & pending);
|
|
||||||
desc->status &= ~running;
|
|
||||||
</programlisting>
|
|
||||||
</para>
|
|
||||||
</sect3>
|
|
||||||
<sect3 id="Default_simple_IRQ_flow_handler">
|
|
||||||
<title>Default simple IRQ flow handler</title>
|
|
||||||
<para>
|
|
||||||
handle_simple_irq provides a generic implementation
|
|
||||||
for simple interrupts.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Note: The simple flow handler does not call any
|
|
||||||
handler/chip primitives.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The following control flow is implemented (simplified excerpt):
|
|
||||||
<programlisting>
|
|
||||||
handle_irq_event(desc->action);
|
|
||||||
</programlisting>
|
|
||||||
</para>
|
|
||||||
</sect3>
|
|
||||||
<sect3 id="Default_per_CPU_flow_handler">
|
|
||||||
<title>Default per CPU flow handler</title>
|
|
||||||
<para>
|
|
||||||
handle_percpu_irq provides a generic implementation
|
|
||||||
for per CPU interrupts.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Per CPU interrupts are only available on SMP and
|
|
||||||
the handler provides a simplified version without
|
|
||||||
locking.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The following control flow is implemented (simplified excerpt):
|
|
||||||
<programlisting>
|
|
||||||
if (desc->irq_data.chip->irq_ack)
|
|
||||||
desc->irq_data.chip->irq_ack();
|
|
||||||
handle_irq_event(desc->action);
|
|
||||||
if (desc->irq_data.chip->irq_eoi)
|
|
||||||
desc->irq_data.chip->irq_eoi();
|
|
||||||
</programlisting>
|
|
||||||
</para>
|
|
||||||
</sect3>
|
|
||||||
<sect3 id="EOI_Edge_IRQ_flow_handler">
|
|
||||||
<title>EOI Edge IRQ flow handler</title>
|
|
||||||
<para>
|
|
||||||
handle_edge_eoi_irq provides an abomination of the edge
|
|
||||||
handler which is solely used to tame a badly wreckaged
|
|
||||||
irq controller on powerpc/cell.
|
|
||||||
</para>
|
|
||||||
</sect3>
|
|
||||||
<sect3 id="BAD_IRQ_flow_handler">
|
|
||||||
<title>Bad IRQ flow handler</title>
|
|
||||||
<para>
|
|
||||||
handle_bad_irq is used for spurious interrupts which
|
|
||||||
have no real handler assigned.
|
|
||||||
</para>
|
|
||||||
</sect3>
|
|
||||||
</sect2>
|
|
||||||
<sect2 id="Quirks_and_optimizations">
|
|
||||||
<title>Quirks and optimizations</title>
|
|
||||||
<para>
|
|
||||||
The generic functions are intended for 'clean' architectures and chips,
|
|
||||||
which have no platform-specific IRQ handling quirks. If an architecture
|
|
||||||
needs to implement quirks on the 'flow' level then it can do so by
|
|
||||||
overriding the high-level irq-flow handler.
|
|
||||||
</para>
|
|
||||||
</sect2>
|
|
||||||
<sect2 id="Delayed_interrupt_disable">
|
|
||||||
<title>Delayed interrupt disable</title>
|
|
||||||
<para>
|
|
||||||
This per interrupt selectable feature, which was introduced by Russell
|
|
||||||
King in the ARM interrupt implementation, does not mask an interrupt
|
|
||||||
at the hardware level when disable_irq() is called. The interrupt is
|
|
||||||
kept enabled and is masked in the flow handler when an interrupt event
|
|
||||||
happens. This prevents losing edge interrupts on hardware which does
|
|
||||||
not store an edge interrupt event while the interrupt is disabled at
|
|
||||||
the hardware level. When an interrupt arrives while the IRQ_DISABLED
|
|
||||||
flag is set, then the interrupt is masked at the hardware level and
|
|
||||||
the IRQ_PENDING bit is set. When the interrupt is re-enabled by
|
|
||||||
enable_irq() the pending bit is checked and if it is set, the
|
|
||||||
interrupt is resent either via hardware or by a software resend
|
|
||||||
mechanism. (It's necessary to enable CONFIG_HARDIRQS_SW_RESEND when
|
|
||||||
you want to use the delayed interrupt disable feature and your
|
|
||||||
hardware is not capable of retriggering an interrupt.)
|
|
||||||
The delayed interrupt disable is not configurable.
|
|
||||||
</para>
|
|
||||||
</sect2>
|
|
||||||
</sect1>
|
|
||||||
<sect1 id="Chiplevel_hardware_encapsulation">
|
|
||||||
<title>Chip-level hardware encapsulation</title>
|
|
||||||
<para>
|
|
||||||
The chip-level hardware descriptor structure irq_chip
|
|
||||||
contains all the direct chip relevant functions, which
|
|
||||||
can be utilized by the irq flow implementations.
|
|
||||||
<itemizedlist>
|
|
||||||
<listitem><para>irq_ack()</para></listitem>
|
|
||||||
<listitem><para>irq_mask_ack() - Optional, recommended for performance</para></listitem>
|
|
||||||
<listitem><para>irq_mask()</para></listitem>
|
|
||||||
<listitem><para>irq_unmask()</para></listitem>
|
|
||||||
<listitem><para>irq_eoi() - Optional, required for EOI flow handlers</para></listitem>
|
|
||||||
<listitem><para>irq_retrigger() - Optional</para></listitem>
|
|
||||||
<listitem><para>irq_set_type() - Optional</para></listitem>
|
|
||||||
<listitem><para>irq_set_wake() - Optional</para></listitem>
|
|
||||||
</itemizedlist>
|
|
||||||
These primitives are strictly intended to mean what they say: ack means
|
|
||||||
ACK, masking means masking of an IRQ line, etc. It is up to the flow
|
|
||||||
handler(s) to use these basic units of low-level functionality.
|
|
||||||
</para>
|
|
||||||
</sect1>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="doirq">
|
|
||||||
<title>__do_IRQ entry point</title>
|
|
||||||
<para>
|
|
||||||
The original implementation __do_IRQ() was an alternative entry
|
|
||||||
point for all types of interrupts. It no longer exists.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
This handler turned out not to be suitable for all
|
|
||||||
interrupt hardware and was therefore reimplemented with split
|
|
||||||
functionality for edge/level/simple/percpu interrupts. This is not
|
|
||||||
only a functional optimization. It also shortens code paths for
|
|
||||||
interrupts.
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="locking">
|
|
||||||
<title>Locking on SMP</title>
|
|
||||||
<para>
|
|
||||||
The locking of chip registers is up to the architecture that
|
|
||||||
defines the chip primitives. The per-irq structure is
|
|
||||||
protected via desc->lock, by the generic layer.
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="genericchip">
|
|
||||||
<title>Generic interrupt chip</title>
|
|
||||||
<para>
|
|
||||||
To avoid copies of identical implementations of IRQ chips the
|
|
||||||
core provides a configurable generic interrupt chip
|
|
||||||
implementation. Developers should check carefully whether the
|
|
||||||
generic chip fits their needs before implementing the same
|
|
||||||
functionality slightly differently themselves.
|
|
||||||
</para>
|
|
||||||
!Ekernel/irq/generic-chip.c
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="structs">
|
|
||||||
<title>Structures</title>
|
|
||||||
<para>
|
|
||||||
This chapter contains the autogenerated documentation of the structures which are
|
|
||||||
used in the generic IRQ layer.
|
|
||||||
</para>
|
|
||||||
!Iinclude/linux/irq.h
|
|
||||||
!Iinclude/linux/interrupt.h
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="pubfunctions">
|
|
||||||
<title>Public Functions Provided</title>
|
|
||||||
<para>
|
|
||||||
This chapter contains the autogenerated documentation of the kernel API functions
|
|
||||||
which are exported.
|
|
||||||
</para>
|
|
||||||
!Ekernel/irq/manage.c
|
|
||||||
!Ekernel/irq/chip.c
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="intfunctions">
|
|
||||||
<title>Internal Functions Provided</title>
|
|
||||||
<para>
|
|
||||||
This chapter contains the autogenerated documentation of the internal functions.
|
|
||||||
</para>
|
|
||||||
!Ikernel/irq/irqdesc.c
|
|
||||||
!Ikernel/irq/handle.c
|
|
||||||
!Ikernel/irq/chip.c
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="credits">
|
|
||||||
<title>Credits</title>
|
|
||||||
<para>
|
|
||||||
The following people have contributed to this document:
|
|
||||||
<orderedlist>
|
|
||||||
<listitem><para>Thomas Gleixner<email>tglx@linutronix.de</email></para></listitem>
|
|
||||||
<listitem><para>Ingo Molnar<email>mingo@elte.hu</email></para></listitem>
|
|
||||||
</orderedlist>
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
</book>
|
|
@@ -1,331 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
|
|
||||||
"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
|
|
||||||
|
|
||||||
<book id="LinuxKernelAPI">
|
|
||||||
<bookinfo>
|
|
||||||
<title>The Linux Kernel API</title>
|
|
||||||
|
|
||||||
<legalnotice>
|
|
||||||
<para>
|
|
||||||
This documentation is free software; you can redistribute
|
|
||||||
it and/or modify it under the terms of the GNU General Public
|
|
||||||
License as published by the Free Software Foundation; either
|
|
||||||
version 2 of the License, or (at your option) any later
|
|
||||||
version.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
This program is distributed in the hope that it will be
|
|
||||||
useful, but WITHOUT ANY WARRANTY; without even the implied
|
|
||||||
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
||||||
See the GNU General Public License for more details.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
You should have received a copy of the GNU General Public
|
|
||||||
License along with this program; if not, write to the Free
|
|
||||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
|
|
||||||
MA 02111-1307 USA
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
For more details see the file COPYING in the source
|
|
||||||
distribution of Linux.
|
|
||||||
</para>
|
|
||||||
</legalnotice>
|
|
||||||
</bookinfo>
|
|
||||||
|
|
||||||
<toc></toc>
|
|
||||||
|
|
||||||
<chapter id="adt">
|
|
||||||
<title>Data Types</title>
|
|
||||||
<sect1><title>Doubly Linked Lists</title>
|
|
||||||
!Iinclude/linux/list.h
|
|
||||||
</sect1>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="libc">
|
|
||||||
<title>Basic C Library Functions</title>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
When writing drivers, you cannot in general use routines which are
|
|
||||||
from the C Library. Some of the functions have been found generally
|
|
||||||
useful and they are listed below. The behaviour of these functions
|
|
||||||
may vary slightly from those defined by ANSI, and these deviations
|
|
||||||
are noted in the text.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<sect1><title>String Conversions</title>
|
|
||||||
!Elib/vsprintf.c
|
|
||||||
!Finclude/linux/kernel.h kstrtol
|
|
||||||
!Finclude/linux/kernel.h kstrtoul
|
|
||||||
!Elib/kstrtox.c
|
|
||||||
</sect1>
|
|
||||||
<sect1><title>String Manipulation</title>
|
|
||||||
<!-- All functions are exported now
|
|
||||||
X!Ilib/string.c
|
|
||||||
-->
|
|
||||||
!Elib/string.c
|
|
||||||
</sect1>
|
|
||||||
<sect1><title>Bit Operations</title>
|
|
||||||
!Iarch/x86/include/asm/bitops.h
|
|
||||||
</sect1>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="kernel-lib">
|
|
||||||
<title>Basic Kernel Library Functions</title>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
The Linux kernel provides more basic utility functions.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<sect1><title>Bitmap Operations</title>
|
|
||||||
!Elib/bitmap.c
|
|
||||||
!Ilib/bitmap.c
|
|
||||||
</sect1>
|
|
||||||
|
|
||||||
<sect1><title>Command-line Parsing</title>
|
|
||||||
!Elib/cmdline.c
|
|
||||||
</sect1>
|
|
||||||
|
|
||||||
<sect1 id="crc"><title>CRC Functions</title>
|
|
||||||
!Elib/crc7.c
|
|
||||||
!Elib/crc16.c
|
|
||||||
!Elib/crc-itu-t.c
|
|
||||||
!Elib/crc32.c
|
|
||||||
!Elib/crc-ccitt.c
|
|
||||||
</sect1>
|
|
||||||
|
|
||||||
<sect1 id="idr"><title>idr/ida Functions</title>
|
|
||||||
!Pinclude/linux/idr.h idr sync
|
|
||||||
!Plib/idr.c IDA description
|
|
||||||
!Elib/idr.c
|
|
||||||
</sect1>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="mm">
|
|
||||||
<title>Memory Management in Linux</title>
|
|
||||||
<sect1><title>The Slab Cache</title>
|
|
||||||
!Iinclude/linux/slab.h
|
|
||||||
!Emm/slab.c
|
|
||||||
!Emm/util.c
|
|
||||||
</sect1>
|
|
||||||
<sect1><title>User Space Memory Access</title>
|
|
||||||
!Iarch/x86/include/asm/uaccess_32.h
|
|
||||||
!Earch/x86/lib/usercopy_32.c
|
|
||||||
</sect1>
|
|
||||||
<sect1><title>More Memory Management Functions</title>
|
|
||||||
!Emm/readahead.c
|
|
||||||
!Emm/filemap.c
|
|
||||||
!Emm/memory.c
|
|
||||||
!Emm/vmalloc.c
|
|
||||||
!Imm/page_alloc.c
|
|
||||||
!Emm/mempool.c
|
|
||||||
!Emm/dmapool.c
|
|
||||||
!Emm/page-writeback.c
|
|
||||||
!Emm/truncate.c
|
|
||||||
</sect1>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
|
|
||||||
<chapter id="ipc">
|
|
||||||
<title>Kernel IPC facilities</title>
|
|
||||||
|
|
||||||
<sect1><title>IPC utilities</title>
|
|
||||||
!Iipc/util.c
|
|
||||||
</sect1>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="kfifo">
|
|
||||||
<title>FIFO Buffer</title>
|
|
||||||
<sect1><title>kfifo interface</title>
|
|
||||||
!Iinclude/linux/kfifo.h
|
|
||||||
</sect1>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="relayfs">
|
|
||||||
<title>relay interface support</title>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
Relay interface support
|
|
||||||
is designed to provide an efficient mechanism for tools and
|
|
||||||
facilities to relay large amounts of data from kernel space to
|
|
||||||
user space.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<sect1><title>relay interface</title>
|
|
||||||
!Ekernel/relay.c
|
|
||||||
!Ikernel/relay.c
|
|
||||||
</sect1>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="modload">
|
|
||||||
<title>Module Support</title>
|
|
||||||
<sect1><title>Module Loading</title>
|
|
||||||
!Ekernel/kmod.c
|
|
||||||
</sect1>
|
|
||||||
<sect1><title>Inter Module support</title>
|
|
||||||
<para>
|
|
||||||
Refer to the file kernel/module.c for more information.
|
|
||||||
</para>
|
|
||||||
<!-- FIXME: Removed for now since no structured comments in source
|
|
||||||
X!Ekernel/module.c
|
|
||||||
-->
|
|
||||||
</sect1>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="hardware">
|
|
||||||
<title>Hardware Interfaces</title>
|
|
||||||
<sect1><title>Interrupt Handling</title>
|
|
||||||
!Ekernel/irq/manage.c
|
|
||||||
</sect1>
|
|
||||||
|
|
||||||
<sect1><title>DMA Channels</title>
|
|
||||||
!Ekernel/dma.c
|
|
||||||
</sect1>
|
|
||||||
|
|
||||||
<sect1><title>Resources Management</title>
|
|
||||||
!Ikernel/resource.c
|
|
||||||
!Ekernel/resource.c
|
|
||||||
</sect1>
|
|
||||||
|
|
||||||
<sect1><title>MTRR Handling</title>
|
|
||||||
!Earch/x86/kernel/cpu/mtrr/main.c
|
|
||||||
</sect1>
|
|
||||||
|
|
||||||
<sect1><title>PCI Support Library</title>
|
|
||||||
!Edrivers/pci/pci.c
|
|
||||||
!Edrivers/pci/pci-driver.c
|
|
||||||
!Edrivers/pci/remove.c
|
|
||||||
!Edrivers/pci/search.c
|
|
||||||
!Edrivers/pci/msi.c
|
|
||||||
!Edrivers/pci/bus.c
|
|
||||||
!Edrivers/pci/access.c
|
|
||||||
!Edrivers/pci/irq.c
|
|
||||||
!Edrivers/pci/htirq.c
|
|
||||||
<!-- FIXME: Removed for now since no structured comments in source
|
|
||||||
X!Edrivers/pci/hotplug.c
|
|
||||||
-->
|
|
||||||
!Edrivers/pci/probe.c
|
|
||||||
!Edrivers/pci/slot.c
|
|
||||||
!Edrivers/pci/rom.c
|
|
||||||
!Edrivers/pci/iov.c
|
|
||||||
!Idrivers/pci/pci-sysfs.c
|
|
||||||
</sect1>
|
|
||||||
<sect1><title>PCI Hotplug Support Library</title>
|
|
||||||
!Edrivers/pci/hotplug/pci_hotplug_core.c
|
|
||||||
</sect1>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="firmware">
|
|
||||||
<title>Firmware Interfaces</title>
|
|
||||||
<sect1><title>DMI Interfaces</title>
|
|
||||||
!Edrivers/firmware/dmi_scan.c
|
|
||||||
</sect1>
|
|
||||||
<sect1><title>EDD Interfaces</title>
|
|
||||||
!Idrivers/firmware/edd.c
|
|
||||||
</sect1>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="security">
|
|
||||||
<title>Security Framework</title>
|
|
||||||
!Isecurity/security.c
|
|
||||||
!Esecurity/inode.c
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="audit">
|
|
||||||
<title>Audit Interfaces</title>
|
|
||||||
!Ekernel/audit.c
|
|
||||||
!Ikernel/auditsc.c
|
|
||||||
!Ikernel/auditfilter.c
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="accounting">
|
|
||||||
<title>Accounting Framework</title>
|
|
||||||
!Ikernel/acct.c
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="blkdev">
|
|
||||||
<title>Block Devices</title>
|
|
||||||
!Eblock/blk-core.c
|
|
||||||
!Iblock/blk-core.c
|
|
||||||
!Eblock/blk-map.c
|
|
||||||
!Iblock/blk-sysfs.c
|
|
||||||
!Eblock/blk-settings.c
|
|
||||||
!Eblock/blk-exec.c
|
|
||||||
!Eblock/blk-flush.c
|
|
||||||
!Eblock/blk-lib.c
|
|
||||||
!Eblock/blk-tag.c
|
|
||||||
!Iblock/blk-tag.c
|
|
||||||
!Eblock/blk-integrity.c
|
|
||||||
!Ikernel/trace/blktrace.c
|
|
||||||
!Iblock/genhd.c
|
|
||||||
!Eblock/genhd.c
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="chrdev">
|
|
||||||
<title>Char devices</title>
|
|
||||||
!Efs/char_dev.c
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="miscdev">
|
|
||||||
<title>Miscellaneous Devices</title>
|
|
||||||
!Edrivers/char/misc.c
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="clk">
|
|
||||||
<title>Clock Framework</title>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
The clock framework defines programming interfaces to support
|
|
||||||
software management of the system clock tree.
|
|
||||||
This framework is widely used with System-On-Chip (SOC) platforms
|
|
||||||
to support power management and various devices which may need
|
|
||||||
custom clock rates.
|
|
||||||
Note that these "clocks" don't relate to timekeeping or real
|
|
||||||
time clocks (RTCs), each of which has a separate framework.
|
|
||||||
These <structname>struct clk</structname> instances may be used
|
|
||||||
to manage for example a 96 MHz signal that is used to shift bits
|
|
||||||
into and out of peripherals or busses, or otherwise trigger
|
|
||||||
synchronous state machine transitions in system hardware.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
Power management is supported by explicit software clock gating:
|
|
||||||
unused clocks are disabled, so the system doesn't waste power
|
|
||||||
changing the state of transistors that aren't in active use.
|
|
||||||
On some systems this may be backed by hardware clock gating,
|
|
||||||
where clocks are gated without being disabled in software.
|
|
||||||
Sections of chips that are powered but not clocked may be able
|
|
||||||
to retain their last state.
|
|
||||||
This low power state is often called a <emphasis>retention
|
|
||||||
mode</emphasis>.
|
|
||||||
This mode still incurs leakage currents, especially with finer
|
|
||||||
circuit geometries, but for CMOS circuits power is mostly used
|
|
||||||
by clocked state changes.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
Power-aware drivers only enable their clocks when the device
|
|
||||||
they manage is in active use. Also, system sleep states often
|
|
||||||
differ according to which clock domains are active: while a
|
|
||||||
"standby" state may allow wakeup from several active domains, a
|
|
||||||
"mem" (suspend-to-RAM) state may require a more wholesale shutdown
|
|
||||||
of clocks derived from higher speed PLLs and oscillators, limiting
|
|
||||||
the number of possible wakeup event sources. A driver's suspend
|
|
||||||
method may need to be aware of system-specific clock constraints
|
|
||||||
on the target sleep state.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
Some platforms support programmable clock generators. These
|
|
||||||
can be used by external chips of various kinds, such as other
|
|
||||||
CPUs, multimedia codecs, and devices with strict requirements
|
|
||||||
for interface clocking.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
!Iinclude/linux/clk.h
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
</book>
|
|
@@ -128,9 +128,6 @@
|
|||||||
</sect1>
|
</sect1>
|
||||||
<sect1 id="Device_model_support"><title>Device model support</title>
|
<sect1 id="Device_model_support"><title>Device model support</title>
|
||||||
!Idrivers/rapidio/rio-driver.c
|
!Idrivers/rapidio/rio-driver.c
|
||||||
</sect1>
|
|
||||||
<sect1 id="Sysfs_support"><title>Sysfs support</title>
|
|
||||||
!Idrivers/rapidio/rio-sysfs.c
|
|
||||||
</sect1>
|
</sect1>
|
||||||
<sect1 id="PPC32_support"><title>PPC32 support</title>
|
<sect1 id="PPC32_support"><title>PPC32 support</title>
|
||||||
!Iarch/powerpc/sysdev/fsl_rio.c
|
!Iarch/powerpc/sysdev/fsl_rio.c
|
||||||
|
@@ -1,873 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
|
|
||||||
"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
|
|
||||||
|
|
||||||
<book id="Writing-MUSB-Glue-Layer">
|
|
||||||
<bookinfo>
|
|
||||||
<title>Writing an MUSB Glue Layer</title>
|
|
||||||
|
|
||||||
<authorgroup>
|
|
||||||
<author>
|
|
||||||
<firstname>Apelete</firstname>
|
|
||||||
<surname>Seketeli</surname>
|
|
||||||
<affiliation>
|
|
||||||
<address>
|
|
||||||
<email>apelete at seketeli.net</email>
|
|
||||||
</address>
|
|
||||||
</affiliation>
|
|
||||||
</author>
|
|
||||||
</authorgroup>
|
|
||||||
|
|
||||||
<copyright>
|
|
||||||
<year>2014</year>
|
|
||||||
<holder>Apelete Seketeli</holder>
|
|
||||||
</copyright>
|
|
||||||
|
|
||||||
<legalnotice>
|
|
||||||
<para>
|
|
||||||
This documentation is free software; you can redistribute it
|
|
||||||
and/or modify it under the terms of the GNU General Public
|
|
||||||
License as published by the Free Software Foundation; either
|
|
||||||
version 2 of the License, or (at your option) any later version.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
This documentation is distributed in the hope that it will be
|
|
||||||
useful, but WITHOUT ANY WARRANTY; without even the implied
|
|
||||||
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
||||||
See the GNU General Public License for more details.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
You should have received a copy of the GNU General Public License
|
|
||||||
along with this documentation; if not, write to the Free Software
|
|
||||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
|
||||||
02111-1307 USA
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
For more details see the file COPYING in the Linux kernel source
|
|
||||||
tree.
|
|
||||||
</para>
|
|
||||||
</legalnotice>
|
|
||||||
</bookinfo>
|
|
||||||
|
|
||||||
<toc></toc>
|
|
||||||
|
|
||||||
<chapter id="introduction">
|
|
||||||
<title>Introduction</title>
|
|
||||||
<para>
|
|
||||||
The Linux MUSB subsystem is part of the larger Linux USB
|
|
||||||
subsystem. It provides support for embedded USB Device Controllers
|
|
||||||
(UDC) that do not use Universal Host Controller Interface (UHCI)
|
|
||||||
or Open Host Controller Interface (OHCI).
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Instead, these embedded UDC rely on the USB On-the-Go (OTG)
|
|
||||||
specification which they implement at least partially. The silicon
|
|
||||||
reference design used in most cases is the Multipoint USB
|
|
||||||
Highspeed Dual-Role Controller (MUSB HDRC) found in the Mentor
|
|
||||||
Graphics Inventra™ design.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
As a self-taught exercise I have written an MUSB glue layer for
|
|
||||||
the Ingenic JZ4740 SoC, modelled after the many MUSB glue layers
|
|
||||||
in the kernel source tree. This layer can be found at
|
|
||||||
drivers/usb/musb/jz4740.c. In this documentation I will walk
|
|
||||||
through the basics of the jz4740.c glue layer, explaining the
|
|
||||||
different pieces and what needs to be done in order to write your
|
|
||||||
own device glue layer.
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="linux-musb-basics">
|
|
||||||
<title>Linux MUSB Basics</title>
|
|
||||||
<para>
|
|
||||||
To get started on the topic, please read USB On-the-Go Basics (see
|
|
||||||
Resources) which provides an introduction to USB OTG operation at
|
|
||||||
the hardware level. A couple of wiki pages by Texas Instruments
|
|
||||||
and Analog Devices also provide an overview of the Linux kernel
|
|
||||||
MUSB configuration, albeit focused on some specific devices
|
|
||||||
provided by these companies. Finally, getting acquainted with the
|
|
||||||
USB specification at USB home page may come in handy, with
|
|
||||||
practical instance provided through the Writing USB Device Drivers
|
|
||||||
documentation (again, see Resources).
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Linux USB stack is a layered architecture in which the MUSB
|
|
||||||
controller hardware sits at the lowest. The MUSB controller driver
|
|
||||||
abstracts the MUSB controller hardware to the Linux USB stack.
|
|
||||||
</para>
|
|
||||||
<programlisting>
|
|
||||||
------------------------
|
|
||||||
| | <------- drivers/usb/gadget
|
|
||||||
| Linux USB Core Stack | <------- drivers/usb/host
|
|
||||||
| | <------- drivers/usb/core
|
|
||||||
------------------------
|
|
||||||
⬍
|
|
||||||
--------------------------
|
|
||||||
| | <------ drivers/usb/musb/musb_gadget.c
|
|
||||||
| MUSB Controller driver | <------ drivers/usb/musb/musb_host.c
|
|
||||||
| | <------ drivers/usb/musb/musb_core.c
|
|
||||||
--------------------------
|
|
||||||
⬍
|
|
||||||
---------------------------------
|
|
||||||
| MUSB Platform Specific Driver |
|
|
||||||
| | <-- drivers/usb/musb/jz4740.c
|
|
||||||
| aka "Glue Layer" |
|
|
||||||
---------------------------------
|
|
||||||
⬍
|
|
||||||
---------------------------------
|
|
||||||
| MUSB Controller Hardware |
|
|
||||||
---------------------------------
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
As outlined above, the glue layer is actually the platform
|
|
||||||
specific code sitting in between the controller driver and the
|
|
||||||
controller hardware.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Just like a Linux USB driver needs to register itself with the
|
|
||||||
Linux USB subsystem, the MUSB glue layer needs first to register
|
|
||||||
itself with the MUSB controller driver. This will allow the
|
|
||||||
controller driver to know about which device the glue layer
|
|
||||||
supports and which functions to call when a supported device is
|
|
||||||
detected or released; remember we are talking about an embedded
|
|
||||||
controller chip here, so no insertion or removal at run-time.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
All of this information is passed to the MUSB controller driver
|
|
||||||
through a platform_driver structure defined in the glue layer as:
|
|
||||||
</para>
|
|
||||||
<programlisting linenumbering="numbered">
|
|
||||||
static struct platform_driver jz4740_driver = {
|
|
||||||
.probe = jz4740_probe,
|
|
||||||
.remove = jz4740_remove,
|
|
||||||
.driver = {
|
|
||||||
.name = "musb-jz4740",
|
|
||||||
},
|
|
||||||
};
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
The probe and remove function pointers are called when a matching
|
|
||||||
device is detected and, respectively, released. The name string
|
|
||||||
describes the device supported by this glue layer. In the current
|
|
||||||
case it matches a platform_device structure declared in
|
|
||||||
arch/mips/jz4740/platform.c. Note that we are not using device
|
|
||||||
tree bindings here.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
In order to register itself to the controller driver, the glue
|
|
||||||
layer goes through a few steps, basically allocating the
|
|
||||||
controller hardware resources and initialising a couple of
|
|
||||||
circuits. To do so, it needs to keep track of the information used
|
|
||||||
throughout these steps. This is done by defining a private
|
|
||||||
jz4740_glue structure:
|
|
||||||
</para>
|
|
||||||
<programlisting linenumbering="numbered">
|
|
||||||
struct jz4740_glue {
|
|
||||||
struct device *dev;
|
|
||||||
struct platform_device *musb;
|
|
||||||
struct clk *clk;
|
|
||||||
};
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
The dev and musb members are both device structure variables. The
|
|
||||||
first one holds generic information about the device, since it's
|
|
||||||
the basic device structure, and the latter holds information more
|
|
||||||
closely related to the subsystem the device is registered to. The
|
|
||||||
clk variable keeps information related to the device clock
|
|
||||||
operation.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Let's go through the steps of the probe function that leads the
|
|
||||||
glue layer to register itself to the controller driver.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
N.B.: For the sake of readability each function will be split into
|
|
||||||
logical parts, each part being shown as if it was independent from
|
|
||||||
the others.
|
|
||||||
</para>
|
|
||||||
<programlisting linenumbering="numbered">
|
|
||||||
static int jz4740_probe(struct platform_device *pdev)
|
|
||||||
{
|
|
||||||
struct platform_device *musb;
|
|
||||||
struct jz4740_glue *glue;
|
|
||||||
struct clk *clk;
|
|
||||||
int ret;
|
|
||||||
|
|
||||||
glue = devm_kzalloc(&pdev->dev, sizeof(*glue), GFP_KERNEL);
|
|
||||||
if (!glue)
|
|
||||||
return -ENOMEM;
|
|
||||||
|
|
||||||
musb = platform_device_alloc("musb-hdrc", PLATFORM_DEVID_AUTO);
|
|
||||||
if (!musb) {
|
|
||||||
dev_err(&pdev->dev, "failed to allocate musb device\n");
|
|
||||||
return -ENOMEM;
|
|
||||||
}
|
|
||||||
|
|
||||||
clk = devm_clk_get(&pdev->dev, "udc");
|
|
||||||
if (IS_ERR(clk)) {
|
|
||||||
dev_err(&pdev->dev, "failed to get clock\n");
|
|
||||||
ret = PTR_ERR(clk);
|
|
||||||
goto err_platform_device_put;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = clk_prepare_enable(clk);
|
|
||||||
if (ret) {
|
|
||||||
dev_err(&pdev->dev, "failed to enable clock\n");
|
|
||||||
goto err_platform_device_put;
|
|
||||||
}
|
|
||||||
|
|
||||||
musb->dev.parent = &pdev->dev;
|
|
||||||
|
|
||||||
glue->dev = &pdev->dev;
|
|
||||||
glue->musb = musb;
|
|
||||||
glue->clk = clk;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
err_platform_device_put:
|
|
||||||
platform_device_put(musb);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
The first few lines of the probe function allocate and assign the
|
|
||||||
glue, musb and clk variables. The GFP_KERNEL flag (line 8) allows
|
|
||||||
the allocation process to sleep and wait for memory, thus being
|
|
||||||
usable in a blocking situation. The PLATFORM_DEVID_AUTO flag (line
|
|
||||||
12) allows automatic allocation and management of device IDs in
|
|
||||||
order to avoid device namespace collisions with explicit IDs. With
|
|
||||||
devm_clk_get() (line 18) the glue layer allocates the clock -- the
|
|
||||||
<literal>devm_</literal> prefix indicates that clk_get() is
|
|
||||||
managed: it automatically frees the allocated clock resource data
|
|
||||||
when the device is released -- and enables it.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Then come the registration steps:
|
|
||||||
</para>
|
|
||||||
<programlisting linenumbering="numbered">
|
|
||||||
static int jz4740_probe(struct platform_device *pdev)
|
|
||||||
{
|
|
||||||
struct musb_hdrc_platform_data *pdata = &jz4740_musb_platform_data;
|
|
||||||
|
|
||||||
pdata->platform_ops = &jz4740_musb_ops;
|
|
||||||
|
|
||||||
platform_set_drvdata(pdev, glue);
|
|
||||||
|
|
||||||
ret = platform_device_add_resources(musb, pdev->resource,
|
|
||||||
pdev->num_resources);
|
|
||||||
if (ret) {
|
|
||||||
dev_err(&pdev->dev, "failed to add resources\n");
|
|
||||||
goto err_clk_disable;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = platform_device_add_data(musb, pdata, sizeof(*pdata));
|
|
||||||
if (ret) {
|
|
||||||
dev_err(&pdev->dev, "failed to add platform_data\n");
|
|
||||||
goto err_clk_disable;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
err_clk_disable:
|
|
||||||
clk_disable_unprepare(clk);
|
|
||||||
err_platform_device_put:
|
|
||||||
platform_device_put(musb);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
The first step is to pass the device data privately held by the
|
|
||||||
glue layer on to the controller driver through
|
|
||||||
platform_set_drvdata() (line 7). Next is passing on the device
|
|
||||||
resources information, also privately held at that point, through
|
|
||||||
platform_device_add_resources() (line 9).
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Finally comes passing on the platform specific data to the
|
|
||||||
controller driver (line 16). Platform data will be discussed in
|
|
||||||
<link linkend="device-platform-data">Chapter 4</link>, but here
|
|
||||||
we are looking at the platform_ops function pointer (line 5) in
|
|
||||||
musb_hdrc_platform_data structure (line 3). This function
|
|
||||||
pointer allows the MUSB controller driver to know which function
|
|
||||||
to call for device operation:
|
|
||||||
</para>
|
|
||||||
<programlisting linenumbering="numbered">
|
|
||||||
static const struct musb_platform_ops jz4740_musb_ops = {
|
|
||||||
.init = jz4740_musb_init,
|
|
||||||
.exit = jz4740_musb_exit,
|
|
||||||
};
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
Here we have the minimal case where only init and exit functions
|
|
||||||
are called by the controller driver when needed. Fact is the
|
|
||||||
JZ4740 MUSB controller is a basic controller, lacking some
|
|
||||||
features found in other controllers, otherwise we may also have
|
|
||||||
pointers to a few other functions like a power management function
|
|
||||||
or a function to switch between OTG and non-OTG modes, for
|
|
||||||
instance.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
At that point of the registration process, the controller driver
|
|
||||||
actually calls the init function:
|
|
||||||
</para>
|
|
||||||
<programlisting linenumbering="numbered">
|
|
||||||
static int jz4740_musb_init(struct musb *musb)
|
|
||||||
{
|
|
||||||
musb->xceiv = usb_get_phy(USB_PHY_TYPE_USB2);
|
|
||||||
if (!musb->xceiv) {
|
|
||||||
pr_err("HS UDC: no transceiver configured\n");
|
|
||||||
return -ENODEV;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Silicon does not implement ConfigData register.
|
|
||||||
* Set dyn_fifo to avoid reading EP config from hardware.
|
|
||||||
*/
|
|
||||||
musb->dyn_fifo = true;
|
|
||||||
|
|
||||||
musb->isr = jz4740_musb_interrupt;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
The goal of jz4740_musb_init() is to get hold of the transceiver
|
|
||||||
driver data of the MUSB controller hardware and pass it on to the
|
|
||||||
MUSB controller driver, as usual. The transceiver is the circuitry
|
|
||||||
inside the controller hardware responsible for sending/receiving
|
|
||||||
the USB data. Since it is an implementation of the physical layer
|
|
||||||
of the OSI model, the transceiver is also referred to as PHY.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Getting hold of the MUSB PHY driver data is done with
|
|
||||||
usb_get_phy() which returns a pointer to the structure
|
|
||||||
containing the driver instance data. The next couple of
|
|
||||||
instructions (lines 12 and 14) are used as a quirk and to set up
|
|
||||||
IRQ handling respectively. Quirks and IRQ handling will be
|
|
||||||
discussed later in <link linkend="device-quirks">Chapter
|
|
||||||
5</link> and <link linkend="handling-irqs">Chapter 3</link>.
|
|
||||||
</para>
|
|
||||||
<programlisting linenumbering="numbered">
|
|
||||||
static int jz4740_musb_exit(struct musb *musb)
|
|
||||||
{
|
|
||||||
usb_put_phy(musb->xceiv);
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
Acting as the counterpart of init, the exit function releases the
|
|
||||||
MUSB PHY driver when the controller hardware itself is about to be
|
|
||||||
released.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Again, note that init and exit are fairly simple in this case due
|
|
||||||
to the basic set of features of the JZ4740 controller hardware.
|
|
||||||
When writing an musb glue layer for a more complex controller
|
|
||||||
hardware, you might need to take care of more processing in those
|
|
||||||
two functions.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Returning from the init function, the MUSB controller driver jumps
|
|
||||||
back into the probe function:
|
|
||||||
</para>
|
|
||||||
<programlisting linenumbering="numbered">
|
|
||||||
static int jz4740_probe(struct platform_device *pdev)
|
|
||||||
{
|
|
||||||
ret = platform_device_add(musb);
|
|
||||||
if (ret) {
|
|
||||||
dev_err(&pdev->dev, "failed to register musb device\n");
|
|
||||||
goto err_clk_disable;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
err_clk_disable:
|
|
||||||
clk_disable_unprepare(clk);
|
|
||||||
err_platform_device_put:
|
|
||||||
platform_device_put(musb);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
This is the last part of the device registration process where the
|
|
||||||
glue layer adds the controller hardware device to Linux kernel
|
|
||||||
device hierarchy: at this stage, all known information about the
|
|
||||||
device is passed on to the Linux USB core stack.
|
|
||||||
</para>
|
|
||||||
<programlisting linenumbering="numbered">
|
|
||||||
static int jz4740_remove(struct platform_device *pdev)
|
|
||||||
{
|
|
||||||
struct jz4740_glue *glue = platform_get_drvdata(pdev);
|
|
||||||
|
|
||||||
platform_device_unregister(glue->musb);
|
|
||||||
clk_disable_unprepare(glue->clk);
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
Acting as the counterpart of probe, the remove function unregisters
|
|
||||||
the MUSB controller hardware (line 5) and disables the clock (line
|
|
||||||
6), allowing it to be gated.
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="handling-irqs">
|
|
||||||
<title>Handling IRQs</title>
|
|
||||||
<para>
|
|
||||||
In addition to the MUSB controller hardware basic setup and
|
|
||||||
registration, the glue layer is also responsible for handling the
|
|
||||||
IRQs:
|
|
||||||
</para>
|
|
||||||
<programlisting linenumbering="numbered">
|
|
||||||
static irqreturn_t jz4740_musb_interrupt(int irq, void *__hci)
|
|
||||||
{
|
|
||||||
unsigned long flags;
|
|
||||||
irqreturn_t retval = IRQ_NONE;
|
|
||||||
struct musb *musb = __hci;
|
|
||||||
|
|
||||||
spin_lock_irqsave(&musb->lock, flags);
|
|
||||||
|
|
||||||
musb->int_usb = musb_readb(musb->mregs, MUSB_INTRUSB);
|
|
||||||
musb->int_tx = musb_readw(musb->mregs, MUSB_INTRTX);
|
|
||||||
musb->int_rx = musb_readw(musb->mregs, MUSB_INTRRX);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* The controller is gadget only, the state of the host mode IRQ bits is
|
|
||||||
* undefined. Mask them to make sure that the musb driver core will
|
|
||||||
* never see them set
|
|
||||||
*/
|
|
||||||
musb->int_usb &= MUSB_INTR_SUSPEND | MUSB_INTR_RESUME |
|
|
||||||
MUSB_INTR_RESET | MUSB_INTR_SOF;
|
|
||||||
|
|
||||||
if (musb->int_usb || musb->int_tx || musb->int_rx)
|
|
||||||
retval = musb_interrupt(musb);
|
|
||||||
|
|
||||||
spin_unlock_irqrestore(&musb->lock, flags);
|
|
||||||
|
|
||||||
return retval;
|
|
||||||
}
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
Here the glue layer mostly has to read the relevant hardware
|
|
||||||
registers and pass their values on to the controller driver which
|
|
||||||
will handle the actual event that triggered the IRQ.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The interrupt handler critical section is protected by the
|
|
||||||
spin_lock_irqsave() and counterpart spin_unlock_irqrestore()
|
|
||||||
functions (line 7 and 24 respectively), which prevent the
|
|
||||||
interrupt handler code from being run by two different threads at the
|
|
||||||
same time.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Then the relevant interrupt registers are read (line 9 to 11):
|
|
||||||
</para>
|
|
||||||
<itemizedlist>
|
|
||||||
<listitem>
|
|
||||||
<para>
|
|
||||||
MUSB_INTRUSB: indicates which USB interrupts are currently
|
|
||||||
active,
|
|
||||||
</para>
|
|
||||||
</listitem>
|
|
||||||
<listitem>
|
|
||||||
<para>
|
|
||||||
MUSB_INTRTX: indicates which of the interrupts for TX
|
|
||||||
endpoints are currently active,
|
|
||||||
</para>
|
|
||||||
</listitem>
|
|
||||||
<listitem>
|
|
||||||
<para>
|
|
||||||
MUSB_INTRRX: indicates which of the interrupts for RX
|
|
||||||
endpoints are currently active.
|
|
||||||
</para>
|
|
||||||
</listitem>
|
|
||||||
</itemizedlist>
|
|
||||||
<para>
|
|
||||||
Note that musb_readb() is used to read 8-bit registers at most,
|
|
||||||
while musb_readw() allows us to read at most 16-bit registers.
|
|
||||||
There are other functions that can be used depending on the size
|
|
||||||
of your device registers. See musb_io.h for more information.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Instruction on line 18 is another quirk specific to the JZ4740
|
|
||||||
USB device controller, which will be discussed later in <link
|
|
||||||
linkend="device-quirks">Chapter 5</link>.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The glue layer still needs to register the IRQ handler though.
|
|
||||||
Remember the instruction on line 14 of the init function:
|
|
||||||
</para>
|
|
||||||
<programlisting linenumbering="numbered">
|
|
||||||
static int jz4740_musb_init(struct musb *musb)
|
|
||||||
{
|
|
||||||
musb->isr = jz4740_musb_interrupt;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
This instruction sets a pointer to the glue layer IRQ handler
|
|
||||||
function, in order for the controller driver to call the handler
|
|
||||||
back when an IRQ comes from the controller hardware. The interrupt
|
|
||||||
handler is now implemented and registered.
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="device-platform-data">
|
|
||||||
<title>Device Platform Data</title>
|
|
||||||
<para>
|
|
||||||
In order to write an MUSB glue layer, you need to have some data
|
|
||||||
describing the hardware capabilities of your controller hardware,
|
|
||||||
which is called the platform data.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Platform data is specific to your hardware, though it may cover a
|
|
||||||
broad range of devices, and is generally found somewhere in the
|
|
||||||
arch/ directory, depending on your device architecture.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
For instance, platform data for the JZ4740 SoC is found in
|
|
||||||
arch/mips/jz4740/platform.c. In the platform.c file each device of
|
|
||||||
the JZ4740 SoC is described through a set of structures.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Here is the part of arch/mips/jz4740/platform.c that covers the
|
|
||||||
USB Device Controller (UDC):
|
|
||||||
</para>
|
|
||||||
<programlisting linenumbering="numbered">
|
|
||||||
/* USB Device Controller */
|
|
||||||
struct platform_device jz4740_udc_xceiv_device = {
|
|
||||||
.name = "usb_phy_gen_xceiv",
|
|
||||||
.id = 0,
|
|
||||||
};
|
|
||||||
|
|
||||||
static struct resource jz4740_udc_resources[] = {
|
|
||||||
[0] = {
|
|
||||||
.start = JZ4740_UDC_BASE_ADDR,
|
|
||||||
.end = JZ4740_UDC_BASE_ADDR + 0x10000 - 1,
|
|
||||||
.flags = IORESOURCE_MEM,
|
|
||||||
},
|
|
||||||
[1] = {
|
|
||||||
.start = JZ4740_IRQ_UDC,
|
|
||||||
.end = JZ4740_IRQ_UDC,
|
|
||||||
.flags = IORESOURCE_IRQ,
|
|
||||||
.name = "mc",
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
struct platform_device jz4740_udc_device = {
|
|
||||||
.name = "musb-jz4740",
|
|
||||||
.id = -1,
|
|
||||||
.dev = {
|
|
||||||
.dma_mask = &jz4740_udc_device.dev.coherent_dma_mask,
|
|
||||||
.coherent_dma_mask = DMA_BIT_MASK(32),
|
|
||||||
},
|
|
||||||
.num_resources = ARRAY_SIZE(jz4740_udc_resources),
|
|
||||||
.resource = jz4740_udc_resources,
|
|
||||||
};
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
The jz4740_udc_xceiv_device platform device structure (line 2)
|
|
||||||
describes the UDC transceiver with a name and id number.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
At the time of this writing, note that
|
|
||||||
"usb_phy_gen_xceiv" is the specific name to be used for
|
|
||||||
all transceivers that are either built-in with reference USB IP or
|
|
||||||
autonomous and don't require any PHY programming. You will need
|
|
||||||
to set CONFIG_NOP_USB_XCEIV=y in the kernel configuration to make
|
|
||||||
use of the corresponding transceiver driver. The id field could be
|
|
||||||
set to -1 (equivalent to PLATFORM_DEVID_NONE), -2 (equivalent to
|
|
||||||
PLATFORM_DEVID_AUTO) or start with 0 for the first device of this
|
|
||||||
kind if we want a specific id number.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The jz4740_udc_resources resource structure (line 7) defines the
|
|
||||||
UDC registers base addresses.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The first array (line 9 to 11) defines the UDC registers base
|
|
||||||
memory addresses: start points to the first register memory
|
|
||||||
address, end points to the last register memory address and the
|
|
||||||
flags member defines the type of resource we are dealing with. So
|
|
||||||
IORESOURCE_MEM is used to define the registers memory addresses.
|
|
||||||
The second array (line 14 to 17) defines the UDC IRQ registers
|
|
||||||
addresses. Since there is only one IRQ register available for the
|
|
||||||
JZ4740 UDC, start and end point at the same address. The
|
|
||||||
IORESOURCE_IRQ flag tells that we are dealing with IRQ resources,
|
|
||||||
and the name "mc" is in fact hard-coded in the MUSB core
|
|
||||||
in order for the controller driver to retrieve this IRQ resource
|
|
||||||
by querying it by its name.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Finally, the jz4740_udc_device platform device structure (line 21)
|
|
||||||
describes the UDC itself.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The "musb-jz4740" name (line 22) defines the MUSB
|
|
||||||
driver that is used for this device; remember this is in fact
|
|
||||||
the name that we used in the jz4740_driver platform driver
|
|
||||||
structure in <link linkend="linux-musb-basics">Chapter
|
|
||||||
2</link>. The id field (line 23) is set to -1 (equivalent to
|
|
||||||
PLATFORM_DEVID_NONE) since we do not need an id for the device:
|
|
||||||
the MUSB controller driver was already set to allocate an
|
|
||||||
automatic id in <link linkend="linux-musb-basics">Chapter
|
|
||||||
2</link>. In the dev field we care for DMA related information
|
|
||||||
here. The dma_mask field (line 25) defines the width of the DMA
|
|
||||||
mask that is going to be used, and coherent_dma_mask (line 26)
|
|
||||||
has the same purpose but for the alloc_coherent DMA mappings: in
|
|
||||||
both cases we are using a 32-bit mask. Then the resource field
|
|
||||||
(line 29) is simply a pointer to the resource structure defined
|
|
||||||
before, while the num_resources field (line 28) keeps track of
|
|
||||||
the number of arrays defined in the resource structure (in this
|
|
||||||
case there were two resource arrays defined before).
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
With this quick overview of the UDC platform data at the arch/
|
|
||||||
level now done, let's get back to the MUSB glue layer specific
|
|
||||||
platform data in drivers/usb/musb/jz4740.c:
|
|
||||||
</para>
|
|
||||||
<programlisting linenumbering="numbered">
|
|
||||||
static struct musb_hdrc_config jz4740_musb_config = {
|
|
||||||
/* Silicon does not implement USB OTG. */
|
|
||||||
.multipoint = 0,
|
|
||||||
/* Max EPs scanned, driver will decide which EP can be used. */
|
|
||||||
.num_eps = 4,
|
|
||||||
/* RAMbits needed to configure EPs from table */
|
|
||||||
.ram_bits = 9,
|
|
||||||
.fifo_cfg = jz4740_musb_fifo_cfg,
|
|
||||||
.fifo_cfg_size = ARRAY_SIZE(jz4740_musb_fifo_cfg),
|
|
||||||
};
|
|
||||||
|
|
||||||
static struct musb_hdrc_platform_data jz4740_musb_platform_data = {
|
|
||||||
.mode = MUSB_PERIPHERAL,
|
|
||||||
.config = &jz4740_musb_config,
|
|
||||||
};
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
First the glue layer configures some aspects of the controller
|
|
||||||
driver operation related to the controller hardware specifics.
|
|
||||||
This is done through the jz4740_musb_config musb_hdrc_config
|
|
||||||
structure.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Defining the OTG capability of the controller hardware, the
|
|
||||||
multipoint member (line 3) is set to 0 (equivalent to false)
|
|
||||||
since the JZ4740 UDC is not OTG compatible. Then num_eps (line
|
|
||||||
5) defines the number of USB endpoints of the controller
|
|
||||||
hardware, including endpoint 0: here we have 3 endpoints +
|
|
||||||
endpoint 0. Next is ram_bits (line 7) which is the width of the
|
|
||||||
RAM address bus for the MUSB controller hardware. This
|
|
||||||
information is needed when the controller driver cannot
|
|
||||||
automatically configure endpoints by reading the relevant
|
|
||||||
controller hardware registers. This issue will be discussed when
|
|
||||||
we get to device quirks in <link linkend="device-quirks">Chapter
|
|
||||||
5</link>. Last two fields (line 8 and 9) are also about device
|
|
||||||
quirks: fifo_cfg points to the USB endpoints configuration table
|
|
||||||
and fifo_cfg_size keeps track of the number of
|
|
||||||
entries in that configuration table. More on that later in <link
|
|
||||||
linkend="device-quirks">Chapter 5</link>.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Then this configuration is embedded inside
|
|
||||||
jz4740_musb_platform_data musb_hdrc_platform_data structure (line
|
|
||||||
12): config is a pointer to the configuration structure itself,
|
|
||||||
and mode tells the controller driver if the controller hardware
|
|
||||||
may be used as MUSB_HOST only, MUSB_PERIPHERAL only or MUSB_OTG
|
|
||||||
which is a dual mode.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Remember that jz4740_musb_platform_data is then used to convey
|
|
||||||
platform data information as we have seen in the probe function
|
|
||||||
in <link linkend="linux-musb-basics">Chapter 2</link>.
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="device-quirks">
|
|
||||||
<title>Device Quirks</title>
|
|
||||||
<para>
|
|
||||||
Completing the platform data specific to your device, you may also
|
|
||||||
need to write some code in the glue layer to work around some
|
|
||||||
device specific limitations. These quirks may be due to some
|
|
||||||
hardware bugs, or simply be the result of an incomplete
|
|
||||||
implementation of the USB On-the-Go specification.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The JZ4740 UDC exhibits such quirks, some of which we will discuss
|
|
||||||
here for the sake of insight even though these might not be found
|
|
||||||
in the controller hardware you are working on.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Let's get back to the init function first:
|
|
||||||
</para>
|
|
||||||
<programlisting linenumbering="numbered">
|
|
||||||
static int jz4740_musb_init(struct musb *musb)
|
|
||||||
{
|
|
||||||
musb->xceiv = usb_get_phy(USB_PHY_TYPE_USB2);
|
|
||||||
if (!musb->xceiv) {
|
|
||||||
pr_err("HS UDC: no transceiver configured\n");
|
|
||||||
return -ENODEV;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Silicon does not implement ConfigData register.
|
|
||||||
* Set dyn_fifo to avoid reading EP config from hardware.
|
|
||||||
*/
|
|
||||||
musb->dyn_fifo = true;
|
|
||||||
|
|
||||||
musb->isr = jz4740_musb_interrupt;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
Instruction on line 12 helps the MUSB controller driver to work
|
|
||||||
around the fact that the controller hardware is missing registers
|
|
||||||
that are used for USB endpoints configuration.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Without these registers, the controller driver is unable to read
|
|
||||||
the endpoints configuration from the hardware, so we use line 12
|
|
||||||
instruction to bypass reading the configuration from silicon, and
|
|
||||||
rely on a hard-coded table that describes the endpoints
|
|
||||||
configuration instead:
|
|
||||||
</para>
|
|
||||||
<programlisting linenumbering="numbered">
|
|
||||||
static struct musb_fifo_cfg jz4740_musb_fifo_cfg[] = {
|
|
||||||
{ .hw_ep_num = 1, .style = FIFO_TX, .maxpacket = 512, },
|
|
||||||
{ .hw_ep_num = 1, .style = FIFO_RX, .maxpacket = 512, },
|
|
||||||
{ .hw_ep_num = 2, .style = FIFO_TX, .maxpacket = 64, },
|
|
||||||
};
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
Looking at the configuration table above, we see that each
|
|
||||||
endpoint is described by three fields: hw_ep_num is the endpoint
|
|
||||||
number, style is its direction (either FIFO_TX for the controller
|
|
||||||
driver to send packets in the controller hardware, or FIFO_RX to
|
|
||||||
receive packets from hardware), and maxpacket defines the maximum
|
|
||||||
size of each data packet that can be transmitted over that
|
|
||||||
endpoint. Reading from the table, the controller driver knows that
|
|
||||||
endpoint 1 can be used to send and receive USB data packets of 512
|
|
||||||
bytes at once (this is in fact a bulk in/out endpoint), and
|
|
||||||
endpoint 2 can be used to send data packets of 64 bytes at once
|
|
||||||
(this is in fact an interrupt endpoint).
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Note that there is no information about endpoint 0 here: that one
|
|
||||||
is implemented by default in every silicon design, with a
|
|
||||||
predefined configuration according to the USB specification. For
|
|
||||||
more examples of endpoint configuration tables, see musb_core.c.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Let's now get back to the interrupt handler function:
|
|
||||||
</para>
|
|
||||||
<programlisting linenumbering="numbered">
|
|
||||||
static irqreturn_t jz4740_musb_interrupt(int irq, void *__hci)
|
|
||||||
{
|
|
||||||
unsigned long flags;
|
|
||||||
irqreturn_t retval = IRQ_NONE;
|
|
||||||
struct musb *musb = __hci;
|
|
||||||
|
|
||||||
spin_lock_irqsave(&musb->lock, flags);
|
|
||||||
|
|
||||||
musb->int_usb = musb_readb(musb->mregs, MUSB_INTRUSB);
|
|
||||||
musb->int_tx = musb_readw(musb->mregs, MUSB_INTRTX);
|
|
||||||
musb->int_rx = musb_readw(musb->mregs, MUSB_INTRRX);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* The controller is gadget only, the state of the host mode IRQ bits is
|
|
||||||
* undefined. Mask them to make sure that the musb driver core will
|
|
||||||
* never see them set
|
|
||||||
*/
|
|
||||||
musb->int_usb &= MUSB_INTR_SUSPEND | MUSB_INTR_RESUME |
|
|
||||||
MUSB_INTR_RESET | MUSB_INTR_SOF;
|
|
||||||
|
|
||||||
if (musb->int_usb || musb->int_tx || musb->int_rx)
|
|
||||||
retval = musb_interrupt(musb);
|
|
||||||
|
|
||||||
spin_unlock_irqrestore(&musb->lock, flags);
|
|
||||||
|
|
||||||
return retval;
|
|
||||||
}
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
Instruction on line 18 above is a way for the controller driver to
|
|
||||||
work around the fact that some interrupt bits used for USB host
|
|
||||||
mode operation are missing in the MUSB_INTRUSB register, thus left
|
|
||||||
in an undefined hardware state, since this MUSB controller
|
|
||||||
hardware is used in peripheral mode only. As a consequence, the
|
|
||||||
glue layer masks these missing bits out to avoid parasite
|
|
||||||
interrupts by doing a logical AND operation between the value read
|
|
||||||
from MUSB_INTRUSB and the bits that are actually implemented in
|
|
||||||
the register.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
These are only a couple of the quirks found in the JZ4740 USB
|
|
||||||
device controller. Some others were directly addressed in the MUSB
|
|
||||||
core since the fixes were generic enough to provide a better
|
|
||||||
handling of the issues for other controller hardware eventually.
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="conclusion">
|
|
||||||
<title>Conclusion</title>
|
|
||||||
<para>
|
|
||||||
Writing a Linux MUSB glue layer should be a more accessible task,
|
|
||||||
as this documentation tries to show the ins and outs of this
|
|
||||||
exercise.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The JZ4740 USB device controller being fairly simple, I hope its
|
|
||||||
glue layer serves as a good example for the curious mind. Used
|
|
||||||
with the current MUSB glue layers, this documentation should
|
|
||||||
provide enough guidance to get started; should anything get out
|
|
||||||
of hand, the linux-usb mailing list archive is another helpful
|
|
||||||
resource to browse through.
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="acknowledgements">
|
|
||||||
<title>Acknowledgements</title>
|
|
||||||
<para>
|
|
||||||
Many thanks to Lars-Peter Clausen and Maarten ter Huurne for
|
|
||||||
answering my questions while I was writing the JZ4740 glue layer
|
|
||||||
and for helping me out getting the code in good shape.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
I would also like to thank the Qi-Hardware community at large for
|
|
||||||
its cheerful guidance and support.
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="resources">
|
|
||||||
<title>Resources</title>
|
|
||||||
<para>
|
|
||||||
USB Home Page:
|
|
||||||
<ulink url="http://www.usb.org">http://www.usb.org</ulink>
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
linux-usb Mailing List Archives:
|
|
||||||
<ulink url="http://marc.info/?l=linux-usb">http://marc.info/?l=linux-usb</ulink>
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
USB On-the-Go Basics:
|
|
||||||
<ulink url="http://www.maximintegrated.com/app-notes/index.mvp/id/1822">http://www.maximintegrated.com/app-notes/index.mvp/id/1822</ulink>
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Writing USB Device Drivers:
|
|
||||||
<ulink url="https://www.kernel.org/doc/htmldocs/writing_usb_driver/index.html">https://www.kernel.org/doc/htmldocs/writing_usb_driver/index.html</ulink>
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Texas Instruments USB Configuration Wiki Page:
|
|
||||||
<ulink url="http://processors.wiki.ti.com/index.php/Usbgeneralpage">http://processors.wiki.ti.com/index.php/Usbgeneralpage</ulink>
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Analog Devices Blackfin MUSB Configuration:
|
|
||||||
<ulink url="http://docs.blackfin.uclinux.org/doku.php?id=linux-kernel:drivers:musb">http://docs.blackfin.uclinux.org/doku.php?id=linux-kernel:drivers:musb</ulink>
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
</book>
|
|
@@ -1,412 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
|
|
||||||
"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
|
|
||||||
|
|
||||||
<book id="USBDeviceDriver">
|
|
||||||
<bookinfo>
|
|
||||||
<title>Writing USB Device Drivers</title>
|
|
||||||
|
|
||||||
<authorgroup>
|
|
||||||
<author>
|
|
||||||
<firstname>Greg</firstname>
|
|
||||||
<surname>Kroah-Hartman</surname>
|
|
||||||
<affiliation>
|
|
||||||
<address>
|
|
||||||
<email>greg@kroah.com</email>
|
|
||||||
</address>
|
|
||||||
</affiliation>
|
|
||||||
</author>
|
|
||||||
</authorgroup>
|
|
||||||
|
|
||||||
<copyright>
|
|
||||||
<year>2001-2002</year>
|
|
||||||
<holder>Greg Kroah-Hartman</holder>
|
|
||||||
</copyright>
|
|
||||||
|
|
||||||
<legalnotice>
|
|
||||||
<para>
|
|
||||||
This documentation is free software; you can redistribute
|
|
||||||
it and/or modify it under the terms of the GNU General Public
|
|
||||||
License as published by the Free Software Foundation; either
|
|
||||||
version 2 of the License, or (at your option) any later
|
|
||||||
version.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
This program is distributed in the hope that it will be
|
|
||||||
useful, but WITHOUT ANY WARRANTY; without even the implied
|
|
||||||
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
||||||
See the GNU General Public License for more details.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
You should have received a copy of the GNU General Public
|
|
||||||
License along with this program; if not, write to the Free
|
|
||||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
|
|
||||||
MA 02111-1307 USA
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
For more details see the file COPYING in the source
|
|
||||||
distribution of Linux.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
This documentation is based on an article published in
|
|
||||||
Linux Journal Magazine, October 2001, Issue 90.
|
|
||||||
</para>
|
|
||||||
</legalnotice>
|
|
||||||
</bookinfo>
|
|
||||||
|
|
||||||
<toc></toc>
|
|
||||||
|
|
||||||
<chapter id="intro">
|
|
||||||
<title>Introduction</title>
|
|
||||||
<para>
|
|
||||||
The Linux USB subsystem has grown from supporting only two different
|
|
||||||
types of devices in the 2.2.7 kernel (mice and keyboards), to over 20
|
|
||||||
different types of devices in the 2.4 kernel. Linux currently supports
|
|
||||||
almost all USB class devices (standard types of devices like keyboards,
|
|
||||||
mice, modems, printers and speakers) and an ever-growing number of
|
|
||||||
vendor-specific devices (such as USB to serial converters, digital
|
|
||||||
cameras, Ethernet devices and MP3 players). For a full list of the
|
|
||||||
different USB devices currently supported, see Resources.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The remaining kinds of USB devices that do not have support on Linux are
|
|
||||||
almost all vendor-specific devices. Each vendor decides to implement a
|
|
||||||
custom protocol to talk to their device, so a custom driver usually needs
|
|
||||||
to be created. Some vendors are open with their USB protocols and help
|
|
||||||
with the creation of Linux drivers, while others do not publish them, and
|
|
||||||
developers are forced to reverse-engineer. See Resources for some links
|
|
||||||
to handy reverse-engineering tools.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Because each different protocol causes a new driver to be created, I have
|
|
||||||
written a generic USB driver skeleton, modelled after the pci-skeleton.c
|
|
||||||
file in the kernel source tree upon which many PCI network drivers have
|
|
||||||
been based. This USB skeleton can be found at drivers/usb/usb-skeleton.c
|
|
||||||
in the kernel source tree. In this article I will walk through the basics
|
|
||||||
of the skeleton driver, explaining the different pieces and what needs to
|
|
||||||
be done to customize it to your specific device.
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="basics">
|
|
||||||
<title>Linux USB Basics</title>
|
|
||||||
<para>
|
|
||||||
If you are going to write a Linux USB driver, please become familiar with
|
|
||||||
the USB protocol specification. It can be found, along with many other
|
|
||||||
useful documents, at the USB home page (see Resources). An excellent
|
|
||||||
introduction to the Linux USB subsystem can be found at the USB Working
|
|
||||||
Devices List (see Resources). It explains how the Linux USB subsystem is
|
|
||||||
structured and introduces the reader to the concept of USB urbs
|
|
||||||
(USB Request Blocks), which are essential to USB drivers.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The first thing a Linux USB driver needs to do is register itself with
|
|
||||||
the Linux USB subsystem, giving it some information about which devices
|
|
||||||
the driver supports and which functions to call when a device supported
|
|
||||||
by the driver is inserted or removed from the system. All of this
|
|
||||||
information is passed to the USB subsystem in the usb_driver structure.
|
|
||||||
The skeleton driver declares a usb_driver as:
|
|
||||||
</para>
|
|
||||||
<programlisting>
|
|
||||||
static struct usb_driver skel_driver = {
|
|
||||||
.name = "skeleton",
|
|
||||||
.probe = skel_probe,
|
|
||||||
.disconnect = skel_disconnect,
|
|
||||||
.fops = &skel_fops,
|
|
||||||
.minor = USB_SKEL_MINOR_BASE,
|
|
||||||
.id_table = skel_table,
|
|
||||||
};
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
The variable name is a string that describes the driver. It is used in
|
|
||||||
informational messages printed to the system log. The probe and
|
|
||||||
disconnect function pointers are called when a device that matches the
|
|
||||||
information provided in the id_table variable is either seen or removed.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The fops and minor variables are optional. Most USB drivers hook into
|
|
||||||
another kernel subsystem, such as the SCSI, network or TTY subsystem.
|
|
||||||
These types of drivers register themselves with the other kernel
|
|
||||||
subsystem, and any user-space interactions are provided through that
|
|
||||||
interface. But for drivers that do not have a matching kernel subsystem,
|
|
||||||
such as MP3 players or scanners, a method of interacting with user space
|
|
||||||
is needed. The USB subsystem provides a way to register a minor device
|
|
||||||
number and a set of file_operations function pointers that enable this
|
|
||||||
user-space interaction. The skeleton driver needs this kind of interface,
|
|
||||||
so it provides a minor starting number and a pointer to its
|
|
||||||
file_operations functions.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The USB driver is then registered with a call to usb_register, usually in
|
|
||||||
the driver's init function, as shown here:
|
|
||||||
</para>
|
|
||||||
<programlisting>
|
|
||||||
static int __init usb_skel_init(void)
|
|
||||||
{
|
|
||||||
int result;
|
|
||||||
|
|
||||||
/* register this driver with the USB subsystem */
|
|
||||||
result = usb_register(&skel_driver);
|
|
||||||
if (result < 0) {
|
|
||||||
err("usb_register failed for the "__FILE__ "driver."
|
|
||||||
"Error number %d", result);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
module_init(usb_skel_init);
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
When the driver is unloaded from the system, it needs to deregister
|
|
||||||
itself with the USB subsystem. This is done with the usb_deregister
|
|
||||||
function:
|
|
||||||
</para>
|
|
||||||
<programlisting>
|
|
||||||
static void __exit usb_skel_exit(void)
|
|
||||||
{
|
|
||||||
/* deregister this driver with the USB subsystem */
|
|
||||||
usb_deregister(&skel_driver);
|
|
||||||
}
|
|
||||||
module_exit(usb_skel_exit);
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
To enable the linux-hotplug system to load the driver automatically when
|
|
||||||
the device is plugged in, you need to create a MODULE_DEVICE_TABLE. The
|
|
||||||
following code tells the hotplug scripts that this module supports a
|
|
||||||
single device with a specific vendor and product ID:
|
|
||||||
</para>
|
|
||||||
<programlisting>
|
|
||||||
/* table of devices that work with this driver */
|
|
||||||
static struct usb_device_id skel_table [] = {
|
|
||||||
{ USB_DEVICE(USB_SKEL_VENDOR_ID, USB_SKEL_PRODUCT_ID) },
|
|
||||||
{ } /* Terminating entry */
|
|
||||||
};
|
|
||||||
MODULE_DEVICE_TABLE (usb, skel_table);
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
There are other macros that can be used in describing a usb_device_id for
|
|
||||||
drivers that support a whole class of USB drivers. See usb.h for more
|
|
||||||
information on this.
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="device">
|
|
||||||
<title>Device operation</title>
|
|
||||||
<para>
|
|
||||||
When a device is plugged into the USB bus that matches the device ID
|
|
||||||
pattern that your driver registered with the USB core, the probe function
|
|
||||||
is called. The usb_device structure, interface number and the interface ID
|
|
||||||
are passed to the function:
|
|
||||||
</para>
|
|
||||||
<programlisting>
|
|
||||||
static int skel_probe(struct usb_interface *interface,
|
|
||||||
const struct usb_device_id *id)
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
The driver now needs to verify that this device is actually one that it
|
|
||||||
can accept. If so, it returns 0.
|
|
||||||
If not, or if any error occurs during initialization, an error code
|
|
||||||
(such as <literal>-ENOMEM</literal> or <literal>-ENODEV</literal>)
|
|
||||||
is returned from the probe function.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
In the skeleton driver, we determine what end points are marked as bulk-in
|
|
||||||
and bulk-out. We create buffers to hold the data that will be sent and
|
|
||||||
received from the device, and a USB urb to write data to the device is
|
|
||||||
initialized.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Conversely, when the device is removed from the USB bus, the disconnect
|
|
||||||
function is called with the device pointer. The driver needs to clean any
|
|
||||||
private data that has been allocated at this time and to shut down any
|
|
||||||
pending urbs that are in the USB system.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Now that the device is plugged into the system and the driver is bound to
|
|
||||||
the device, any of the functions in the file_operations structure that
|
|
||||||
were passed to the USB subsystem will be called from a user program trying
|
|
||||||
to talk to the device. The first function called will be open, as the
|
|
||||||
program tries to open the device for I/O. We increment our private usage
|
|
||||||
count and save a pointer to our internal structure in the file
|
|
||||||
structure. This is done so that future calls to file operations will
|
|
||||||
enable the driver to determine which device the user is addressing. All
|
|
||||||
of this is done with the following code:
|
|
||||||
</para>
|
|
||||||
<programlisting>
|
|
||||||
/* increment our usage count for the module */
|
|
||||||
++skel->open_count;
|
|
||||||
|
|
||||||
/* save our object in the file's private structure */
|
|
||||||
file->private_data = dev;
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
After the open function is called, the read and write functions are called
|
|
||||||
to receive and send data to the device. In the skel_write function, we
|
|
||||||
receive a pointer to some data that the user wants to send to the device
|
|
||||||
and the size of the data. The function determines how much data it can
|
|
||||||
send to the device based on the size of the write urb it has created (this
|
|
||||||
size depends on the size of the bulk out end point that the device has).
|
|
||||||
Then it copies the data from user space to kernel space, points the urb to
|
|
||||||
the data and submits the urb to the USB subsystem. This can be seen in
|
|
||||||
the following code:
|
|
||||||
</para>
|
|
||||||
<programlisting>
|
|
||||||
/* we can only write as much as 1 urb will hold */
|
|
||||||
bytes_written = (count > skel->bulk_out_size) ? skel->bulk_out_size : count;
|
|
||||||
|
|
||||||
/* copy the data from user space into our urb */
|
|
||||||
copy_from_user(skel->write_urb->transfer_buffer, buffer, bytes_written);
|
|
||||||
|
|
||||||
/* set up our urb */
|
|
||||||
usb_fill_bulk_urb(skel->write_urb,
|
|
||||||
skel->dev,
|
|
||||||
usb_sndbulkpipe(skel->dev, skel->bulk_out_endpointAddr),
|
|
||||||
skel->write_urb->transfer_buffer,
|
|
||||||
bytes_written,
|
|
||||||
skel_write_bulk_callback,
|
|
||||||
skel);
|
|
||||||
|
|
||||||
/* send the data out the bulk port */
|
|
||||||
result = usb_submit_urb(skel->write_urb);
|
|
||||||
if (result) {
|
|
||||||
err("Failed submitting write urb, error %d", result);
|
|
||||||
}
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
When the write urb is filled up with the proper information using the
|
|
||||||
usb_fill_bulk_urb function, we point the urb's completion callback to call our
|
|
||||||
own skel_write_bulk_callback function. This function is called when the
|
|
||||||
urb is finished by the USB subsystem. The callback function is called in
|
|
||||||
interrupt context, so caution must be taken not to do very much processing
|
|
||||||
at that time. Our implementation of skel_write_bulk_callback merely
|
|
||||||
reports if the urb was completed successfully or not and then returns.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The read function works a bit differently from the write function in that
|
|
||||||
we do not use an urb to transfer data from the device to the driver.
|
|
||||||
Instead we call the usb_bulk_msg function, which can be used to send or
|
|
||||||
receive data from a device without having to create urbs and handle
|
|
||||||
urb completion callback functions. We call the usb_bulk_msg function,
|
|
||||||
giving it a buffer into which to place any data received from the device
|
|
||||||
and a timeout value. If the timeout period expires without receiving any
|
|
||||||
data from the device, the function will fail and return an error message.
|
|
||||||
This can be shown with the following code:
|
|
||||||
</para>
|
|
||||||
<programlisting>
|
|
||||||
/* do an immediate bulk read to get data from the device */
|
|
||||||
retval = usb_bulk_msg (skel->dev,
|
|
||||||
usb_rcvbulkpipe (skel->dev,
|
|
||||||
skel->bulk_in_endpointAddr),
|
|
||||||
skel->bulk_in_buffer,
|
|
||||||
skel->bulk_in_size,
|
|
||||||
&count, HZ*10);
|
|
||||||
/* if the read was successful, copy the data to user space */
|
|
||||||
if (!retval) {
|
|
||||||
if (copy_to_user (buffer, skel->bulk_in_buffer, count))
|
|
||||||
retval = -EFAULT;
|
|
||||||
else
|
|
||||||
retval = count;
|
|
||||||
}
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
The usb_bulk_msg function can be very useful for doing single reads or
|
|
||||||
writes to a device; however, if you need to read or write constantly to a
|
|
||||||
device, it is recommended to set up your own urbs and submit them to the
|
|
||||||
USB subsystem.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
When the user program releases the file handle that it has been using to
|
|
||||||
talk to the device, the release function in the driver is called. In this
|
|
||||||
function we decrement our private usage count and wait for possible
|
|
||||||
pending writes:
|
|
||||||
</para>
|
|
||||||
<programlisting>
|
|
||||||
/* decrement our usage count for the device */
|
|
||||||
--skel->open_count;
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
One of the more difficult problems that USB drivers must be able to handle
|
|
||||||
smoothly is the fact that the USB device may be removed from the system at
|
|
||||||
any point in time, even if a program is currently talking to it. It needs
|
|
||||||
to be able to shut down any current reads and writes and notify the
|
|
||||||
user-space programs that the device is no longer there. The following
|
|
||||||
code (function <function>skel_delete</function>)
|
|
||||||
is an example of how to do this: </para>
|
|
||||||
<programlisting>
|
|
||||||
static inline void skel_delete (struct usb_skel *dev)
|
|
||||||
{
|
|
||||||
kfree (dev->bulk_in_buffer);
|
|
||||||
if (dev->bulk_out_buffer != NULL)
|
|
||||||
usb_free_coherent (dev->udev, dev->bulk_out_size,
|
|
||||||
dev->bulk_out_buffer,
|
|
||||||
dev->write_urb->transfer_dma);
|
|
||||||
usb_free_urb (dev->write_urb);
|
|
||||||
kfree (dev);
|
|
||||||
}
|
|
||||||
</programlisting>
|
|
||||||
<para>
|
|
||||||
If a program currently has an open handle to the device, we reset the flag
|
|
||||||
<literal>device_present</literal>. For
|
|
||||||
every read, write, release and other functions that expect a device to be
|
|
||||||
present, the driver first checks this flag to see if the device is
|
|
||||||
still present. If not, it realizes that the device has disappeared, and a
|
|
||||||
-ENODEV error is returned to the user-space program. When the release
|
|
||||||
function is eventually called, it determines if there is no device
|
|
||||||
and if not, it does the cleanup that the skel_disconnect
|
|
||||||
function normally does if there are no open files on the device (see
|
|
||||||
Listing 5).
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="iso">
|
|
||||||
<title>Isochronous Data</title>
|
|
||||||
<para>
|
|
||||||
This usb-skeleton driver does not have any examples of interrupt or
|
|
||||||
isochronous data being sent to or from the device. Interrupt data is sent
|
|
||||||
almost exactly as bulk data is, with a few minor exceptions. Isochronous
|
|
||||||
data works differently with continuous streams of data being sent to or
|
|
||||||
from the device. The audio and video camera drivers are very good examples
|
|
||||||
of drivers that handle isochronous data and will be useful if you also
|
|
||||||
need to do this.
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="Conclusion">
|
|
||||||
<title>Conclusion</title>
|
|
||||||
<para>
|
|
||||||
Writing Linux USB device drivers is not a difficult task as the
|
|
||||||
usb-skeleton driver shows. This driver, combined with the other current
|
|
||||||
USB drivers, should provide enough examples to help a beginning author
|
|
||||||
create a working driver in a minimal amount of time. The linux-usb-devel
|
|
||||||
mailing list archives also contain a lot of helpful information.
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
<chapter id="resources">
|
|
||||||
<title>Resources</title>
|
|
||||||
<para>
|
|
||||||
The Linux USB Project: <ulink url="http://www.linux-usb.org">http://www.linux-usb.org/</ulink>
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Linux Hotplug Project: <ulink url="http://linux-hotplug.sourceforge.net">http://linux-hotplug.sourceforge.net/</ulink>
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Linux USB Working Devices List: <ulink url="http://www.qbik.ch/usb/devices">http://www.qbik.ch/usb/devices/</ulink>
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
linux-usb-devel Mailing List Archives: <ulink url="http://marc.theaimsgroup.com/?l=linux-usb-devel">http://marc.theaimsgroup.com/?l=linux-usb-devel</ulink>
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Programming Guide for Linux USB Device Drivers: <ulink url="http://usb.cs.tum.edu/usbdoc">http://usb.cs.tum.edu/usbdoc</ulink>
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
USB Home Page: <ulink url="http://www.usb.org">http://www.usb.org</ulink>
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
</book>
|
|
@@ -59,9 +59,9 @@
|
|||||||
/* Fixed header pattern */
|
/* Fixed header pattern */
|
||||||
header: .byte 0x00,0xff,0xff,0xff,0xff,0xff,0xff,0x00
|
header: .byte 0x00,0xff,0xff,0xff,0xff,0xff,0xff,0x00
|
||||||
|
|
||||||
mfg_id: .word swap16(mfgname2id(MFG_LNX1, MFG_LNX2, MFG_LNX3))
|
mfg_id: .hword swap16(mfgname2id(MFG_LNX1, MFG_LNX2, MFG_LNX3))
|
||||||
|
|
||||||
prod_code: .word 0
|
prod_code: .hword 0
|
||||||
|
|
||||||
/* Serial number. 32 bits, little endian. */
|
/* Serial number. 32 bits, little endian. */
|
||||||
serial_number: .long SERIAL
|
serial_number: .long SERIAL
|
||||||
@@ -177,7 +177,7 @@ std_vres: .byte (XY_RATIO<<6)+VFREQ-60
|
|||||||
|
|
||||||
descriptor1:
|
descriptor1:
|
||||||
/* Pixel clock in 10 kHz units. (0.-655.35 MHz, little-endian) */
|
/* Pixel clock in 10 kHz units. (0.-655.35 MHz, little-endian) */
|
||||||
clock: .word CLOCK/10
|
clock: .hword CLOCK/10
|
||||||
|
|
||||||
/* Horizontal active pixels 8 lsbits (0-4095) */
|
/* Horizontal active pixels 8 lsbits (0-4095) */
|
||||||
x_act_lsb: .byte XPIX&0xff
|
x_act_lsb: .byte XPIX&0xff
|
||||||
|
@@ -12,3 +12,13 @@ pci.txt
|
|||||||
- info on the PCI subsystem for device driver authors
|
- info on the PCI subsystem for device driver authors
|
||||||
pcieaer-howto.txt
|
pcieaer-howto.txt
|
||||||
- the PCI Express Advanced Error Reporting Driver Guide HOWTO
|
- the PCI Express Advanced Error Reporting Driver Guide HOWTO
|
||||||
|
endpoint/pci-endpoint.txt
|
||||||
|
- guide to add endpoint controller driver and endpoint function driver.
|
||||||
|
endpoint/pci-endpoint-cfs.txt
|
||||||
|
- guide to use configfs to configure the PCI endpoint function.
|
||||||
|
endpoint/pci-test-function.txt
|
||||||
|
- specification of *PCI test* function device.
|
||||||
|
endpoint/pci-test-howto.txt
|
||||||
|
- userguide for PCI endpoint test function.
|
||||||
|
endpoint/function/binding/
|
||||||
|
- binding documentation for PCI endpoint function
|
||||||
|
17
Documentation/PCI/endpoint/function/binding/pci-test.txt
Normal file
17
Documentation/PCI/endpoint/function/binding/pci-test.txt
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
PCI TEST ENDPOINT FUNCTION
|
||||||
|
|
||||||
|
name: Should be "pci_epf_test" to bind to the pci_epf_test driver.
|
||||||
|
|
||||||
|
Configurable Fields:
|
||||||
|
vendorid : should be 0x104c
|
||||||
|
deviceid : should be 0xb500 for DRA74x and 0xb501 for DRA72x
|
||||||
|
revid : don't care
|
||||||
|
progif_code : don't care
|
||||||
|
subclass_code : don't care
|
||||||
|
baseclass_code : should be 0xff
|
||||||
|
cache_line_size : don't care
|
||||||
|
subsys_vendor_id : don't care
|
||||||
|
subsys_id : don't care
|
||||||
|
interrupt_pin : Should be 1 - INTA, 2 - INTB, 3 - INTC, 4 - INTD
|
||||||
|
msi_interrupts : Should be 1 to 32 depending on the number of MSI interrupts
|
||||||
|
to test
|
105
Documentation/PCI/endpoint/pci-endpoint-cfs.txt
Normal file
105
Documentation/PCI/endpoint/pci-endpoint-cfs.txt
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
CONFIGURING PCI ENDPOINT USING CONFIGFS
|
||||||
|
Kishon Vijay Abraham I <kishon@ti.com>
|
||||||
|
|
||||||
|
The PCI Endpoint Core exposes configfs entry (pci_ep) to configure the
|
||||||
|
PCI endpoint function and to bind the endpoint function
|
||||||
|
with the endpoint controller. (For introducing other mechanisms to
|
||||||
|
configure the PCI Endpoint Function refer to [1]).
|
||||||
|
|
||||||
|
*) Mounting configfs
|
||||||
|
|
||||||
|
The PCI Endpoint Core layer creates pci_ep directory in the mounted configfs
|
||||||
|
directory. configfs can be mounted using the following command.
|
||||||
|
|
||||||
|
mount -t configfs none /sys/kernel/config
|
||||||
|
|
||||||
|
*) Directory Structure
|
||||||
|
|
||||||
|
The pci_ep configfs has two directories at its root: controllers and
|
||||||
|
functions. Every EPC device present in the system will have an entry in
|
||||||
|
the *controllers* directory and every EPF driver present in the system
|
||||||
|
will have an entry in the *functions* directory.
|
||||||
|
|
||||||
|
/sys/kernel/config/pci_ep/
|
||||||
|
.. controllers/
|
||||||
|
.. functions/
|
||||||
|
|
||||||
|
*) Creating EPF Device
|
||||||
|
|
||||||
|
Every registered EPF driver will be listed in the functions directory. The
|
||||||
|
entries corresponding to EPF driver will be created by the EPF core.
|
||||||
|
|
||||||
|
/sys/kernel/config/pci_ep/functions/
|
||||||
|
.. <EPF Driver1>/
|
||||||
|
... <EPF Device 11>/
|
||||||
|
... <EPF Device 21>/
|
||||||
|
.. <EPF Driver2>/
|
||||||
|
... <EPF Device 12>/
|
||||||
|
... <EPF Device 22>/
|
||||||
|
|
||||||
|
In order to create a <EPF device> of the type probed by <EPF Driver>, the
|
||||||
|
user has to create a directory inside <EPF DriverN>.
|
||||||
|
|
||||||
|
Every <EPF device> directory consists of the following entries that can be
|
||||||
|
used to configure the standard configuration header of the endpoint function.
|
||||||
|
(These entries are created by the framework when any new <EPF Device> is
|
||||||
|
created)
|
||||||
|
|
||||||
|
.. <EPF Driver1>/
|
||||||
|
... <EPF Device 11>/
|
||||||
|
... vendorid
|
||||||
|
... deviceid
|
||||||
|
... revid
|
||||||
|
... progif_code
|
||||||
|
... subclass_code
|
||||||
|
... baseclass_code
|
||||||
|
... cache_line_size
|
||||||
|
... subsys_vendor_id
|
||||||
|
... subsys_id
|
||||||
|
... interrupt_pin
|
||||||
|
|
||||||
|
*) EPC Device
|
||||||
|
|
||||||
|
Every registered EPC device will be listed in controllers directory. The
|
||||||
|
entries corresponding to EPC device will be created by the EPC core.
|
||||||
|
|
||||||
|
/sys/kernel/config/pci_ep/controllers/
|
||||||
|
.. <EPC Device1>/
|
||||||
|
... <Symlink EPF Device11>/
|
||||||
|
... <Symlink EPF Device12>/
|
||||||
|
... start
|
||||||
|
.. <EPC Device2>/
|
||||||
|
... <Symlink EPF Device21>/
|
||||||
|
... <Symlink EPF Device22>/
|
||||||
|
... start
|
||||||
|
|
||||||
|
The <EPC Device> directory will have a list of symbolic links to
|
||||||
|
<EPF Device>. These symbolic links should be created by the user to
|
||||||
|
represent the functions present in the endpoint device.
|
||||||
|
|
||||||
|
The <EPC Device> directory will also have a *start* field. Once
|
||||||
|
"1" is written to this field, the endpoint device will be ready to
|
||||||
|
establish the link with the host. This is usually done after
|
||||||
|
all the EPF devices are created and linked with the EPC device.
|
||||||
|
|
||||||
|
|
||||||
|
| controllers/
|
||||||
|
| <Directory: EPC name>/
|
||||||
|
| <Symbolic Link: Function>
|
||||||
|
| start
|
||||||
|
| functions/
|
||||||
|
| <Directory: EPF driver>/
|
||||||
|
| <Directory: EPF device>/
|
||||||
|
| vendorid
|
||||||
|
| deviceid
|
||||||
|
| revid
|
||||||
|
| progif_code
|
||||||
|
| subclass_code
|
||||||
|
| baseclass_code
|
||||||
|
| cache_line_size
|
||||||
|
| subsys_vendor_id
|
||||||
|
| subsys_id
|
||||||
|
| interrupt_pin
|
||||||
|
| function
|
||||||
|
|
||||||
|
[1] -> Documentation/PCI/endpoint/pci-endpoint.txt
|
215
Documentation/PCI/endpoint/pci-endpoint.txt
Normal file
215
Documentation/PCI/endpoint/pci-endpoint.txt
Normal file
@@ -0,0 +1,215 @@
|
|||||||
|
PCI ENDPOINT FRAMEWORK
|
||||||
|
Kishon Vijay Abraham I <kishon@ti.com>
|
||||||
|
|
||||||
|
This document is a guide to use the PCI Endpoint Framework in order to create
|
||||||
|
endpoint controller driver, endpoint function driver, and using configfs
|
||||||
|
interface to bind the function driver to the controller driver.
|
||||||
|
|
||||||
|
1. Introduction
|
||||||
|
|
||||||
|
Linux has a comprehensive PCI subsystem to support PCI controllers that
|
||||||
|
operate in Root Complex mode. The subsystem has capability to scan PCI bus,
|
||||||
|
assign memory resources and IRQ resources, load PCI driver (based on
|
||||||
|
vendor ID, device ID), support other services like hot-plug, power management,
|
||||||
|
advanced error reporting and virtual channels.
|
||||||
|
|
||||||
|
However the PCI controller IP integrated in some SoCs is capable of operating
|
||||||
|
either in Root Complex mode or Endpoint mode. PCI Endpoint Framework will
|
||||||
|
add endpoint mode support in Linux. This will help to run Linux in an
|
||||||
|
EP system which can have a wide variety of use cases from testing or
|
||||||
|
validation, co-processor accelerator, etc.
|
||||||
|
|
||||||
|
2. PCI Endpoint Core
|
||||||
|
|
||||||
|
The PCI Endpoint Core layer comprises 3 components: the Endpoint Controller
|
||||||
|
library, the Endpoint Function library, and the configfs layer to bind the
|
||||||
|
endpoint function with the endpoint controller.
|
||||||
|
|
||||||
|
2.1 PCI Endpoint Controller(EPC) Library
|
||||||
|
|
||||||
|
The EPC library provides APIs to be used by the controller that can operate
|
||||||
|
in endpoint mode. It also provides APIs to be used by function driver/library
|
||||||
|
in order to implement a particular endpoint function.
|
||||||
|
|
||||||
|
2.1.1 APIs for the PCI controller Driver
|
||||||
|
|
||||||
|
This section lists the APIs that the PCI Endpoint core provides to be used
|
||||||
|
by the PCI controller driver.
|
||||||
|
|
||||||
|
*) devm_pci_epc_create()/pci_epc_create()
|
||||||
|
|
||||||
|
The PCI controller driver should implement the following ops:
|
||||||
|
* write_header: ops to populate configuration space header
|
||||||
|
* set_bar: ops to configure the BAR
|
||||||
|
* clear_bar: ops to reset the BAR
|
||||||
|
* alloc_addr_space: ops to allocate in PCI controller address space
|
||||||
|
* free_addr_space: ops to free the allocated address space
|
||||||
|
* raise_irq: ops to raise a legacy or MSI interrupt
|
||||||
|
* start: ops to start the PCI link
|
||||||
|
* stop: ops to stop the PCI link
|
||||||
|
|
||||||
|
The PCI controller driver can then create a new EPC device by invoking
|
||||||
|
devm_pci_epc_create()/pci_epc_create().
|
||||||
|
|
||||||
|
*) devm_pci_epc_destroy()/pci_epc_destroy()
|
||||||
|
|
||||||
|
The PCI controller driver can destroy the EPC device created by either
|
||||||
|
devm_pci_epc_create() or pci_epc_create() using devm_pci_epc_destroy() or
|
||||||
|
pci_epc_destroy().
|
||||||
|
|
||||||
|
*) pci_epc_linkup()
|
||||||
|
|
||||||
|
In order to notify all the function devices that the EPC device to which
|
||||||
|
they are linked has established a link with the host, the PCI controller
|
||||||
|
driver should invoke pci_epc_linkup().
|
||||||
|
|
||||||
|
*) pci_epc_mem_init()
|
||||||
|
|
||||||
|
Initialize the pci_epc_mem structure used for allocating EPC addr space.
|
||||||
|
|
||||||
|
*) pci_epc_mem_exit()
|
||||||
|
|
||||||
|
Cleanup the pci_epc_mem structure allocated during pci_epc_mem_init().
|
||||||
|
|
||||||
|
2.1.2 APIs for the PCI Endpoint Function Driver
|
||||||
|
|
||||||
|
This section lists the APIs that the PCI Endpoint core provides to be used
|
||||||
|
by the PCI endpoint function driver.
|
||||||
|
|
||||||
|
*) pci_epc_write_header()
|
||||||
|
|
||||||
|
The PCI endpoint function driver should use pci_epc_write_header() to
|
||||||
|
write the standard configuration header to the endpoint controller.
|
||||||
|
|
||||||
|
*) pci_epc_set_bar()
|
||||||
|
|
||||||
|
The PCI endpoint function driver should use pci_epc_set_bar() to configure
|
||||||
|
the Base Address Register in order for the host to assign PCI addr space.
|
||||||
|
Register space of the function driver is usually configured
|
||||||
|
using this API.
|
||||||
|
|
||||||
|
*) pci_epc_clear_bar()
|
||||||
|
|
||||||
|
The PCI endpoint function driver should use pci_epc_clear_bar() to reset
|
||||||
|
the BAR.
|
||||||
|
|
||||||
|
*) pci_epc_raise_irq()
|
||||||
|
|
||||||
|
The PCI endpoint function driver should use pci_epc_raise_irq() to raise
|
||||||
|
Legacy Interrupt or MSI Interrupt.
|
||||||
|
|
||||||
|
*) pci_epc_mem_alloc_addr()
|
||||||
|
|
||||||
|
The PCI endpoint function driver should use pci_epc_mem_alloc_addr() to
|
||||||
|
allocate memory address from EPC addr space which is required to access
|
||||||
|
RC's buffer.
|
||||||
|
|
||||||
|
*) pci_epc_mem_free_addr()
|
||||||
|
|
||||||
|
The PCI endpoint function driver should use pci_epc_mem_free_addr() to
|
||||||
|
free the memory space allocated using pci_epc_mem_alloc_addr().
|
||||||
|
|
||||||
|
2.1.3 Other APIs
|
||||||
|
|
||||||
|
There are other APIs provided by the EPC library. These are used for binding
|
||||||
|
the EPF device with EPC device. pci-ep-cfs.c can be used as reference for
|
||||||
|
using these APIs.
|
||||||
|
|
||||||
|
*) pci_epc_get()
|
||||||
|
|
||||||
|
Get a reference to the PCI endpoint controller based on the device name of
|
||||||
|
the controller.
|
||||||
|
|
||||||
|
*) pci_epc_put()
|
||||||
|
|
||||||
|
Release the reference to the PCI endpoint controller obtained using
|
||||||
|
pci_epc_get().
|
||||||
|
|
||||||
|
*) pci_epc_add_epf()
|
||||||
|
|
||||||
|
Add a PCI endpoint function to a PCI endpoint controller. A PCIe device
|
||||||
|
can have up to 8 functions according to the specification.
|
||||||
|
|
||||||
|
*) pci_epc_remove_epf()
|
||||||
|
|
||||||
|
Remove the PCI endpoint function from PCI endpoint controller.
|
||||||
|
|
||||||
|
*) pci_epc_start()
|
||||||
|
|
||||||
|
The PCI endpoint function driver should invoke pci_epc_start() once it
|
||||||
|
has configured the endpoint function and wants to start the PCI link.
|
||||||
|
|
||||||
|
*) pci_epc_stop()
|
||||||
|
|
||||||
|
The PCI endpoint function driver should invoke pci_epc_stop() to stop
|
||||||
|
the PCI link.
|
||||||
|
|
||||||
|
2.2 PCI Endpoint Function(EPF) Library
|
||||||
|
|
||||||
|
The EPF library provides APIs to be used by the function driver and the EPC
|
||||||
|
library to provide endpoint mode functionality.
|
||||||
|
|
||||||
|
2.2.1 APIs for the PCI Endpoint Function Driver
|
||||||
|
|
||||||
|
This section lists the APIs that the PCI Endpoint core provides to be used
|
||||||
|
by the PCI endpoint function driver.
|
||||||
|
|
||||||
|
*) pci_epf_register_driver()
|
||||||
|
|
||||||
|
The PCI Endpoint Function driver should implement the following ops:
|
||||||
|
* bind: ops to perform when an EPC device has been bound to EPF device
|
||||||
|
* unbind: ops to perform when a binding has been lost between an EPC
|
||||||
|
device and EPF device
|
||||||
|
* linkup: ops to perform when the EPC device has established a
|
||||||
|
connection with a host system
|
||||||
|
|
||||||
|
The PCI Function driver can then register the PCI EPF driver by using
|
||||||
|
pci_epf_register_driver().
|
||||||
|
|
||||||
|
*) pci_epf_unregister_driver()
|
||||||
|
|
||||||
|
The PCI Function driver can unregister the PCI EPF driver by using
|
||||||
|
pci_epf_unregister_driver().
|
||||||
|
|
||||||
|
*) pci_epf_alloc_space()
|
||||||
|
|
||||||
|
The PCI Function driver can allocate space for a particular BAR using
|
||||||
|
pci_epf_alloc_space().
|
||||||
|
|
||||||
|
*) pci_epf_free_space()
|
||||||
|
|
||||||
|
The PCI Function driver can free the allocated space
|
||||||
|
(using pci_epf_alloc_space) by invoking pci_epf_free_space().
|
||||||
|
|
||||||
|
2.2.2 APIs for the PCI Endpoint Controller Library
|
||||||
|
This section lists the APIs that the PCI Endpoint core provides to be used
|
||||||
|
by the PCI endpoint controller library.
|
||||||
|
|
||||||
|
*) pci_epf_linkup()
|
||||||
|
|
||||||
|
The PCI endpoint controller library invokes pci_epf_linkup() when the
|
||||||
|
EPC device has established the connection to the host.
|
||||||
|
|
||||||
|
2.2.3 Other APIs
|
||||||
|
There are other APIs provided by the EPF library. These are used to notify
|
||||||
|
the function driver when the EPF device is bound to the EPC device.
|
||||||
|
pci-ep-cfs.c can be used as reference for using these APIs.
|
||||||
|
|
||||||
|
*) pci_epf_create()
|
||||||
|
|
||||||
|
Create a new PCI EPF device by passing the name of the PCI EPF device.
|
||||||
|
This name will be used to bind the EPF device to an EPF driver.
|
||||||
|
|
||||||
|
*) pci_epf_destroy()
|
||||||
|
|
||||||
|
Destroy the created PCI EPF device.
|
||||||
|
|
||||||
|
*) pci_epf_bind()
|
||||||
|
|
||||||
|
pci_epf_bind() should be invoked when the EPF device has been bound to
|
||||||
|
an EPC device.
|
||||||
|
|
||||||
|
*) pci_epf_unbind()
|
||||||
|
|
||||||
|
pci_epf_unbind() should be invoked when the binding between EPC device
|
||||||
|
and EPF device is lost.
|
66
Documentation/PCI/endpoint/pci-test-function.txt
Normal file
66
Documentation/PCI/endpoint/pci-test-function.txt
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
PCI TEST
|
||||||
|
Kishon Vijay Abraham I <kishon@ti.com>
|
||||||
|
|
||||||
|
Traditionally PCI RC has always been validated by using standard
|
||||||
|
PCI cards like ethernet PCI cards or USB PCI cards or SATA PCI cards.
|
||||||
|
However with the addition of EP-core in linux kernel, it is possible
|
||||||
|
to configure a PCI controller that can operate in EP mode to work as
|
||||||
|
a test device.
|
||||||
|
|
||||||
|
The PCI endpoint test device is a virtual device (defined in software)
|
||||||
|
used to test the endpoint functionality and serve as a sample driver
|
||||||
|
for other PCI endpoint devices (to use the EP framework).
|
||||||
|
|
||||||
|
The PCI endpoint test device has the following registers:
|
||||||
|
|
||||||
|
1) PCI_ENDPOINT_TEST_MAGIC
|
||||||
|
2) PCI_ENDPOINT_TEST_COMMAND
|
||||||
|
3) PCI_ENDPOINT_TEST_STATUS
|
||||||
|
4) PCI_ENDPOINT_TEST_SRC_ADDR
|
||||||
|
5) PCI_ENDPOINT_TEST_DST_ADDR
|
||||||
|
6) PCI_ENDPOINT_TEST_SIZE
|
||||||
|
7) PCI_ENDPOINT_TEST_CHECKSUM
|
||||||
|
|
||||||
|
*) PCI_ENDPOINT_TEST_MAGIC
|
||||||
|
|
||||||
|
This register will be used to test BAR0. A known pattern will be written
|
||||||
|
and read back from MAGIC register to verify BAR0.
|
||||||
|
|
||||||
|
*) PCI_ENDPOINT_TEST_COMMAND:
|
||||||
|
|
||||||
|
This register will be used by the host driver to indicate the function
|
||||||
|
that the endpoint device must perform.
|
||||||
|
|
||||||
|
Bitfield Description:
|
||||||
|
Bit 0 : raise legacy IRQ
|
||||||
|
Bit 1 : raise MSI IRQ
|
||||||
|
Bit 2 - 7 : MSI interrupt number
|
||||||
|
Bit 8 : read command (read data from RC buffer)
|
||||||
|
Bit 9 : write command (write data to RC buffer)
|
||||||
|
Bit 10 : copy command (copy data from one RC buffer to another
|
||||||
|
RC buffer)
|
||||||
|
|
||||||
|
*) PCI_ENDPOINT_TEST_STATUS
|
||||||
|
|
||||||
|
This register reflects the status of the PCI endpoint device.
|
||||||
|
|
||||||
|
Bitfield Description:
|
||||||
|
Bit 0 : read success
|
||||||
|
Bit 1 : read fail
|
||||||
|
Bit 2 : write success
|
||||||
|
Bit 3 : write fail
|
||||||
|
Bit 4 : copy success
|
||||||
|
Bit 5 : copy fail
|
||||||
|
Bit 6 : IRQ raised
|
||||||
|
Bit 7 : source address is invalid
|
||||||
|
Bit 8 : destination address is invalid
|
||||||
|
|
||||||
|
*) PCI_ENDPOINT_TEST_SRC_ADDR
|
||||||
|
|
||||||
|
This register contains the source address (RC buffer address) for the
|
||||||
|
COPY/READ command.
|
||||||
|
|
||||||
|
*) PCI_ENDPOINT_TEST_DST_ADDR
|
||||||
|
|
||||||
|
This register contains the destination address (RC buffer address) for
|
||||||
|
the COPY/WRITE command.
|
179
Documentation/PCI/endpoint/pci-test-howto.txt
Normal file
179
Documentation/PCI/endpoint/pci-test-howto.txt
Normal file
@@ -0,0 +1,179 @@
|
|||||||
|
PCI TEST USERGUIDE
|
||||||
|
Kishon Vijay Abraham I <kishon@ti.com>
|
||||||
|
|
||||||
|
This document is a guide to help users use pci-epf-test function driver
|
||||||
|
and pci_endpoint_test host driver for testing PCI. The list of steps to
|
||||||
|
be followed in the host side and EP side is given below.
|
||||||
|
|
||||||
|
1. Endpoint Device
|
||||||
|
|
||||||
|
1.1 Endpoint Controller Devices
|
||||||
|
|
||||||
|
To find the list of endpoint controller devices in the system:
|
||||||
|
|
||||||
|
# ls /sys/class/pci_epc/
|
||||||
|
51000000.pcie_ep
|
||||||
|
|
||||||
|
If PCI_ENDPOINT_CONFIGFS is enabled
|
||||||
|
# ls /sys/kernel/config/pci_ep/controllers
|
||||||
|
51000000.pcie_ep
|
||||||
|
|
||||||
|
1.2 Endpoint Function Drivers
|
||||||
|
|
||||||
|
To find the list of endpoint function drivers in the system:
|
||||||
|
|
||||||
|
# ls /sys/bus/pci-epf/drivers
|
||||||
|
pci_epf_test
|
||||||
|
|
||||||
|
If PCI_ENDPOINT_CONFIGFS is enabled
|
||||||
|
# ls /sys/kernel/config/pci_ep/functions
|
||||||
|
pci_epf_test
|
||||||
|
|
||||||
|
1.3 Creating pci-epf-test Device
|
||||||
|
|
||||||
|
PCI endpoint function device can be created using the configfs. To create
|
||||||
|
pci-epf-test device, the following commands can be used
|
||||||
|
|
||||||
|
# mount -t configfs none /sys/kernel/config
|
||||||
|
# cd /sys/kernel/config/pci_ep/
|
||||||
|
# mkdir functions/pci_epf_test/func1
|
||||||
|
|
||||||
|
The "mkdir func1" above creates the pci-epf-test function device that will
|
||||||
|
be probed by pci_epf_test driver.
|
||||||
|
|
||||||
|
The PCI endpoint framework populates the directory with the following
|
||||||
|
configurable fields.
|
||||||
|
|
||||||
|
# ls functions/pci_epf_test/func1
|
||||||
|
baseclass_code interrupt_pin revid subsys_vendor_id
|
||||||
|
cache_line_size msi_interrupts subclass_code vendorid
|
||||||
|
deviceid progif_code subsys_id
|
||||||
|
|
||||||
|
The PCI endpoint function driver populates these entries with default values
|
||||||
|
when the device is bound to the driver. The pci-epf-test driver populates
|
||||||
|
vendorid with 0xffff and interrupt_pin with 0x0001
|
||||||
|
|
||||||
|
# cat functions/pci_epf_test/func1/vendorid
|
||||||
|
0xffff
|
||||||
|
# cat functions/pci_epf_test/func1/interrupt_pin
|
||||||
|
0x0001
|
||||||
|
|
||||||
|
1.4 Configuring pci-epf-test Device
|
||||||
|
|
||||||
|
The user can configure the pci-epf-test device using configfs entry. In order
|
||||||
|
to change the vendorid and the number of MSI interrupts used by the function
|
||||||
|
device, the following commands can be used.
|
||||||
|
|
||||||
|
# echo 0x104c > functions/pci_epf_test/func1/vendorid
|
||||||
|
# echo 0xb500 > functions/pci_epf_test/func1/deviceid
|
||||||
|
# echo 16 > functions/pci_epf_test/func1/msi_interrupts
|
||||||
|
|
||||||
|
1.5 Binding pci-epf-test Device to EP Controller
|
||||||
|
|
||||||
|
In order for the endpoint function device to be useful, it has to be bound to
|
||||||
|
a PCI endpoint controller driver. Use the configfs to bind the function
|
||||||
|
device to one of the controller drivers present in the system.
|
||||||
|
|
||||||
|
# ln -s functions/pci_epf_test/func1 controllers/51000000.pcie_ep/
|
||||||
|
|
||||||
|
Once the above step is completed, the PCI endpoint is ready to establish a link
|
||||||
|
with the host.
|
||||||
|
|
||||||
|
1.6 Start the Link
|
||||||
|
|
||||||
|
In order for the endpoint device to establish a link with the host, the _start_
|
||||||
|
field should be populated with '1'.
|
||||||
|
|
||||||
|
# echo 1 > controllers/51000000.pcie_ep/start
|
||||||
|
|
||||||
|
2. RootComplex Device
|
||||||
|
|
||||||
|
2.1 lspci Output
|
||||||
|
|
||||||
|
Note that the devices listed here correspond to the value populated in 1.4 above
|
||||||
|
|
||||||
|
00:00.0 PCI bridge: Texas Instruments Device 8888 (rev 01)
|
||||||
|
01:00.0 Unassigned class [ff00]: Texas Instruments Device b500
|
||||||
|
|
||||||
|
2.2 Using Endpoint Test function Device
|
||||||
|
|
||||||
|
pcitest.sh added in tools/pci/ can be used to run all the default PCI endpoint
|
||||||
|
tests. Before pcitest.sh can be used pcitest.c should be compiled using the
|
||||||
|
following commands.
|
||||||
|
|
||||||
|
cd <kernel-dir>
|
||||||
|
make headers_install ARCH=arm
|
||||||
|
arm-linux-gnueabihf-gcc -Iusr/include tools/pci/pcitest.c -o pcitest
|
||||||
|
cp pcitest <rootfs>/usr/sbin/
|
||||||
|
cp tools/pci/pcitest.sh <rootfs>
|
||||||
|
|
||||||
|
2.2.1 pcitest.sh Output
|
||||||
|
# ./pcitest.sh
|
||||||
|
BAR tests
|
||||||
|
|
||||||
|
BAR0: OKAY
|
||||||
|
BAR1: OKAY
|
||||||
|
BAR2: OKAY
|
||||||
|
BAR3: OKAY
|
||||||
|
BAR4: NOT OKAY
|
||||||
|
BAR5: NOT OKAY
|
||||||
|
|
||||||
|
Interrupt tests
|
||||||
|
|
||||||
|
LEGACY IRQ: NOT OKAY
|
||||||
|
MSI1: OKAY
|
||||||
|
MSI2: OKAY
|
||||||
|
MSI3: OKAY
|
||||||
|
MSI4: OKAY
|
||||||
|
MSI5: OKAY
|
||||||
|
MSI6: OKAY
|
||||||
|
MSI7: OKAY
|
||||||
|
MSI8: OKAY
|
||||||
|
MSI9: OKAY
|
||||||
|
MSI10: OKAY
|
||||||
|
MSI11: OKAY
|
||||||
|
MSI12: OKAY
|
||||||
|
MSI13: OKAY
|
||||||
|
MSI14: OKAY
|
||||||
|
MSI15: OKAY
|
||||||
|
MSI16: OKAY
|
||||||
|
MSI17: NOT OKAY
|
||||||
|
MSI18: NOT OKAY
|
||||||
|
MSI19: NOT OKAY
|
||||||
|
MSI20: NOT OKAY
|
||||||
|
MSI21: NOT OKAY
|
||||||
|
MSI22: NOT OKAY
|
||||||
|
MSI23: NOT OKAY
|
||||||
|
MSI24: NOT OKAY
|
||||||
|
MSI25: NOT OKAY
|
||||||
|
MSI26: NOT OKAY
|
||||||
|
MSI27: NOT OKAY
|
||||||
|
MSI28: NOT OKAY
|
||||||
|
MSI29: NOT OKAY
|
||||||
|
MSI30: NOT OKAY
|
||||||
|
MSI31: NOT OKAY
|
||||||
|
MSI32: NOT OKAY
|
||||||
|
|
||||||
|
Read Tests
|
||||||
|
|
||||||
|
READ ( 1 bytes): OKAY
|
||||||
|
READ ( 1024 bytes): OKAY
|
||||||
|
READ ( 1025 bytes): OKAY
|
||||||
|
READ (1024000 bytes): OKAY
|
||||||
|
READ (1024001 bytes): OKAY
|
||||||
|
|
||||||
|
Write Tests
|
||||||
|
|
||||||
|
WRITE ( 1 bytes): OKAY
|
||||||
|
WRITE ( 1024 bytes): OKAY
|
||||||
|
WRITE ( 1025 bytes): OKAY
|
||||||
|
WRITE (1024000 bytes): OKAY
|
||||||
|
WRITE (1024001 bytes): OKAY
|
||||||
|
|
||||||
|
Copy Tests
|
||||||
|
|
||||||
|
COPY ( 1 bytes): OKAY
|
||||||
|
COPY ( 1024 bytes): OKAY
|
||||||
|
COPY ( 1025 bytes): OKAY
|
||||||
|
COPY (1024000 bytes): OKAY
|
||||||
|
COPY (1024001 bytes): OKAY
|
@@ -11,7 +11,7 @@
|
|||||||
|
|
||||||
Many PCI bus controllers are able to detect a variety of hardware
|
Many PCI bus controllers are able to detect a variety of hardware
|
||||||
PCI errors on the bus, such as parity errors on the data and address
|
PCI errors on the bus, such as parity errors on the data and address
|
||||||
busses, as well as SERR and PERR errors. Some of the more advanced
|
buses, as well as SERR and PERR errors. Some of the more advanced
|
||||||
chipsets are able to deal with these errors; these include PCI-E chipsets,
|
chipsets are able to deal with these errors; these include PCI-E chipsets,
|
||||||
and the PCI-host bridges found on IBM Power4, Power5 and Power6-based
|
and the PCI-host bridges found on IBM Power4, Power5 and Power6-based
|
||||||
pSeries boxes. A typical action taken is to disconnect the affected device,
|
pSeries boxes. A typical action taken is to disconnect the affected device,
|
||||||
@@ -173,7 +173,7 @@ is STEP 6 (Permanent Failure).
|
|||||||
>>> a value of 0xff on read, and writes will be dropped. If more than
|
>>> a value of 0xff on read, and writes will be dropped. If more than
|
||||||
>>> EEH_MAX_FAILS I/O's are attempted to a frozen adapter, EEH
|
>>> EEH_MAX_FAILS I/O's are attempted to a frozen adapter, EEH
|
||||||
>>> assumes that the device driver has gone into an infinite loop
|
>>> assumes that the device driver has gone into an infinite loop
|
||||||
>>> and prints an error to syslog. A reboot is then required to
|
>>> and prints an error to syslog. A reboot is then required to
|
||||||
>>> get the device working again.
|
>>> get the device working again.
|
||||||
|
|
||||||
STEP 2: MMIO Enabled
|
STEP 2: MMIO Enabled
|
||||||
@@ -231,14 +231,14 @@ proceeds to STEP 4 (Slot Reset)
|
|||||||
STEP 3: Link Reset
|
STEP 3: Link Reset
|
||||||
------------------
|
------------------
|
||||||
The platform resets the link. This is a PCI-Express specific step
|
The platform resets the link. This is a PCI-Express specific step
|
||||||
and is done whenever a non-fatal error has been detected that can be
|
and is done whenever a fatal error has been detected that can be
|
||||||
"solved" by resetting the link.
|
"solved" by resetting the link.
|
||||||
|
|
||||||
STEP 4: Slot Reset
|
STEP 4: Slot Reset
|
||||||
------------------
|
------------------
|
||||||
|
|
||||||
In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
|
In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
|
||||||
the platform will perform a slot reset on the requesting PCI device(s).
|
the platform will perform a slot reset on the requesting PCI device(s).
|
||||||
The actual steps taken by a platform to perform a slot reset
|
The actual steps taken by a platform to perform a slot reset
|
||||||
will be platform-dependent. Upon completion of slot reset, the
|
will be platform-dependent. Upon completion of slot reset, the
|
||||||
platform will call the device slot_reset() callback.
|
platform will call the device slot_reset() callback.
|
||||||
@@ -258,7 +258,7 @@ configuration registers to initialize to their default conditions.
|
|||||||
|
|
||||||
For most PCI devices, a soft reset will be sufficient for recovery.
|
For most PCI devices, a soft reset will be sufficient for recovery.
|
||||||
Optional fundamental reset is provided to support a limited number
|
Optional fundamental reset is provided to support a limited number
|
||||||
of PCI Express PCI devices for which a soft reset is not sufficient
|
of PCI Express devices for which a soft reset is not sufficient
|
||||||
for recovery.
|
for recovery.
|
||||||
|
|
||||||
If the platform supports PCI hotplug, then the reset might be
|
If the platform supports PCI hotplug, then the reset might be
|
||||||
@@ -303,7 +303,7 @@ driver performs device init only from PCI function 0:
|
|||||||
Same as above.
|
Same as above.
|
||||||
|
|
||||||
Drivers for PCI Express cards that require a fundamental reset must
|
Drivers for PCI Express cards that require a fundamental reset must
|
||||||
set the needs_freset bit in the pci_dev structure in their probe function.
|
set the needs_freset bit in the pci_dev structure in their probe function.
|
||||||
For example, the QLogic qla2xxx driver sets the needs_freset bit for certain
|
For example, the QLogic qla2xxx driver sets the needs_freset bit for certain
|
||||||
PCI card types:
|
PCI card types:
|
||||||
|
|
||||||
|
@@ -68,6 +68,18 @@ To disable SR-IOV capability:
|
|||||||
echo 0 > \
|
echo 0 > \
|
||||||
/sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_numvfs
|
/sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_numvfs
|
||||||
|
|
||||||
|
To enable auto probing VFs by a compatible driver on the host, run
|
||||||
|
command below before enabling SR-IOV capabilities. This is the
|
||||||
|
default behavior.
|
||||||
|
echo 1 > \
|
||||||
|
/sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_drivers_autoprobe
|
||||||
|
|
||||||
|
To disable auto probing VFs by a compatible driver on the host, run
|
||||||
|
command below before enabling SR-IOV capabilities. Updating this
|
||||||
|
entry will not affect VFs which are already probed.
|
||||||
|
echo 0 > \
|
||||||
|
/sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_drivers_autoprobe
|
||||||
|
|
||||||
3.2 Usage example
|
3.2 Usage example
|
||||||
|
|
||||||
Following piece of code illustrates the usage of the SR-IOV API.
|
Following piece of code illustrates the usage of the SR-IOV API.
|
||||||
|
@@ -17,7 +17,7 @@ rcu_dereference.txt
|
|||||||
rcubarrier.txt
|
rcubarrier.txt
|
||||||
- RCU and Unloadable Modules
|
- RCU and Unloadable Modules
|
||||||
rculist_nulls.txt
|
rculist_nulls.txt
|
||||||
- RCU list primitives for use with SLAB_DESTROY_BY_RCU
|
- RCU list primitives for use with SLAB_TYPESAFE_BY_RCU
|
||||||
rcuref.txt
|
rcuref.txt
|
||||||
- Reference-count design for elements of lists/arrays protected by RCU
|
- Reference-count design for elements of lists/arrays protected by RCU
|
||||||
rcu.txt
|
rcu.txt
|
||||||
|
@@ -19,6 +19,8 @@ to each other.
|
|||||||
The <tt>rcu_state</tt> Structure</a>
|
The <tt>rcu_state</tt> Structure</a>
|
||||||
<li> <a href="#The rcu_node Structure">
|
<li> <a href="#The rcu_node Structure">
|
||||||
The <tt>rcu_node</tt> Structure</a>
|
The <tt>rcu_node</tt> Structure</a>
|
||||||
|
<li> <a href="#The rcu_segcblist Structure">
|
||||||
|
The <tt>rcu_segcblist</tt> Structure</a>
|
||||||
<li> <a href="#The rcu_data Structure">
|
<li> <a href="#The rcu_data Structure">
|
||||||
The <tt>rcu_data</tt> Structure</a>
|
The <tt>rcu_data</tt> Structure</a>
|
||||||
<li> <a href="#The rcu_dynticks Structure">
|
<li> <a href="#The rcu_dynticks Structure">
|
||||||
@@ -841,6 +843,134 @@ for lockdep lock-class names.
|
|||||||
Finally, lines 64-66 produce an error if the maximum number of
|
Finally, lines 64-66 produce an error if the maximum number of
|
||||||
CPUs is too large for the specified fanout.
|
CPUs is too large for the specified fanout.
|
||||||
|
|
||||||
|
<h3><a name="The rcu_segcblist Structure">
|
||||||
|
The <tt>rcu_segcblist</tt> Structure</a></h3>
|
||||||
|
|
||||||
|
The <tt>rcu_segcblist</tt> structure maintains a segmented list of
|
||||||
|
callbacks as follows:
|
||||||
|
|
||||||
|
<pre>
|
||||||
|
1 #define RCU_DONE_TAIL 0
|
||||||
|
2 #define RCU_WAIT_TAIL 1
|
||||||
|
3 #define RCU_NEXT_READY_TAIL 2
|
||||||
|
4 #define RCU_NEXT_TAIL 3
|
||||||
|
5 #define RCU_CBLIST_NSEGS 4
|
||||||
|
6
|
||||||
|
7 struct rcu_segcblist {
|
||||||
|
8 struct rcu_head *head;
|
||||||
|
9 struct rcu_head **tails[RCU_CBLIST_NSEGS];
|
||||||
|
10 unsigned long gp_seq[RCU_CBLIST_NSEGS];
|
||||||
|
11 long len;
|
||||||
|
12 long len_lazy;
|
||||||
|
13 };
|
||||||
|
</pre>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
The segments are as follows:
|
||||||
|
|
||||||
|
<ol>
|
||||||
|
<li> <tt>RCU_DONE_TAIL</tt>: Callbacks whose grace periods have elapsed.
|
||||||
|
These callbacks are ready to be invoked.
|
||||||
|
<li> <tt>RCU_WAIT_TAIL</tt>: Callbacks that are waiting for the
|
||||||
|
current grace period.
|
||||||
|
Note that different CPUs can have different ideas about which
|
||||||
|
grace period is current, hence the <tt>->gp_seq</tt> field.
|
||||||
|
<li> <tt>RCU_NEXT_READY_TAIL</tt>: Callbacks waiting for the next
|
||||||
|
grace period to start.
|
||||||
|
<li> <tt>RCU_NEXT_TAIL</tt>: Callbacks that have not yet been
|
||||||
|
associated with a grace period.
|
||||||
|
</ol>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
The <tt>->head</tt> pointer references the first callback or
|
||||||
|
is <tt>NULL</tt> if the list contains no callbacks (which is
|
||||||
|
<i>not</i> the same as being empty).
|
||||||
|
Each element of the <tt>->tails[]</tt> array references the
|
||||||
|
<tt>->next</tt> pointer of the last callback in the corresponding
|
||||||
|
segment of the list, or the list's <tt>->head</tt> pointer if
|
||||||
|
that segment and all previous segments are empty.
|
||||||
|
If the corresponding segment is empty but some previous segment is
|
||||||
|
not empty, then the array element is identical to its predecessor.
|
||||||
|
Older callbacks are closer to the head of the list, and new callbacks
|
||||||
|
are added at the tail.
|
||||||
|
This relationship between the <tt>->head</tt> pointer, the
|
||||||
|
<tt>->tails[]</tt> array, and the callbacks is shown in this
|
||||||
|
diagram:
|
||||||
|
|
||||||
|
</p><p><img src="nxtlist.svg" alt="nxtlist.svg" width="40%">
|
||||||
|
|
||||||
|
</p><p>In this figure, the <tt>->head</tt> pointer references the
|
||||||
|
first
|
||||||
|
RCU callback in the list.
|
||||||
|
The <tt>->tails[RCU_DONE_TAIL]</tt> array element references
|
||||||
|
the <tt>->head</tt> pointer itself, indicating that none
|
||||||
|
of the callbacks is ready to invoke.
|
||||||
|
The <tt>->tails[RCU_WAIT_TAIL]</tt> array element references callback
|
||||||
|
CB 2's <tt>->next</tt> pointer, which indicates that
|
||||||
|
CB 1 and CB 2 are both waiting on the current grace period,
|
||||||
|
give or take possible disagreements about exactly which grace period
|
||||||
|
is the current one.
|
||||||
|
The <tt>->tails[RCU_NEXT_READY_TAIL]</tt> array element
|
||||||
|
references the same RCU callback that <tt>->tails[RCU_WAIT_TAIL]</tt>
|
||||||
|
does, which indicates that there are no callbacks waiting on the next
|
||||||
|
RCU grace period.
|
||||||
|
The <tt>->tails[RCU_NEXT_TAIL]</tt> array element references
|
||||||
|
CB 4's <tt>->next</tt> pointer, indicating that all the
|
||||||
|
remaining RCU callbacks have not yet been assigned to an RCU grace
|
||||||
|
period.
|
||||||
|
Note that the <tt>->tails[RCU_NEXT_TAIL]</tt> array element
|
||||||
|
always references the last RCU callback's <tt>->next</tt> pointer
|
||||||
|
unless the callback list is empty, in which case it references
|
||||||
|
the <tt>->head</tt> pointer.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
There is one additional important special case for the
|
||||||
|
<tt>->tails[RCU_NEXT_TAIL]</tt> array element: It can be <tt>NULL</tt>
|
||||||
|
when this list is <i>disabled</i>.
|
||||||
|
Lists are disabled when the corresponding CPU is offline or when
|
||||||
|
the corresponding CPU's callbacks are offloaded to a kthread,
|
||||||
|
both of which are described elsewhere.
|
||||||
|
|
||||||
|
</p><p>CPUs advance their callbacks from the
|
||||||
|
<tt>RCU_NEXT_TAIL</tt> to the <tt>RCU_NEXT_READY_TAIL</tt> to the
|
||||||
|
<tt>RCU_WAIT_TAIL</tt> to the <tt>RCU_DONE_TAIL</tt> list segments
|
||||||
|
as grace periods advance.
|
||||||
|
|
||||||
|
</p><p>The <tt>->gp_seq[]</tt> array records grace-period
|
||||||
|
numbers corresponding to the list segments.
|
||||||
|
This is what allows different CPUs to have different ideas as to
|
||||||
|
which is the current grace period while still avoiding premature
|
||||||
|
invocation of their callbacks.
|
||||||
|
In particular, this allows CPUs that go idle for extended periods
|
||||||
|
to determine which of their callbacks are ready to be invoked after
|
||||||
|
reawakening.
|
||||||
|
|
||||||
|
</p><p>The <tt>->len</tt> counter contains the number of
|
||||||
|
callbacks in <tt>->head</tt>, and the
|
||||||
|
<tt>->len_lazy</tt> contains the number of those callbacks that
|
||||||
|
are known to only free memory, and whose invocation can therefore
|
||||||
|
be safely deferred.
|
||||||
|
|
||||||
|
<p><b>Important note</b>: It is the <tt>->len</tt> field that
|
||||||
|
determines whether or not there are callbacks associated with
|
||||||
|
this <tt>rcu_segcblist</tt> structure, <i>not</i> the <tt>->head</tt>
|
||||||
|
pointer.
|
||||||
|
The reason for this is that all the ready-to-invoke callbacks
|
||||||
|
(that is, those in the <tt>RCU_DONE_TAIL</tt> segment) are extracted
|
||||||
|
all at once at callback-invocation time.
|
||||||
|
If callback invocation must be postponed, for example, because a
|
||||||
|
high-priority process just woke up on this CPU, then the remaining
|
||||||
|
callbacks are placed back on the <tt>RCU_DONE_TAIL</tt> segment.
|
||||||
|
Either way, the <tt>->len</tt> and <tt>->len_lazy</tt> counts
|
||||||
|
are adjusted after the corresponding callbacks have been invoked, and so
|
||||||
|
again it is the <tt>->len</tt> count that accurately reflects whether
|
||||||
|
or not there are callbacks associated with this <tt>rcu_segcblist</tt>
|
||||||
|
structure.
|
||||||
|
Of course, off-CPU sampling of the <tt>->len</tt> count requires
|
||||||
|
the use of appropriate synchronization, for example, memory barriers.
|
||||||
|
This synchronization can be a bit subtle, particularly in the case
|
||||||
|
of <tt>rcu_barrier()</tt>.
|
||||||
|
|
||||||
<h3><a name="The rcu_data Structure">
|
<h3><a name="The rcu_data Structure">
|
||||||
The <tt>rcu_data</tt> Structure</a></h3>
|
The <tt>rcu_data</tt> Structure</a></h3>
|
||||||
|
|
||||||
@@ -983,62 +1113,18 @@ choice.
|
|||||||
as follows:
|
as follows:
|
||||||
|
|
||||||
<pre>
|
<pre>
|
||||||
1 struct rcu_head *nxtlist;
|
1 struct rcu_segcblist cblist;
|
||||||
2 struct rcu_head **nxttail[RCU_NEXT_SIZE];
|
2 long qlen_last_fqs_check;
|
||||||
3 unsigned long nxtcompleted[RCU_NEXT_SIZE];
|
3 unsigned long n_cbs_invoked;
|
||||||
4 long qlen_lazy;
|
4 unsigned long n_nocbs_invoked;
|
||||||
5 long qlen;
|
5 unsigned long n_cbs_orphaned;
|
||||||
6 long qlen_last_fqs_check;
|
6 unsigned long n_cbs_adopted;
|
||||||
7 unsigned long n_force_qs_snap;
|
7 unsigned long n_force_qs_snap;
|
||||||
8 unsigned long n_cbs_invoked;
|
8 long blimit;
|
||||||
9 unsigned long n_cbs_orphaned;
|
|
||||||
10 unsigned long n_cbs_adopted;
|
|
||||||
11 long blimit;
|
|
||||||
</pre>
|
</pre>
|
||||||
|
|
||||||
<p>The <tt>->nxtlist</tt> pointer and the
|
<p>The <tt>->cblist</tt> structure is the segmented callback list
|
||||||
<tt>->nxttail[]</tt> array form a four-segment list with
|
described earlier.
|
||||||
older callbacks near the head and newer ones near the tail.
|
|
||||||
Each segment contains callbacks with the corresponding relationship
|
|
||||||
to the current grace period.
|
|
||||||
The pointer out of the end of each of the four segments is referenced
|
|
||||||
by the element of the <tt>->nxttail[]</tt> array indexed by
|
|
||||||
<tt>RCU_DONE_TAIL</tt> (for callbacks handled by a prior grace period),
|
|
||||||
<tt>RCU_WAIT_TAIL</tt> (for callbacks waiting on the current grace period),
|
|
||||||
<tt>RCU_NEXT_READY_TAIL</tt> (for callbacks that will wait on the next
|
|
||||||
grace period), and
|
|
||||||
<tt>RCU_NEXT_TAIL</tt> (for callbacks that are not yet associated
|
|
||||||
with a specific grace period)
|
|
||||||
respectively, as shown in the following figure.
|
|
||||||
|
|
||||||
</p><p><img src="nxtlist.svg" alt="nxtlist.svg" width="40%">
|
|
||||||
|
|
||||||
</p><p>In this figure, the <tt>->nxtlist</tt> pointer references the
|
|
||||||
first
|
|
||||||
RCU callback in the list.
|
|
||||||
The <tt>->nxttail[RCU_DONE_TAIL]</tt> array element references
|
|
||||||
the <tt>->nxtlist</tt> pointer itself, indicating that none
|
|
||||||
of the callbacks is ready to invoke.
|
|
||||||
The <tt>->nxttail[RCU_WAIT_TAIL]</tt> array element references callback
|
|
||||||
CB 2's <tt>->next</tt> pointer, which indicates that
|
|
||||||
CB 1 and CB 2 are both waiting on the current grace period.
|
|
||||||
The <tt>->nxttail[RCU_NEXT_READY_TAIL]</tt> array element
|
|
||||||
references the same RCU callback that <tt>->nxttail[RCU_WAIT_TAIL]</tt>
|
|
||||||
does, which indicates that there are no callbacks waiting on the next
|
|
||||||
RCU grace period.
|
|
||||||
The <tt>->nxttail[RCU_NEXT_TAIL]</tt> array element references
|
|
||||||
CB 4's <tt>->next</tt> pointer, indicating that all the
|
|
||||||
remaining RCU callbacks have not yet been assigned to an RCU grace
|
|
||||||
period.
|
|
||||||
Note that the <tt>->nxttail[RCU_NEXT_TAIL]</tt> array element
|
|
||||||
always references the last RCU callback's <tt>->next</tt> pointer
|
|
||||||
unless the callback list is empty, in which case it references
|
|
||||||
the <tt>->nxtlist</tt> pointer.
|
|
||||||
|
|
||||||
</p><p>CPUs advance their callbacks from the
|
|
||||||
<tt>RCU_NEXT_TAIL</tt> to the <tt>RCU_NEXT_READY_TAIL</tt> to the
|
|
||||||
<tt>RCU_WAIT_TAIL</tt> to the <tt>RCU_DONE_TAIL</tt> list segments
|
|
||||||
as grace periods advance.
|
|
||||||
The CPU advances the callbacks in its <tt>rcu_data</tt> structure
|
The CPU advances the callbacks in its <tt>rcu_data</tt> structure
|
||||||
whenever it notices that another RCU grace period has completed.
|
whenever it notices that another RCU grace period has completed.
|
||||||
The CPU detects the completion of an RCU grace period by noticing
|
The CPU detects the completion of an RCU grace period by noticing
|
||||||
@@ -1049,16 +1135,7 @@ Recall that each <tt>rcu_node</tt> structure's
|
|||||||
<tt>->completed</tt> field is updated at the end of each
|
<tt>->completed</tt> field is updated at the end of each
|
||||||
grace period.
|
grace period.
|
||||||
|
|
||||||
</p><p>The <tt>->nxtcompleted[]</tt> array records grace-period
|
<p>
|
||||||
numbers corresponding to the list segments.
|
|
||||||
This allows CPUs that go idle for extended periods to determine
|
|
||||||
which of their callbacks are ready to be invoked after reawakening.
|
|
||||||
|
|
||||||
</p><p>The <tt>->qlen</tt> counter contains the number of
|
|
||||||
callbacks in <tt>->nxtlist</tt>, and the
|
|
||||||
<tt>->qlen_lazy</tt> contains the number of those callbacks that
|
|
||||||
are known to only free memory, and whose invocation can therefore
|
|
||||||
be safely deferred.
|
|
||||||
The <tt>->qlen_last_fqs_check</tt> and
|
The <tt>->qlen_last_fqs_check</tt> and
|
||||||
<tt>->n_force_qs_snap</tt> coordinate the forcing of quiescent
|
<tt>->n_force_qs_snap</tt> coordinate the forcing of quiescent
|
||||||
states from <tt>call_rcu()</tt> and friends when callback
|
states from <tt>call_rcu()</tt> and friends when callback
|
||||||
@@ -1069,6 +1146,10 @@ lists grow excessively long.
|
|||||||
fields count the number of callbacks invoked,
|
fields count the number of callbacks invoked,
|
||||||
sent to other CPUs when this CPU goes offline,
|
sent to other CPUs when this CPU goes offline,
|
||||||
and received from other CPUs when those other CPUs go offline.
|
and received from other CPUs when those other CPUs go offline.
|
||||||
|
The <tt>->n_nocbs_invoked</tt> is used when the CPU's callbacks
|
||||||
|
are offloaded to a kthread.
|
||||||
|
|
||||||
|
<p>
|
||||||
Finally, the <tt>->blimit</tt> counter is the maximum number of
|
Finally, the <tt>->blimit</tt> counter is the maximum number of
|
||||||
RCU callbacks that may be invoked at a given time.
|
RCU callbacks that may be invoked at a given time.
|
||||||
|
|
||||||
@@ -1104,6 +1185,9 @@ Its fields are as follows:
|
|||||||
1 int dynticks_nesting;
|
1 int dynticks_nesting;
|
||||||
2 int dynticks_nmi_nesting;
|
2 int dynticks_nmi_nesting;
|
||||||
3 atomic_t dynticks;
|
3 atomic_t dynticks;
|
||||||
|
4 bool rcu_need_heavy_qs;
|
||||||
|
5 unsigned long rcu_qs_ctr;
|
||||||
|
6 bool rcu_urgent_qs;
|
||||||
</pre>
|
</pre>
|
||||||
|
|
||||||
<p>The <tt>->dynticks_nesting</tt> field counts the
|
<p>The <tt>->dynticks_nesting</tt> field counts the
|
||||||
@@ -1117,11 +1201,32 @@ NMIs are counted by the <tt>->dynticks_nmi_nesting</tt>
|
|||||||
field, except that NMIs that interrupt non-dyntick-idle execution
|
field, except that NMIs that interrupt non-dyntick-idle execution
|
||||||
are not counted.
|
are not counted.
|
||||||
|
|
||||||
</p><p>Finally, the <tt>->dynticks</tt> field counts the corresponding
|
</p><p>The <tt>->dynticks</tt> field counts the corresponding
|
||||||
CPU's transitions to and from dyntick-idle mode, so that this counter
|
CPU's transitions to and from dyntick-idle mode, so that this counter
|
||||||
has an even value when the CPU is in dyntick-idle mode and an odd
|
has an even value when the CPU is in dyntick-idle mode and an odd
|
||||||
value otherwise.
|
value otherwise.
|
||||||
|
|
||||||
|
</p><p>The <tt>->rcu_need_heavy_qs</tt> field is used
|
||||||
|
to record the fact that the RCU core code would really like to
|
||||||
|
see a quiescent state from the corresponding CPU, so much so that
|
||||||
|
it is willing to call for heavy-weight dyntick-counter operations.
|
||||||
|
This flag is checked by RCU's context-switch and <tt>cond_resched()</tt>
|
||||||
|
code, which provide a momentary idle sojourn in response.
|
||||||
|
|
||||||
|
</p><p>The <tt>->rcu_qs_ctr</tt> field is used to record
|
||||||
|
quiescent states from <tt>cond_resched()</tt>.
|
||||||
|
Because <tt>cond_resched()</tt> can execute quite frequently, this
|
||||||
|
must be quite lightweight, as in a non-atomic increment of this
|
||||||
|
per-CPU field.
|
||||||
|
|
||||||
|
</p><p>Finally, the <tt>->rcu_urgent_qs</tt> field is used to record
|
||||||
|
the fact that the RCU core code would really like to see a quiescent
|
||||||
|
state from the corresponding CPU, with the various other fields indicating
|
||||||
|
just how badly RCU wants this quiescent state.
|
||||||
|
This flag is checked by RCU's context-switch and <tt>cond_resched()</tt>
|
||||||
|
code, which, if nothing else, non-atomically increment <tt>->rcu_qs_ctr</tt>
|
||||||
|
in response.
|
||||||
|
|
||||||
<table>
|
<table>
|
||||||
<tr><th> </th></tr>
|
<tr><th> </th></tr>
|
||||||
<tr><th align="left">Quick Quiz:</th></tr>
|
<tr><th align="left">Quick Quiz:</th></tr>
|
||||||
|
@@ -19,7 +19,7 @@
|
|||||||
id="svg2"
|
id="svg2"
|
||||||
version="1.1"
|
version="1.1"
|
||||||
inkscape:version="0.48.4 r9939"
|
inkscape:version="0.48.4 r9939"
|
||||||
sodipodi:docname="nxtlist.fig">
|
sodipodi:docname="segcblist.svg">
|
||||||
<metadata
|
<metadata
|
||||||
id="metadata94">
|
id="metadata94">
|
||||||
<rdf:RDF>
|
<rdf:RDF>
|
||||||
@@ -28,7 +28,7 @@
|
|||||||
<dc:format>image/svg+xml</dc:format>
|
<dc:format>image/svg+xml</dc:format>
|
||||||
<dc:type
|
<dc:type
|
||||||
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
||||||
<dc:title></dc:title>
|
<dc:title />
|
||||||
</cc:Work>
|
</cc:Work>
|
||||||
</rdf:RDF>
|
</rdf:RDF>
|
||||||
</metadata>
|
</metadata>
|
||||||
@@ -241,61 +241,51 @@
|
|||||||
xml:space="preserve"
|
xml:space="preserve"
|
||||||
x="225"
|
x="225"
|
||||||
y="675"
|
y="675"
|
||||||
fill="#000000"
|
|
||||||
font-family="Courier"
|
|
||||||
font-style="normal"
|
font-style="normal"
|
||||||
font-weight="bold"
|
font-weight="bold"
|
||||||
font-size="324"
|
font-size="324"
|
||||||
text-anchor="start"
|
id="text64"
|
||||||
id="text64">nxtlist</text>
|
style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->head</text>
|
||||||
<!-- Text -->
|
<!-- Text -->
|
||||||
<text
|
<text
|
||||||
xml:space="preserve"
|
xml:space="preserve"
|
||||||
x="225"
|
x="225"
|
||||||
y="1800"
|
y="1800"
|
||||||
fill="#000000"
|
|
||||||
font-family="Courier"
|
|
||||||
font-style="normal"
|
font-style="normal"
|
||||||
font-weight="bold"
|
font-weight="bold"
|
||||||
font-size="324"
|
font-size="324"
|
||||||
text-anchor="start"
|
id="text66"
|
||||||
id="text66">nxttail[RCU_DONE_TAIL]</text>
|
style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->tails[RCU_DONE_TAIL]</text>
|
||||||
<!-- Text -->
|
<!-- Text -->
|
||||||
<text
|
<text
|
||||||
xml:space="preserve"
|
xml:space="preserve"
|
||||||
x="225"
|
x="225"
|
||||||
y="2925"
|
y="2925"
|
||||||
fill="#000000"
|
|
||||||
font-family="Courier"
|
|
||||||
font-style="normal"
|
font-style="normal"
|
||||||
font-weight="bold"
|
font-weight="bold"
|
||||||
font-size="324"
|
font-size="324"
|
||||||
text-anchor="start"
|
id="text68"
|
||||||
id="text68">nxttail[RCU_WAIT_TAIL]</text>
|
style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->tails[RCU_WAIT_TAIL]</text>
|
||||||
<!-- Text -->
|
<!-- Text -->
|
||||||
<text
|
<text
|
||||||
xml:space="preserve"
|
xml:space="preserve"
|
||||||
x="225"
|
x="225"
|
||||||
y="4050"
|
y="4050"
|
||||||
fill="#000000"
|
|
||||||
font-family="Courier"
|
|
||||||
font-style="normal"
|
font-style="normal"
|
||||||
font-weight="bold"
|
font-weight="bold"
|
||||||
font-size="324"
|
font-size="324"
|
||||||
text-anchor="start"
|
id="text70"
|
||||||
id="text70">nxttail[RCU_NEXT_READY_TAIL]</text>
|
style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->tails[RCU_NEXT_READY_TAIL]</text>
|
||||||
<!-- Text -->
|
<!-- Text -->
|
||||||
<text
|
<text
|
||||||
xml:space="preserve"
|
xml:space="preserve"
|
||||||
x="225"
|
x="225"
|
||||||
y="5175"
|
y="5175"
|
||||||
fill="#000000"
|
|
||||||
font-family="Courier"
|
|
||||||
font-style="normal"
|
font-style="normal"
|
||||||
font-weight="bold"
|
font-weight="bold"
|
||||||
font-size="324"
|
font-size="324"
|
||||||
text-anchor="start"
|
id="text72"
|
||||||
id="text72">nxttail[RCU_NEXT_TAIL]</text>
|
style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->tails[RCU_NEXT_TAIL]</text>
|
||||||
<!-- Text -->
|
<!-- Text -->
|
||||||
<text
|
<text
|
||||||
xml:space="preserve"
|
xml:space="preserve"
|
||||||
|
Before Width: | Height: | Size: 11 KiB After Width: | Height: | Size: 11 KiB |
@@ -284,6 +284,7 @@ Expedited Grace Period Refinements</a></h2>
|
|||||||
Funnel locking and wait/wakeup</a>.
|
Funnel locking and wait/wakeup</a>.
|
||||||
<li> <a href="#Use of Workqueues">Use of Workqueues</a>.
|
<li> <a href="#Use of Workqueues">Use of Workqueues</a>.
|
||||||
<li> <a href="#Stall Warnings">Stall warnings</a>.
|
<li> <a href="#Stall Warnings">Stall warnings</a>.
|
||||||
|
<li> <a href="#Mid-Boot Operation">Mid-boot operation</a>.
|
||||||
</ol>
|
</ol>
|
||||||
|
|
||||||
<h3><a name="Idle-CPU Checks">Idle-CPU Checks</a></h3>
|
<h3><a name="Idle-CPU Checks">Idle-CPU Checks</a></h3>
|
||||||
@@ -524,7 +525,7 @@ their grace periods and carrying out their wakeups.
|
|||||||
In earlier implementations, the task requesting the expedited
|
In earlier implementations, the task requesting the expedited
|
||||||
grace period also drove it to completion.
|
grace period also drove it to completion.
|
||||||
This straightforward approach had the disadvantage of needing to
|
This straightforward approach had the disadvantage of needing to
|
||||||
account for signals sent to user tasks,
|
account for POSIX signals sent to user tasks,
|
||||||
so more recent implementations use the Linux kernel's
|
so more recent implementations use the Linux kernel's
|
||||||
<a href="https://www.kernel.org/doc/Documentation/workqueue.txt">workqueues</a>.
|
<a href="https://www.kernel.org/doc/Documentation/workqueue.txt">workqueues</a>.
|
||||||
|
|
||||||
@@ -533,8 +534,8 @@ The requesting task still does counter snapshotting and funnel-lock
|
|||||||
processing, but the task reaching the top of the funnel lock
|
processing, but the task reaching the top of the funnel lock
|
||||||
does a <tt>schedule_work()</tt> (from <tt>_synchronize_rcu_expedited()</tt>)
|
does a <tt>schedule_work()</tt> (from <tt>_synchronize_rcu_expedited()</tt>)
|
||||||
so that a workqueue kthread does the actual grace-period processing.
|
so that a workqueue kthread does the actual grace-period processing.
|
||||||
Because workqueue kthreads do not accept signals, grace-period-wait
|
Because workqueue kthreads do not accept POSIX signals, grace-period-wait
|
||||||
processing need not allow for signals.
|
processing need not allow for POSIX signals.
|
||||||
|
|
||||||
In addition, this approach allows wakeups for the previous expedited
|
In addition, this approach allows wakeups for the previous expedited
|
||||||
grace period to be overlapped with processing for the next expedited
|
grace period to be overlapped with processing for the next expedited
|
||||||
@@ -586,6 +587,46 @@ blocking the current grace period are printed.
|
|||||||
Each stall warning results in another pass through the loop, but the
|
Each stall warning results in another pass through the loop, but the
|
||||||
second and subsequent passes use longer stall times.
|
second and subsequent passes use longer stall times.
|
||||||
|
|
||||||
|
<h3><a name="Mid-Boot Operation">Mid-boot operation</a></h3>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
The use of workqueues has the advantage that the expedited
|
||||||
|
grace-period code need not worry about POSIX signals.
|
||||||
|
Unfortunately, it has the
|
||||||
|
corresponding disadvantage that workqueues cannot be used until
|
||||||
|
they are initialized, which does not happen until some time after
|
||||||
|
the scheduler spawns the first task.
|
||||||
|
Given that there are parts of the kernel that really do want to
|
||||||
|
execute grace periods during this mid-boot “dead zone”,
|
||||||
|
expedited grace periods must do something else during this time.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
What they do is to fall back to the old practice of requiring that the
|
||||||
|
requesting task drive the expedited grace period, as was the case
|
||||||
|
before the use of workqueues.
|
||||||
|
However, the requesting task is only required to drive the grace period
|
||||||
|
during the mid-boot dead zone.
|
||||||
|
Before mid-boot, a synchronous grace period is a no-op.
|
||||||
|
Some time after mid-boot, workqueues are used.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Non-expedited non-SRCU synchronous grace periods must also operate
|
||||||
|
normally during mid-boot.
|
||||||
|
This is handled by causing non-expedited grace periods to take the
|
||||||
|
expedited code path during mid-boot.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
The current code assumes that there are no POSIX signals during
|
||||||
|
the mid-boot dead zone.
|
||||||
|
However, if an overwhelming need for POSIX signals somehow arises,
|
||||||
|
appropriate adjustments can be made to the expedited stall-warning code.
|
||||||
|
One such adjustment would reinstate the pre-workqueue stall-warning
|
||||||
|
checks, but only during the mid-boot dead zone.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
With this refinement, synchronous grace periods can now be used from
|
||||||
|
task context pretty much any time during the life of the kernel.
|
||||||
|
|
||||||
<h3><a name="Summary">
|
<h3><a name="Summary">
|
||||||
Summary</a></h3>
|
Summary</a></h3>
|
||||||
|
|
||||||
|
@@ -659,8 +659,9 @@ systems with more than one CPU:
|
|||||||
In other words, a given instance of <tt>synchronize_rcu()</tt>
|
In other words, a given instance of <tt>synchronize_rcu()</tt>
|
||||||
can avoid waiting on a given RCU read-side critical section only
|
can avoid waiting on a given RCU read-side critical section only
|
||||||
if it can prove that <tt>synchronize_rcu()</tt> started first.
|
if it can prove that <tt>synchronize_rcu()</tt> started first.
|
||||||
|
</font>
|
||||||
|
|
||||||
<p>
|
<p><font color="ffffff">
|
||||||
A related question is “When <tt>rcu_read_lock()</tt>
|
A related question is “When <tt>rcu_read_lock()</tt>
|
||||||
doesn't generate any code, why does it matter how it relates
|
doesn't generate any code, why does it matter how it relates
|
||||||
to a grace period?”
|
to a grace period?”
|
||||||
@@ -675,8 +676,9 @@ systems with more than one CPU:
|
|||||||
within the critical section, in which case none of the accesses
|
within the critical section, in which case none of the accesses
|
||||||
within the critical section may observe the effects of any
|
within the critical section may observe the effects of any
|
||||||
access following the grace period.
|
access following the grace period.
|
||||||
|
</font>
|
||||||
|
|
||||||
<p>
|
<p><font color="ffffff">
|
||||||
As of late 2016, mathematical models of RCU take this
|
As of late 2016, mathematical models of RCU take this
|
||||||
viewpoint, for example, see slides 62 and 63
|
viewpoint, for example, see slides 62 and 63
|
||||||
of the
|
of the
|
||||||
@@ -1616,8 +1618,8 @@ CPUs should at least make reasonable forward progress.
|
|||||||
In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt>
|
In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt>
|
||||||
is permitted to impose modest degradation of real-time latency
|
is permitted to impose modest degradation of real-time latency
|
||||||
on non-idle online CPUs.
|
on non-idle online CPUs.
|
||||||
That said, it will likely be necessary to take further steps to reduce this
|
Here, “modest” means roughly the same latency
|
||||||
degradation, hopefully to roughly that of a scheduling-clock interrupt.
|
degradation as a scheduling-clock interrupt.
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
There are a number of situations where even
|
There are a number of situations where even
|
||||||
@@ -1913,12 +1915,9 @@ This requirement is another factor driving batching of grace periods,
|
|||||||
but it is also the driving force behind the checks for large numbers
|
but it is also the driving force behind the checks for large numbers
|
||||||
of queued RCU callbacks in the <tt>call_rcu()</tt> code path.
|
of queued RCU callbacks in the <tt>call_rcu()</tt> code path.
|
||||||
Finally, high update rates should not delay RCU read-side critical
|
Finally, high update rates should not delay RCU read-side critical
|
||||||
sections, although some read-side delays can occur when using
|
sections, although some small read-side delays can occur when using
|
||||||
<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use
|
<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use
|
||||||
of <tt>try_stop_cpus()</tt>.
|
of <tt>smp_call_function_single()</tt>.
|
||||||
(In the future, <tt>synchronize_rcu_expedited()</tt> will be
|
|
||||||
converted to use lighter-weight inter-processor interrupts (IPIs),
|
|
||||||
but this will still disturb readers, though to a much smaller degree.)
|
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
Although all three of these corner cases were understood in the early
|
Although all three of these corner cases were understood in the early
|
||||||
@@ -2154,7 +2153,8 @@ as will <tt>rcu_assign_pointer()</tt>.
|
|||||||
<p>
|
<p>
|
||||||
Although <tt>call_rcu()</tt> may be invoked at any
|
Although <tt>call_rcu()</tt> may be invoked at any
|
||||||
time during boot, callbacks are not guaranteed to be invoked until after
|
time during boot, callbacks are not guaranteed to be invoked until after
|
||||||
the scheduler is fully up and running.
|
all of RCU's kthreads have been spawned, which occurs at
|
||||||
|
<tt>early_initcall()</tt> time.
|
||||||
This delay in callback invocation is due to the fact that RCU does not
|
This delay in callback invocation is due to the fact that RCU does not
|
||||||
invoke callbacks until it is fully initialized, and this full initialization
|
invoke callbacks until it is fully initialized, and this full initialization
|
||||||
cannot occur until after the scheduler has initialized itself to the
|
cannot occur until after the scheduler has initialized itself to the
|
||||||
@@ -2167,8 +2167,10 @@ on what operations those callbacks could invoke.
|
|||||||
Perhaps surprisingly, <tt>synchronize_rcu()</tt>,
|
Perhaps surprisingly, <tt>synchronize_rcu()</tt>,
|
||||||
<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a>
|
<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a>
|
||||||
(<a href="#Bottom-Half Flavor">discussed below</a>),
|
(<a href="#Bottom-Half Flavor">discussed below</a>),
|
||||||
and
|
<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>,
|
||||||
<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>
|
<tt>synchronize_rcu_expedited()</tt>,
|
||||||
|
<tt>synchronize_rcu_bh_expedited()</tt>, and
|
||||||
|
<tt>synchronize_sched_expedited()</tt>
|
||||||
will all operate normally
|
will all operate normally
|
||||||
during very early boot, the reason being that there is only one CPU
|
during very early boot, the reason being that there is only one CPU
|
||||||
and preemption is disabled.
|
and preemption is disabled.
|
||||||
@@ -2178,45 +2180,59 @@ state and thus a grace period, so the early-boot implementation can
|
|||||||
be a no-op.
|
be a no-op.
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt>
|
However, once the scheduler has spawned its first kthread, this early
|
||||||
continue to operate normally through the remainder of boot, courtesy
|
boot trick fails for <tt>synchronize_rcu()</tt> (as well as for
|
||||||
of the fact that preemption is disabled across their RCU read-side
|
<tt>synchronize_rcu_expedited()</tt>) in <tt>CONFIG_PREEMPT=y</tt>
|
||||||
critical sections and also courtesy of the fact that there is still
|
kernels.
|
||||||
only one CPU.
|
The reason is that an RCU read-side critical section might be preempted,
|
||||||
However, once the scheduler starts initializing, preemption is enabled.
|
which means that a subsequent <tt>synchronize_rcu()</tt> really does have
|
||||||
There is still only a single CPU, but the fact that preemption is enabled
|
to wait for something, as opposed to simply returning immediately.
|
||||||
means that the no-op implementation of <tt>synchronize_rcu()</tt> no
|
Unfortunately, <tt>synchronize_rcu()</tt> can't do this until all of
|
||||||
longer works in <tt>CONFIG_PREEMPT=y</tt> kernels.
|
its kthreads are spawned, which doesn't happen until some time during
|
||||||
Therefore, as soon as the scheduler starts initializing, the early-boot
|
<tt>early_initcall()</tt> time.
|
||||||
fastpath is disabled.
|
But this is no excuse: RCU is nevertheless required to correctly handle
|
||||||
This means that <tt>synchronize_rcu()</tt> switches to its runtime
|
synchronous grace periods during this time period.
|
||||||
mode of operation where it posts callbacks, which in turn means that
|
Once all of its kthreads are up and running, RCU starts running
|
||||||
any call to <tt>synchronize_rcu()</tt> will block until the corresponding
|
normally.
|
||||||
callback is invoked.
|
|
||||||
Unfortunately, the callback cannot be invoked until RCU's runtime
|
|
||||||
grace-period machinery is up and running, which cannot happen until
|
|
||||||
the scheduler has initialized itself sufficiently to allow RCU's
|
|
||||||
kthreads to be spawned.
|
|
||||||
Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler
|
|
||||||
initialization can result in deadlock.
|
|
||||||
|
|
||||||
<table>
|
<table>
|
||||||
<tr><th> </th></tr>
|
<tr><th> </th></tr>
|
||||||
<tr><th align="left">Quick Quiz:</th></tr>
|
<tr><th align="left">Quick Quiz:</th></tr>
|
||||||
<tr><td>
|
<tr><td>
|
||||||
So what happens with <tt>synchronize_rcu()</tt> during
|
How can RCU possibly handle grace periods before all of its
|
||||||
scheduler initialization for <tt>CONFIG_PREEMPT=n</tt>
|
kthreads have been spawned???
|
||||||
kernels?
|
|
||||||
</td></tr>
|
</td></tr>
|
||||||
<tr><th align="left">Answer:</th></tr>
|
<tr><th align="left">Answer:</th></tr>
|
||||||
<tr><td bgcolor="#ffffff"><font color="ffffff">
|
<tr><td bgcolor="#ffffff"><font color="ffffff">
|
||||||
In <tt>CONFIG_PREEMPT=n</tt> kernel, <tt>synchronize_rcu()</tt>
|
Very carefully!
|
||||||
maps directly to <tt>synchronize_sched()</tt>.
|
</font>
|
||||||
Therefore, <tt>synchronize_rcu()</tt> works normally throughout
|
|
||||||
boot in <tt>CONFIG_PREEMPT=n</tt> kernels.
|
<p><font color="ffffff">
|
||||||
However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels,
|
During the “dead zone” between the time that the
|
||||||
so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt>
|
scheduler spawns the first task and the time that all of RCU's
|
||||||
during scheduler initialization.
|
kthreads have been spawned, all synchronous grace periods are
|
||||||
|
handled by the expedited grace-period mechanism.
|
||||||
|
At runtime, this expedited mechanism relies on workqueues, but
|
||||||
|
during the dead zone the requesting task itself drives the
|
||||||
|
desired expedited grace period.
|
||||||
|
Because dead-zone execution takes place within task context,
|
||||||
|
everything works.
|
||||||
|
Once the dead zone ends, expedited grace periods go back to
|
||||||
|
using workqueues, as is required to avoid problems that would
|
||||||
|
otherwise occur when a user task received a POSIX signal while
|
||||||
|
driving an expedited grace period.
|
||||||
|
</font>
|
||||||
|
|
||||||
|
<p><font color="ffffff">
|
||||||
|
And yes, this does mean that it is unhelpful to send POSIX
|
||||||
|
signals to random tasks between the time that the scheduler
|
||||||
|
spawns its first kthread and the time that RCU's kthreads
|
||||||
|
have all been spawned.
|
||||||
|
If there ever turns out to be a good reason for sending POSIX
|
||||||
|
signals during that time, appropriate adjustments will be made.
|
||||||
|
(If it turns out that POSIX signals are sent during this time for
|
||||||
|
no good reason, other adjustments will be made, appropriate
|
||||||
|
or otherwise.)
|
||||||
</font></td></tr>
|
</font></td></tr>
|
||||||
<tr><td> </td></tr>
|
<tr><td> </td></tr>
|
||||||
</table>
|
</table>
|
||||||
@@ -2295,12 +2311,61 @@ situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU.
|
|||||||
The need for <tt>rcu_barrier()</tt> for module unloading became
|
The need for <tt>rcu_barrier()</tt> for module unloading became
|
||||||
apparent later.
|
apparent later.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
<b>Important note</b>: The <tt>rcu_barrier()</tt> function is not,
|
||||||
|
repeat, <i>not</i>, obligated to wait for a grace period.
|
||||||
|
It is instead only required to wait for RCU callbacks that have
|
||||||
|
already been posted.
|
||||||
|
Therefore, if there are no RCU callbacks posted anywhere in the system,
|
||||||
|
<tt>rcu_barrier()</tt> is within its rights to return immediately.
|
||||||
|
Even if there are callbacks posted, <tt>rcu_barrier()</tt> does not
|
||||||
|
necessarily need to wait for a grace period.
|
||||||
|
|
||||||
|
<table>
|
||||||
|
<tr><th> </th></tr>
|
||||||
|
<tr><th align="left">Quick Quiz:</th></tr>
|
||||||
|
<tr><td>
|
||||||
|
Wait a minute!
|
||||||
|
Each RCU callback must wait for a grace period to complete,
|
||||||
|
and <tt>rcu_barrier()</tt> must wait for each pre-existing
|
||||||
|
callback to be invoked.
|
||||||
|
Doesn't <tt>rcu_barrier()</tt> therefore need to wait for
|
||||||
|
a full grace period if there is even one callback posted anywhere
|
||||||
|
in the system?
|
||||||
|
</td></tr>
|
||||||
|
<tr><th align="left">Answer:</th></tr>
|
||||||
|
<tr><td bgcolor="#ffffff"><font color="ffffff">
|
||||||
|
Absolutely not!!!
|
||||||
|
</font>
|
||||||
|
|
||||||
|
<p><font color="ffffff">
|
||||||
|
Yes, each RCU callback must wait for a grace period to complete,
|
||||||
|
but it might well be partly (or even completely) finished waiting
|
||||||
|
by the time <tt>rcu_barrier()</tt> is invoked.
|
||||||
|
In that case, <tt>rcu_barrier()</tt> need only wait for the
|
||||||
|
remaining portion of the grace period to elapse.
|
||||||
|
So even if there are quite a few callbacks posted,
|
||||||
|
<tt>rcu_barrier()</tt> might well return quite quickly.
|
||||||
|
</font>
|
||||||
|
|
||||||
|
<p><font color="ffffff">
|
||||||
|
So if you need to wait for a grace period as well as for all
|
||||||
|
pre-existing callbacks, you will need to invoke both
|
||||||
|
<tt>synchronize_rcu()</tt> and <tt>rcu_barrier()</tt>.
|
||||||
|
If latency is a concern, you can always use workqueues
|
||||||
|
to invoke them concurrently.
|
||||||
|
</font></td></tr>
|
||||||
|
<tr><td> </td></tr>
|
||||||
|
</table>
|
||||||
|
|
||||||
<h3><a name="Hotplug CPU">Hotplug CPU</a></h3>
|
<h3><a name="Hotplug CPU">Hotplug CPU</a></h3>
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
The Linux kernel supports CPU hotplug, which means that CPUs
|
The Linux kernel supports CPU hotplug, which means that CPUs
|
||||||
can come and go.
|
can come and go.
|
||||||
It is of course illegal to use any RCU API member from an offline CPU.
|
It is of course illegal to use any RCU API member from an offline CPU,
|
||||||
|
with the exception of <a href="#Sleepable RCU">SRCU</a> read-side
|
||||||
|
critical sections.
|
||||||
This requirement was present from day one in DYNIX/ptx, but
|
This requirement was present from day one in DYNIX/ptx, but
|
||||||
on the other hand, the Linux kernel's CPU-hotplug implementation
|
on the other hand, the Linux kernel's CPU-hotplug implementation
|
||||||
is “interesting.”
|
is “interesting.”
|
||||||
@@ -2310,19 +2375,18 @@ The Linux-kernel CPU-hotplug implementation has notifiers that
|
|||||||
are used to allow the various kernel subsystems (including RCU)
|
are used to allow the various kernel subsystems (including RCU)
|
||||||
to respond appropriately to a given CPU-hotplug operation.
|
to respond appropriately to a given CPU-hotplug operation.
|
||||||
Most RCU operations may be invoked from CPU-hotplug notifiers,
|
Most RCU operations may be invoked from CPU-hotplug notifiers,
|
||||||
including even normal synchronous grace-period operations
|
including even synchronous grace-period operations such as
|
||||||
such as <tt>synchronize_rcu()</tt>.
|
<tt>synchronize_rcu()</tt> and <tt>synchronize_rcu_expedited()</tt>.
|
||||||
However, expedited grace-period operations such as
|
|
||||||
<tt>synchronize_rcu_expedited()</tt> are not supported,
|
|
||||||
due to the fact that current implementations block CPU-hotplug
|
|
||||||
operations, which could result in deadlock.
|
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
In addition, all-callback-wait operations such as
|
However, all-callback-wait operations such as
|
||||||
<tt>rcu_barrier()</tt> are also not supported, due to the
|
<tt>rcu_barrier()</tt> are also not supported, due to the
|
||||||
fact that there are phases of CPU-hotplug operations where
|
fact that there are phases of CPU-hotplug operations where
|
||||||
the outgoing CPU's callbacks will not be invoked until after
|
the outgoing CPU's callbacks will not be invoked until after
|
||||||
the CPU-hotplug operation ends, which could also result in deadlock.
|
the CPU-hotplug operation ends, which could also result in deadlock.
|
||||||
|
Furthermore, <tt>rcu_barrier()</tt> blocks CPU-hotplug operations
|
||||||
|
during its execution, which results in another type of deadlock
|
||||||
|
when invoked from a CPU-hotplug notifier.
|
||||||
|
|
||||||
<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3>
|
<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3>
|
||||||
|
|
||||||
@@ -2863,6 +2927,27 @@ It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt>
|
|||||||
API, which, in combination with <tt>srcu_read_unlock()</tt>,
|
API, which, in combination with <tt>srcu_read_unlock()</tt>,
|
||||||
guarantees a full memory barrier.
|
guarantees a full memory barrier.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Also unlike other RCU flavors, SRCU's callbacks-wait function
|
||||||
|
<tt>srcu_barrier()</tt> may be invoked from CPU-hotplug notifiers,
|
||||||
|
though this is not necessarily a good idea.
|
||||||
|
The reason that this is possible is that SRCU is insensitive
|
||||||
|
to whether or not a CPU is online, which means that <tt>srcu_barrier()</tt>
|
||||||
|
need not exclude CPU-hotplug operations.
|
||||||
|
|
||||||
|
<p>
|
||||||
|
As of v4.12, SRCU's callbacks are maintained per-CPU, eliminating
|
||||||
|
a locking bottleneck present in prior kernel versions.
|
||||||
|
Although this will allow users to put much heavier stress on
|
||||||
|
<tt>call_srcu()</tt>, it is important to note that SRCU does not
|
||||||
|
yet take any special steps to deal with callback flooding.
|
||||||
|
So if you are posting (say) 10,000 SRCU callbacks per second per CPU,
|
||||||
|
you are probably totally OK, but if you intend to post (say) 1,000,000
|
||||||
|
SRCU callbacks per second per CPU, please run some tests first.
|
||||||
|
SRCU just might need a few adjustments to deal with that sort of load.
|
||||||
|
Of course, your mileage may vary based on the speed of your CPUs and
|
||||||
|
the size of your memory.
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
The
|
The
|
||||||
<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a>
|
<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a>
|
||||||
@@ -3021,8 +3106,8 @@ to do some redesign to avoid this scalability problem.
|
|||||||
|
|
||||||
<p>
|
<p>
|
||||||
RCU disables CPU hotplug in a few places, perhaps most notably in the
|
RCU disables CPU hotplug in a few places, perhaps most notably in the
|
||||||
expedited grace-period and <tt>rcu_barrier()</tt> operations.
|
<tt>rcu_barrier()</tt> operations.
|
||||||
If there is a strong reason to use expedited grace periods in CPU-hotplug
|
If there is a strong reason to use <tt>rcu_barrier()</tt> in CPU-hotplug
|
||||||
notifiers, it will be necessary to avoid disabling CPU hotplug.
|
notifiers, it will be necessary to avoid disabling CPU hotplug.
|
||||||
This would introduce some complexity, so there had better be a <i>very</i>
|
This would introduce some complexity, so there had better be a <i>very</i>
|
||||||
good reason.
|
good reason.
|
||||||
@@ -3096,9 +3181,5 @@ Andy Lutomirski for their help in rendering
|
|||||||
this article human readable, and to Michelle Rankin for her support
|
this article human readable, and to Michelle Rankin for her support
|
||||||
of this effort.
|
of this effort.
|
||||||
Other contributions are acknowledged in the Linux kernel's git archive.
|
Other contributions are acknowledged in the Linux kernel's git archive.
|
||||||
The cartoon is copyright (c) 2013 by Melissa Broussard,
|
|
||||||
and is provided
|
|
||||||
under the terms of the Creative Commons Attribution-Share Alike 3.0
|
|
||||||
United States license.
|
|
||||||
|
|
||||||
</body></html>
|
</body></html>
|
||||||
|
@@ -138,6 +138,15 @@ o Be very careful about comparing pointers obtained from
|
|||||||
This sort of comparison occurs frequently when scanning
|
This sort of comparison occurs frequently when scanning
|
||||||
RCU-protected circular linked lists.
|
RCU-protected circular linked lists.
|
||||||
|
|
||||||
|
Note that if checks for being within an RCU read-side
|
||||||
|
critical section are not required and the pointer is never
|
||||||
|
dereferenced, rcu_access_pointer() should be used in place
|
||||||
|
of rcu_dereference(). The rcu_access_pointer() primitive
|
||||||
|
does not require an enclosing read-side critical section,
|
||||||
|
and also omits the smp_read_barrier_depends() included in
|
||||||
|
rcu_dereference(), which in turn should provide a small
|
||||||
|
performance gain in some CPUs (e.g., the DEC Alpha).
|
||||||
|
|
||||||
o The comparison is against a pointer that references memory
|
o The comparison is against a pointer that references memory
|
||||||
that was initialized "a long time ago." The reason
|
that was initialized "a long time ago." The reason
|
||||||
this is safe is that even if misordering occurs, the
|
this is safe is that even if misordering occurs, the
|
||||||
|
@@ -1,5 +1,5 @@
|
|||||||
Using hlist_nulls to protect read-mostly linked lists and
|
Using hlist_nulls to protect read-mostly linked lists and
|
||||||
objects using SLAB_DESTROY_BY_RCU allocations.
|
objects using SLAB_TYPESAFE_BY_RCU allocations.
|
||||||
|
|
||||||
Please read the basics in Documentation/RCU/listRCU.txt
|
Please read the basics in Documentation/RCU/listRCU.txt
|
||||||
|
|
||||||
@@ -7,7 +7,7 @@ Using special makers (called 'nulls') is a convenient way
|
|||||||
to solve following problem :
|
to solve following problem :
|
||||||
|
|
||||||
A typical RCU linked list managing objects which are
|
A typical RCU linked list managing objects which are
|
||||||
allocated with SLAB_DESTROY_BY_RCU kmem_cache can
|
allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can
|
||||||
use following algos :
|
use following algos :
|
||||||
|
|
||||||
1) Lookup algo
|
1) Lookup algo
|
||||||
@@ -96,7 +96,7 @@ unlock_chain(); // typically a spin_unlock()
|
|||||||
3) Remove algo
|
3) Remove algo
|
||||||
--------------
|
--------------
|
||||||
Nothing special here, we can use a standard RCU hlist deletion.
|
Nothing special here, we can use a standard RCU hlist deletion.
|
||||||
But thanks to SLAB_DESTROY_BY_RCU, beware a deleted object can be reused
|
But thanks to SLAB_TYPESAFE_BY_RCU, beware a deleted object can be reused
|
||||||
very very fast (before the end of RCU grace period)
|
very very fast (before the end of RCU grace period)
|
||||||
|
|
||||||
if (put_last_reference_on(obj) {
|
if (put_last_reference_on(obj) {
|
||||||
|
@@ -1,9 +1,102 @@
|
|||||||
Using RCU's CPU Stall Detector
|
Using RCU's CPU Stall Detector
|
||||||
|
|
||||||
The rcu_cpu_stall_suppress module parameter enables RCU's CPU stall
|
This document first discusses what sorts of issues RCU's CPU stall
|
||||||
detector, which detects conditions that unduly delay RCU grace periods.
|
detector can locate, and then discusses kernel parameters and Kconfig
|
||||||
This module parameter enables CPU stall detection by default, but
|
options that can be used to fine-tune the detector's operation. Finally,
|
||||||
may be overridden via boot-time parameter or at runtime via sysfs.
|
this document explains the stall detector's "splat" format.
|
||||||
|
|
||||||
|
|
||||||
|
What Causes RCU CPU Stall Warnings?
|
||||||
|
|
||||||
|
So your kernel printed an RCU CPU stall warning. The next question is
|
||||||
|
"What caused it?" The following problems can result in RCU CPU stall
|
||||||
|
warnings:
|
||||||
|
|
||||||
|
o A CPU looping in an RCU read-side critical section.
|
||||||
|
|
||||||
|
o A CPU looping with interrupts disabled.
|
||||||
|
|
||||||
|
o A CPU looping with preemption disabled. This condition can
|
||||||
|
result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh
|
||||||
|
stalls.
|
||||||
|
|
||||||
|
o A CPU looping with bottom halves disabled. This condition can
|
||||||
|
result in RCU-sched and RCU-bh stalls.
|
||||||
|
|
||||||
|
o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the
|
||||||
|
kernel without invoking schedule(). Note that cond_resched()
|
||||||
|
does not necessarily prevent RCU CPU stall warnings. Therefore,
|
||||||
|
if the looping in the kernel is really expected and desirable
|
||||||
|
behavior, you might need to replace some of the cond_resched()
|
||||||
|
calls with calls to cond_resched_rcu_qs().
|
||||||
|
|
||||||
|
o Booting Linux using a console connection that is too slow to
|
||||||
|
keep up with the boot-time console-message rate. For example,
|
||||||
|
a 115Kbaud serial console can be -way- too slow to keep up
|
||||||
|
with boot-time message rates, and will frequently result in
|
||||||
|
RCU CPU stall warning messages. Especially if you have added
|
||||||
|
debug printk()s.
|
||||||
|
|
||||||
|
o Anything that prevents RCU's grace-period kthreads from running.
|
||||||
|
This can result in the "All QSes seen" console-log message.
|
||||||
|
This message will include information on when the kthread last
|
||||||
|
ran and how often it should be expected to run.
|
||||||
|
|
||||||
|
o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
|
||||||
|
happen to preempt a low-priority task in the middle of an RCU
|
||||||
|
read-side critical section. This is especially damaging if
|
||||||
|
that low-priority task is not permitted to run on any other CPU,
|
||||||
|
in which case the next RCU grace period can never complete, which
|
||||||
|
will eventually cause the system to run out of memory and hang.
|
||||||
|
While the system is in the process of running itself out of
|
||||||
|
memory, you might see stall-warning messages.
|
||||||
|
|
||||||
|
o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
|
||||||
|
is running at a higher priority than the RCU softirq threads.
|
||||||
|
This will prevent RCU callbacks from ever being invoked,
|
||||||
|
and in a CONFIG_PREEMPT_RCU kernel will further prevent
|
||||||
|
RCU grace periods from ever completing. Either way, the
|
||||||
|
system will eventually run out of memory and hang. In the
|
||||||
|
CONFIG_PREEMPT_RCU case, you might see stall-warning
|
||||||
|
messages.
|
||||||
|
|
||||||
|
o A hardware or software issue shuts off the scheduler-clock
|
||||||
|
interrupt on a CPU that is not in dyntick-idle mode. This
|
||||||
|
problem really has happened, and seems to be most likely to
|
||||||
|
result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels.
|
||||||
|
|
||||||
|
o A bug in the RCU implementation.
|
||||||
|
|
||||||
|
o A hardware failure. This is quite unlikely, but has occurred
|
||||||
|
at least once in real life. A CPU failed in a running system,
|
||||||
|
becoming unresponsive, but not causing an immediate crash.
|
||||||
|
This resulted in a series of RCU CPU stall warnings, eventually
|
||||||
|
leading to the realization that the CPU had failed.
|
||||||
|
|
||||||
|
The RCU, RCU-sched, RCU-bh, and RCU-tasks implementations have CPU stall
|
||||||
|
warnings. Note that SRCU does -not- have CPU stall warnings. Please note
|
||||||
|
that RCU only detects CPU stalls when there is a grace period in progress.
|
||||||
|
No grace period, no CPU stall warnings.
|
||||||
|
|
||||||
|
To diagnose the cause of the stall, inspect the stack traces.
|
||||||
|
The offending function will usually be near the top of the stack.
|
||||||
|
If you have a series of stall warnings from a single extended stall,
|
||||||
|
comparing the stack traces can often help determine where the stall
|
||||||
|
is occurring, which will usually be in the function nearest the top of
|
||||||
|
that portion of the stack which remains the same from trace to trace.
|
||||||
|
If you can reliably trigger the stall, ftrace can be quite helpful.
|
||||||
|
|
||||||
|
RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE
|
||||||
|
and with RCU's event tracing. For information on RCU's event tracing,
|
||||||
|
see include/trace/events/rcu.h.
|
||||||
|
|
||||||
|
|
||||||
|
Fine-Tuning the RCU CPU Stall Detector
|
||||||
|
|
||||||
|
The rcupdate.rcu_cpu_stall_suppress module parameter disables RCU's
|
||||||
|
CPU stall detector, which detects conditions that unduly delay RCU grace
|
||||||
|
periods. This module parameter enables CPU stall detection by default,
|
||||||
|
but may be overridden via boot-time parameter or at runtime via sysfs.
|
||||||
The stall detector's idea of what constitutes "unduly delayed" is
|
The stall detector's idea of what constitutes "unduly delayed" is
|
||||||
controlled by a set of kernel configuration variables and cpp macros:
|
controlled by a set of kernel configuration variables and cpp macros:
|
||||||
|
|
||||||
@@ -56,6 +149,9 @@ rcupdate.rcu_task_stall_timeout
|
|||||||
And continues with the output of sched_show_task() for each
|
And continues with the output of sched_show_task() for each
|
||||||
task stalling the current RCU-tasks grace period.
|
task stalling the current RCU-tasks grace period.
|
||||||
|
|
||||||
|
|
||||||
|
Interpreting RCU's CPU Stall-Detector "Splats"
|
||||||
|
|
||||||
For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling,
|
For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling,
|
||||||
it will print a message similar to the following:
|
it will print a message similar to the following:
|
||||||
|
|
||||||
@@ -178,89 +274,3 @@ grace period is in flight.
|
|||||||
|
|
||||||
It is entirely possible to see stall warnings from normal and from
|
It is entirely possible to see stall warnings from normal and from
|
||||||
expedited grace periods at about the same time from the same run.
|
expedited grace periods at about the same time from the same run.
|
||||||
|
|
||||||
|
|
||||||
What Causes RCU CPU Stall Warnings?
|
|
||||||
|
|
||||||
So your kernel printed an RCU CPU stall warning. The next question is
|
|
||||||
"What caused it?" The following problems can result in RCU CPU stall
|
|
||||||
warnings:
|
|
||||||
|
|
||||||
o A CPU looping in an RCU read-side critical section.
|
|
||||||
|
|
||||||
o A CPU looping with interrupts disabled. This condition can
|
|
||||||
result in RCU-sched and RCU-bh stalls.
|
|
||||||
|
|
||||||
o A CPU looping with preemption disabled. This condition can
|
|
||||||
result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh
|
|
||||||
stalls.
|
|
||||||
|
|
||||||
o A CPU looping with bottom halves disabled. This condition can
|
|
||||||
result in RCU-sched and RCU-bh stalls.
|
|
||||||
|
|
||||||
o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the
|
|
||||||
kernel without invoking schedule(). Note that cond_resched()
|
|
||||||
does not necessarily prevent RCU CPU stall warnings. Therefore,
|
|
||||||
if the looping in the kernel is really expected and desirable
|
|
||||||
behavior, you might need to replace some of the cond_resched()
|
|
||||||
calls with calls to cond_resched_rcu_qs().
|
|
||||||
|
|
||||||
o Booting Linux using a console connection that is too slow to
|
|
||||||
keep up with the boot-time console-message rate. For example,
|
|
||||||
a 115Kbaud serial console can be -way- too slow to keep up
|
|
||||||
with boot-time message rates, and will frequently result in
|
|
||||||
RCU CPU stall warning messages. Especially if you have added
|
|
||||||
debug printk()s.
|
|
||||||
|
|
||||||
o Anything that prevents RCU's grace-period kthreads from running.
|
|
||||||
This can result in the "All QSes seen" console-log message.
|
|
||||||
This message will include information on when the kthread last
|
|
||||||
ran and how often it should be expected to run.
|
|
||||||
|
|
||||||
o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
|
|
||||||
happen to preempt a low-priority task in the middle of an RCU
|
|
||||||
read-side critical section. This is especially damaging if
|
|
||||||
that low-priority task is not permitted to run on any other CPU,
|
|
||||||
in which case the next RCU grace period can never complete, which
|
|
||||||
will eventually cause the system to run out of memory and hang.
|
|
||||||
While the system is in the process of running itself out of
|
|
||||||
memory, you might see stall-warning messages.
|
|
||||||
|
|
||||||
o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
|
|
||||||
is running at a higher priority than the RCU softirq threads.
|
|
||||||
This will prevent RCU callbacks from ever being invoked,
|
|
||||||
and in a CONFIG_PREEMPT_RCU kernel will further prevent
|
|
||||||
RCU grace periods from ever completing. Either way, the
|
|
||||||
system will eventually run out of memory and hang. In the
|
|
||||||
CONFIG_PREEMPT_RCU case, you might see stall-warning
|
|
||||||
messages.
|
|
||||||
|
|
||||||
o A hardware or software issue shuts off the scheduler-clock
|
|
||||||
interrupt on a CPU that is not in dyntick-idle mode. This
|
|
||||||
problem really has happened, and seems to be most likely to
|
|
||||||
result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels.
|
|
||||||
|
|
||||||
o A bug in the RCU implementation.
|
|
||||||
|
|
||||||
o A hardware failure. This is quite unlikely, but has occurred
|
|
||||||
at least once in real life. A CPU failed in a running system,
|
|
||||||
becoming unresponsive, but not causing an immediate crash.
|
|
||||||
This resulted in a series of RCU CPU stall warnings, eventually
|
|
||||||
leading the realization that the CPU had failed.
|
|
||||||
|
|
||||||
The RCU, RCU-sched, RCU-bh, and RCU-tasks implementations have CPU stall
|
|
||||||
warning. Note that SRCU does -not- have CPU stall warnings. Please note
|
|
||||||
that RCU only detects CPU stalls when there is a grace period in progress.
|
|
||||||
No grace period, no CPU stall warnings.
|
|
||||||
|
|
||||||
To diagnose the cause of the stall, inspect the stack traces.
|
|
||||||
The offending function will usually be near the top of the stack.
|
|
||||||
If you have a series of stall warnings from a single extended stall,
|
|
||||||
comparing the stack traces can often help determine where the stall
|
|
||||||
is occurring, which will usually be in the function nearest the top of
|
|
||||||
that portion of the stack which remains the same from trace to trace.
|
|
||||||
If you can reliably trigger the stall, ftrace can be quite helpful.
|
|
||||||
|
|
||||||
RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE
|
|
||||||
and with RCU's event tracing. For information on RCU's event tracing,
|
|
||||||
see include/trace/events/rcu.h.
|
|
||||||
|
@@ -562,7 +562,9 @@ This section presents a "toy" RCU implementation that is based on
|
|||||||
familiar locking primitives. Its overhead makes it a non-starter for
|
familiar locking primitives. Its overhead makes it a non-starter for
|
||||||
real-life use, as does its lack of scalability. It is also unsuitable
|
real-life use, as does its lack of scalability. It is also unsuitable
|
||||||
for realtime use, since it allows scheduling latency to "bleed" from
|
for realtime use, since it allows scheduling latency to "bleed" from
|
||||||
one read-side critical section to another.
|
one read-side critical section to another. It also assumes recursive
|
||||||
|
reader-writer locks: If you try this with non-recursive locks, and
|
||||||
|
you allow nested rcu_read_lock() calls, you can deadlock.
|
||||||
|
|
||||||
However, it is probably the easiest implementation to relate to, so is
|
However, it is probably the easiest implementation to relate to, so is
|
||||||
a good starting point.
|
a good starting point.
|
||||||
@@ -587,20 +589,21 @@ It is extremely simple:
|
|||||||
write_unlock(&rcu_gp_mutex);
|
write_unlock(&rcu_gp_mutex);
|
||||||
}
|
}
|
||||||
|
|
||||||
[You can ignore rcu_assign_pointer() and rcu_dereference() without
|
[You can ignore rcu_assign_pointer() and rcu_dereference() without missing
|
||||||
missing much. But here they are anyway. And whatever you do, don't
|
much. But here are simplified versions anyway. And whatever you do,
|
||||||
forget about them when submitting patches making use of RCU!]
|
don't forget about them when submitting patches making use of RCU!]
|
||||||
|
|
||||||
#define rcu_assign_pointer(p, v) ({ \
|
#define rcu_assign_pointer(p, v) \
|
||||||
smp_wmb(); \
|
({ \
|
||||||
(p) = (v); \
|
smp_store_release(&(p), (v)); \
|
||||||
})
|
})
|
||||||
|
|
||||||
#define rcu_dereference(p) ({ \
|
#define rcu_dereference(p) \
|
||||||
typeof(p) _________p1 = p; \
|
({ \
|
||||||
smp_read_barrier_depends(); \
|
typeof(p) _________p1 = p; \
|
||||||
(_________p1); \
|
smp_read_barrier_depends(); \
|
||||||
})
|
(_________p1); \
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
The rcu_read_lock() and rcu_read_unlock() primitive read-acquire
|
The rcu_read_lock() and rcu_read_unlock() primitive read-acquire
|
||||||
@@ -925,7 +928,8 @@ d. Do you need RCU grace periods to complete even in the face
|
|||||||
|
|
||||||
e. Is your workload too update-intensive for normal use of
|
e. Is your workload too update-intensive for normal use of
|
||||||
RCU, but inappropriate for other synchronization mechanisms?
|
RCU, but inappropriate for other synchronization mechanisms?
|
||||||
If so, consider SLAB_DESTROY_BY_RCU. But please be careful!
|
If so, consider SLAB_TYPESAFE_BY_RCU (which was originally
|
||||||
|
named SLAB_DESTROY_BY_RCU). But please be careful!
|
||||||
|
|
||||||
f. Do you need read-side critical sections that are respected
|
f. Do you need read-side critical sections that are respected
|
||||||
even though they are in the middle of the idle loop, during
|
even though they are in the middle of the idle loop, during
|
||||||
|
@@ -59,20 +59,28 @@ button driver uses the following 3 modes in order not to trigger issues.
|
|||||||
If the userspace hasn't been prepared to ignore the unreliable "opened"
|
If the userspace hasn't been prepared to ignore the unreliable "opened"
|
||||||
events and the unreliable initial state notification, Linux users can use
|
events and the unreliable initial state notification, Linux users can use
|
||||||
the following kernel parameters to handle the possible issues:
|
the following kernel parameters to handle the possible issues:
|
||||||
A. button.lid_init_state=open:
|
A. button.lid_init_state=method:
|
||||||
|
When this option is specified, the ACPI button driver reports the
|
||||||
|
initial lid state using the returning value of the _LID control method
|
||||||
|
and whether the "opened"/"closed" events are paired fully relies on the
|
||||||
|
firmware implementation.
|
||||||
|
This option can be used to fix some platforms where the returning value
|
||||||
|
of the _LID control method is reliable but the initial lid state
|
||||||
|
notification is missing.
|
||||||
|
This option is the default behavior during the period the userspace
|
||||||
|
isn't ready to handle the buggy AML tables.
|
||||||
|
B. button.lid_init_state=open:
|
||||||
When this option is specified, the ACPI button driver always reports the
|
When this option is specified, the ACPI button driver always reports the
|
||||||
initial lid state as "opened" and whether the "opened"/"closed" events
|
initial lid state as "opened" and whether the "opened"/"closed" events
|
||||||
are paired fully relies on the firmware implementation.
|
are paired fully relies on the firmware implementation.
|
||||||
This may fix some platforms where the returning value of the _LID
|
This may fix some platforms where the returning value of the _LID
|
||||||
control method is not reliable and the initial lid state notification is
|
control method is not reliable and the initial lid state notification is
|
||||||
missing.
|
missing.
|
||||||
This option is the default behavior during the period the userspace
|
|
||||||
isn't ready to handle the buggy AML tables.
|
|
||||||
|
|
||||||
If the userspace has been prepared to ignore the unreliable "opened" events
|
If the userspace has been prepared to ignore the unreliable "opened" events
|
||||||
and the unreliable initial state notification, Linux users should always
|
and the unreliable initial state notification, Linux users should always
|
||||||
use the following kernel parameter:
|
use the following kernel parameter:
|
||||||
B. button.lid_init_state=ignore:
|
C. button.lid_init_state=ignore:
|
||||||
When this option is specified, the ACPI button driver never reports the
|
When this option is specified, the ACPI button driver never reports the
|
||||||
initial lid state and there is a compensation mechanism implemented to
|
initial lid state and there is a compensation mechanism implemented to
|
||||||
ensure that the reliable "closed" notifications can always be delivered
|
ensure that the reliable "closed" notifications can always be delivered
|
||||||
|
@@ -15,7 +15,7 @@ kernel.
|
|||||||
CONFIG_ACPI_DEBUGGER=y
|
CONFIG_ACPI_DEBUGGER=y
|
||||||
CONFIG_ACPI_DEBUGGER_USER=m
|
CONFIG_ACPI_DEBUGGER_USER=m
|
||||||
|
|
||||||
The userspace utlities can be built from the kernel source tree using
|
The userspace utilities can be built from the kernel source tree using
|
||||||
the following commands:
|
the following commands:
|
||||||
|
|
||||||
$ cd tools
|
$ cd tools
|
||||||
|
162
Documentation/acpi/dsd/graph.txt
Normal file
162
Documentation/acpi/dsd/graph.txt
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
Graphs
|
||||||
|
|
||||||
|
|
||||||
|
_DSD
|
||||||
|
----
|
||||||
|
|
||||||
|
_DSD (Device Specific Data) [7] is a predefined ACPI device
|
||||||
|
configuration object that can be used to convey information on
|
||||||
|
hardware features which are not specifically covered by the ACPI
|
||||||
|
specification [1][6]. There are two _DSD extensions that are relevant
|
||||||
|
for graphs: property [4] and hierarchical data extensions [5]. The
|
||||||
|
property extension provides generic key-value pairs whereas the
|
||||||
|
hierarchical data extension supports nodes with references to other
|
||||||
|
nodes, forming a tree. The nodes in the tree may contain properties as
|
||||||
|
defined by the property extension. The two extensions together provide
|
||||||
|
a tree-like structure with zero or more properties (key-value pairs)
|
||||||
|
in each node of the tree.
|
||||||
|
|
||||||
|
The data structure may be accessed at runtime by using the device_*
|
||||||
|
and fwnode_* functions defined in include/linux/fwnode.h .
|
||||||
|
|
||||||
|
Fwnode represents a generic firmware node object. It is independent of
|
||||||
|
the firmware type. In ACPI, fwnodes are _DSD hierarchical data
|
||||||
|
extensions objects. A device's _DSD object is represented by an
|
||||||
|
fwnode.
|
||||||
|
|
||||||
|
The data structure may be referenced to elsewhere in the ACPI tables
|
||||||
|
by using a hard reference to the device itself and an index to the
|
||||||
|
hierarchical data extension array on each depth.
|
||||||
|
|
||||||
|
|
||||||
|
Ports and endpoints
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
The port and endpoint concepts are very similar to those in Devicetree
|
||||||
|
[3]. A port represents an interface in a device, and an endpoint
|
||||||
|
represents a connection to that interface.
|
||||||
|
|
||||||
|
All port nodes are located under the device's "_DSD" node in the
|
||||||
|
hierarchical data extension tree. The property extension related to
|
||||||
|
each port node must contain the key "port" and an integer value which
|
||||||
|
is the number of the port. The object it refers to should be called "PRTX",
|
||||||
|
where "X" is the number of the port.
|
||||||
|
|
||||||
|
Further on, endpoints are located under the individual port nodes. The
|
||||||
|
first hierarchical data extension package list entry of the endpoint
|
||||||
|
nodes must begin with "endpoint" and must be followed by the number
|
||||||
|
of the endpoint. The object it refers to should be called "EPXY", where
|
||||||
|
"X" is the number of the port and "Y" is the number of the endpoint.
|
||||||
|
|
||||||
|
Each port node contains a property extension key "port", the value of
|
||||||
|
which is the number of the port node. Each endpoint is similarly numbered
|
||||||
|
with a property extension key "endpoint". Port numbers must be unique within a
|
||||||
|
device and endpoint numbers must be unique within a port.
|
||||||
|
|
||||||
|
The endpoint reference uses property extension with "remote-endpoint" property
|
||||||
|
name followed by a reference in the same package. Such references consist of the
|
||||||
|
remote device reference, number of the port in the device and finally the
|
||||||
|
number of the endpoint in that port. Individual references thus appear as:
|
||||||
|
|
||||||
|
Package() { device, port_number, endpoint_number }
|
||||||
|
|
||||||
|
The references to endpoints must always be done both ways, to the
|
||||||
|
remote endpoint and back from the referred remote endpoint node.
|
||||||
|
|
||||||
|
A simple example of this is shown below:
|
||||||
|
|
||||||
|
Scope (\_SB.PCI0.I2C2)
|
||||||
|
{
|
||||||
|
Device (CAM0)
|
||||||
|
{
|
||||||
|
Name (_DSD, Package () {
|
||||||
|
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
||||||
|
Package () {
|
||||||
|
Package () { "compatible", Package () { "nokia,smia" } },
|
||||||
|
},
|
||||||
|
ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
|
||||||
|
Package () {
|
||||||
|
Package () { "port0", "PRT0" },
|
||||||
|
}
|
||||||
|
})
|
||||||
|
Name (PRT0, Package() {
|
||||||
|
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
||||||
|
Package () {
|
||||||
|
Package () { "port", 0 },
|
||||||
|
},
|
||||||
|
ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
|
||||||
|
Package () {
|
||||||
|
Package () { "endpoint0", "EP00" },
|
||||||
|
}
|
||||||
|
})
|
||||||
|
Name (EP00, Package() {
|
||||||
|
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
||||||
|
Package () {
|
||||||
|
Package () { "endpoint", 0 },
|
||||||
|
Package () { "remote-endpoint", Package() { \_SB.PCI0.ISP, 4, 0 } },
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Scope (\_SB.PCI0)
|
||||||
|
{
|
||||||
|
Device (ISP)
|
||||||
|
{
|
||||||
|
Name (_DSD, Package () {
|
||||||
|
ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
|
||||||
|
Package () {
|
||||||
|
Package () { "port4", "PRT4" },
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
Name (PRT4, Package() {
|
||||||
|
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
||||||
|
Package () {
|
||||||
|
Package () { "port", 4 }, /* CSI-2 port number */
|
||||||
|
},
|
||||||
|
ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
|
||||||
|
Package () {
|
||||||
|
Package () { "endpoint0", "EP40" },
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
Name (EP40, Package() {
|
||||||
|
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
||||||
|
Package () {
|
||||||
|
Package () { "endpoint", 0 },
|
||||||
|
Package () { "remote-endpoint", Package () { \_SB.PCI0.I2C2.CAM0, 0, 0 } },
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Here, the port 0 of the "CAM0" device is connected to the port 4 of
|
||||||
|
the "ISP" device and vice versa.
|
||||||
|
|
||||||
|
|
||||||
|
References
|
||||||
|
----------
|
||||||
|
|
||||||
|
[1] _DSD (Device Specific Data) Implementation Guide.
|
||||||
|
<URL:http://www.uefi.org/sites/default/files/resources/_DSD-implementation-guide-toplevel-1_1.htm>,
|
||||||
|
referenced 2016-10-03.
|
||||||
|
|
||||||
|
[2] Devicetree. <URL:http://www.devicetree.org>, referenced 2016-10-03.
|
||||||
|
|
||||||
|
[3] Documentation/devicetree/bindings/graph.txt
|
||||||
|
|
||||||
|
[4] Device Properties UUID For _DSD.
|
||||||
|
<URL:http://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf>,
|
||||||
|
referenced 2016-10-04.
|
||||||
|
|
||||||
|
[5] Hierarchical Data Extension UUID For _DSD.
|
||||||
|
<URL:http://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.pdf>,
|
||||||
|
referenced 2016-10-04.
|
||||||
|
|
||||||
|
[6] Advanced Configuration and Power Interface Specification.
|
||||||
|
<URL:http://www.uefi.org/sites/default/files/resources/ACPI_6_1.pdf>,
|
||||||
|
referenced 2016-10-04.
|
||||||
|
|
||||||
|
[7] _DSD Device Properties Usage Rules.
|
||||||
|
Documentation/acpi/DSD-properties-rules.txt
|
@@ -367,10 +367,10 @@ resulting child platform device.
|
|||||||
|
|
||||||
Device Tree namespace link device ID
|
Device Tree namespace link device ID
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
The Device Tree protocol uses device indentification based on the "compatible"
|
The Device Tree protocol uses device identification based on the "compatible"
|
||||||
property whose value is a string or an array of strings recognized as device
|
property whose value is a string or an array of strings recognized as device
|
||||||
identifiers by drivers and the driver core. The set of all those strings may be
|
identifiers by drivers and the driver core. The set of all those strings may be
|
||||||
regarded as a device indentification namespace analogous to the ACPI/PNP device
|
regarded as a device identification namespace analogous to the ACPI/PNP device
|
||||||
ID namespace. Consequently, in principle it should not be necessary to allocate
|
ID namespace. Consequently, in principle it should not be necessary to allocate
|
||||||
a new (and arguably redundant) ACPI/PNP device ID for a device with an existing
|
a new (and arguably redundant) ACPI/PNP device ID for a device with an existing
|
||||||
identification string in the Device Tree (DT) namespace, especially if that ID
|
identification string in the Device Tree (DT) namespace, especially if that ID
|
||||||
@@ -381,7 +381,7 @@ In ACPI, the device identification object called _CID (Compatible ID) is used to
|
|||||||
list the IDs of devices the given one is compatible with, but those IDs must
|
list the IDs of devices the given one is compatible with, but those IDs must
|
||||||
belong to one of the namespaces prescribed by the ACPI specification (see
|
belong to one of the namespaces prescribed by the ACPI specification (see
|
||||||
Section 6.1.2 of ACPI 6.0 for details) and the DT namespace is not one of them.
|
Section 6.1.2 of ACPI 6.0 for details) and the DT namespace is not one of them.
|
||||||
Moreover, the specification mandates that either a _HID or an _ADR identificaion
|
Moreover, the specification mandates that either a _HID or an _ADR identification
|
||||||
object be present for all ACPI objects representing devices (Section 6.1 of ACPI
|
object be present for all ACPI objects representing devices (Section 6.1 of ACPI
|
||||||
6.0). For non-enumerable bus types that object must be _HID and its value must
|
6.0). For non-enumerable bus types that object must be _HID and its value must
|
||||||
be a device ID from one of the namespaces prescribed by the specification too.
|
be a device ID from one of the namespaces prescribed by the specification too.
|
||||||
|
@@ -24,7 +24,7 @@ upstream.
|
|||||||
The homepage of ACPICA project is: www.acpica.org, it is maintained and
|
The homepage of ACPICA project is: www.acpica.org, it is maintained and
|
||||||
supported by Intel Corporation.
|
supported by Intel Corporation.
|
||||||
|
|
||||||
The following figure depicts the Linux ACPI subystem where the ACPICA
|
The following figure depicts the Linux ACPI subsystem where the ACPICA
|
||||||
adaptation is included:
|
adaptation is included:
|
||||||
|
|
||||||
+---------------------------------------------------------+
|
+---------------------------------------------------------+
|
||||||
@@ -110,7 +110,7 @@ upstream.
|
|||||||
Linux patches. The patches generated by this process are referred to as
|
Linux patches. The patches generated by this process are referred to as
|
||||||
"linuxized ACPICA patches". The release process is carried out on a local
|
"linuxized ACPICA patches". The release process is carried out on a local
|
||||||
copy of the ACPICA git repository. Each commit in the monthly release is
|
copy of the ACPICA git repository. Each commit in the monthly release is
|
||||||
converted into a linuxized ACPICA patch. Together, they form the montly
|
converted into a linuxized ACPICA patch. Together, they form the monthly
|
||||||
ACPICA release patchset for the Linux ACPI community. This process is
|
ACPICA release patchset for the Linux ACPI community. This process is
|
||||||
illustrated in the following figure:
|
illustrated in the following figure:
|
||||||
|
|
||||||
@@ -165,7 +165,7 @@ upstream.
|
|||||||
<http://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git>.
|
<http://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git>.
|
||||||
|
|
||||||
Before the linuxized ACPICA patches are sent to the Linux ACPI community
|
Before the linuxized ACPICA patches are sent to the Linux ACPI community
|
||||||
for review, there is a quality ensurance build test process to reduce
|
for review, there is a quality assurance build test process to reduce
|
||||||
porting issues. Currently this build process only takes care of the
|
porting issues. Currently this build process only takes care of the
|
||||||
following kernel configuration options:
|
following kernel configuration options:
|
||||||
CONFIG_ACPI/CONFIG_ACPI_DEBUG/CONFIG_ACPI_DEBUGGER
|
CONFIG_ACPI/CONFIG_ACPI_DEBUG/CONFIG_ACPI_DEBUGGER
|
||||||
@@ -195,12 +195,12 @@ upstream.
|
|||||||
release utilities (please refer to Section 4 below for the details).
|
release utilities (please refer to Section 4 below for the details).
|
||||||
3. Linux specific features - Sometimes it's impossible to use the
|
3. Linux specific features - Sometimes it's impossible to use the
|
||||||
current ACPICA APIs to implement features required by the Linux kernel,
|
current ACPICA APIs to implement features required by the Linux kernel,
|
||||||
so Linux developers occasionaly have to change ACPICA code directly.
|
so Linux developers occasionally have to change ACPICA code directly.
|
||||||
Those changes may not be acceptable by ACPICA upstream and in such cases
|
Those changes may not be acceptable by ACPICA upstream and in such cases
|
||||||
they are left as committed ACPICA divergences unless the ACPICA side can
|
they are left as committed ACPICA divergences unless the ACPICA side can
|
||||||
implement new mechanisms as replacements for them.
|
implement new mechanisms as replacements for them.
|
||||||
4. ACPICA release fixups - ACPICA only tests commits using a set of the
|
4. ACPICA release fixups - ACPICA only tests commits using a set of the
|
||||||
user space simulation utilies, thus the linuxized ACPICA patches may
|
user space simulation utilities, thus the linuxized ACPICA patches may
|
||||||
break the Linux kernel, leaving us build/boot failures. In order to
|
break the Linux kernel, leaving us build/boot failures. In order to
|
||||||
avoid breaking Linux bisection, fixes are applied directly to the
|
avoid breaking Linux bisection, fixes are applied directly to the
|
||||||
linuxized ACPICA patches during the release process. When the release
|
linuxized ACPICA patches during the release process. When the release
|
||||||
|
@@ -27,7 +27,7 @@ On what hardware does it run?
|
|||||||
today Linux also runs on (at least) the Compaq Alpha AXP, Sun SPARC and
|
today Linux also runs on (at least) the Compaq Alpha AXP, Sun SPARC and
|
||||||
UltraSPARC, Motorola 68000, PowerPC, PowerPC64, ARM, Hitachi SuperH, Cell,
|
UltraSPARC, Motorola 68000, PowerPC, PowerPC64, ARM, Hitachi SuperH, Cell,
|
||||||
IBM S/390, MIPS, HP PA-RISC, Intel IA-64, DEC VAX, AMD x86-64, AXIS CRIS,
|
IBM S/390, MIPS, HP PA-RISC, Intel IA-64, DEC VAX, AMD x86-64, AXIS CRIS,
|
||||||
Xtensa, Tilera TILE, AVR32, ARC and Renesas M32R architectures.
|
Xtensa, Tilera TILE, ARC and Renesas M32R architectures.
|
||||||
|
|
||||||
Linux is easily portable to most general-purpose 32- or 64-bit architectures
|
Linux is easily portable to most general-purpose 32- or 64-bit architectures
|
||||||
as long as they have a paged memory management unit (PMMU) and a port of the
|
as long as they have a paged memory management unit (PMMU) and a port of the
|
||||||
@@ -362,7 +362,7 @@ If something goes wrong
|
|||||||
as is, otherwise you will have to use the ``ksymoops`` program to make
|
as is, otherwise you will have to use the ``ksymoops`` program to make
|
||||||
sense of the dump (but compiling with CONFIG_KALLSYMS is usually preferred).
|
sense of the dump (but compiling with CONFIG_KALLSYMS is usually preferred).
|
||||||
This utility can be downloaded from
|
This utility can be downloaded from
|
||||||
ftp://ftp.<country>.kernel.org/pub/linux/utils/kernel/ksymoops/ .
|
https://www.kernel.org/pub/linux/utils/kernel/ksymoops/ .
|
||||||
Alternatively, you can do the dump lookup by hand:
|
Alternatively, you can do the dump lookup by hand:
|
||||||
|
|
||||||
- In debugging dumps like the above, it helps enormously if you can
|
- In debugging dumps like the above, it helps enormously if you can
|
||||||
|
@@ -60,6 +60,7 @@ configure specific aspects of kernel behavior to your liking.
|
|||||||
mono
|
mono
|
||||||
java
|
java
|
||||||
ras
|
ras
|
||||||
|
pm/index
|
||||||
|
|
||||||
.. only:: subproject and html
|
.. only:: subproject and html
|
||||||
|
|
||||||
|
@@ -1,3 +1,5 @@
|
|||||||
|
.. _kernelparameters:
|
||||||
|
|
||||||
The kernel's command-line parameters
|
The kernel's command-line parameters
|
||||||
====================================
|
====================================
|
||||||
|
|
||||||
@@ -86,7 +88,6 @@ parameter is applicable::
|
|||||||
APIC APIC support is enabled.
|
APIC APIC support is enabled.
|
||||||
APM Advanced Power Management support is enabled.
|
APM Advanced Power Management support is enabled.
|
||||||
ARM ARM architecture is enabled.
|
ARM ARM architecture is enabled.
|
||||||
AVR32 AVR32 architecture is enabled.
|
|
||||||
AX25 Appropriate AX.25 support is enabled.
|
AX25 Appropriate AX.25 support is enabled.
|
||||||
BLACKFIN Blackfin architecture is enabled.
|
BLACKFIN Blackfin architecture is enabled.
|
||||||
CLK Common clock infrastructure is enabled.
|
CLK Common clock infrastructure is enabled.
|
||||||
@@ -197,7 +198,7 @@ and is between 256 and 4096 characters. It is defined in the file
|
|||||||
|
|
||||||
Finally, the [KMG] suffix is commonly described after a number of kernel
|
Finally, the [KMG] suffix is commonly described after a number of kernel
|
||||||
parameter values. These 'K', 'M', and 'G' letters represent the _binary_
|
parameter values. These 'K', 'M', and 'G' letters represent the _binary_
|
||||||
multipliers 'Kilo', 'Mega', and 'Giga', equalling 2^10, 2^20, and 2^30
|
multipliers 'Kilo', 'Mega', and 'Giga', equaling 2^10, 2^20, and 2^30
|
||||||
bytes respectively. Such letter suffixes can also be entirely omitted:
|
bytes respectively. Such letter suffixes can also be entirely omitted:
|
||||||
|
|
||||||
.. include:: kernel-parameters.txt
|
.. include:: kernel-parameters.txt
|
||||||
|
@@ -531,7 +531,6 @@
|
|||||||
[ACPI] acpi_pm
|
[ACPI] acpi_pm
|
||||||
[ARM] imx_timer1,OSTS,netx_timer,mpu_timer2,
|
[ARM] imx_timer1,OSTS,netx_timer,mpu_timer2,
|
||||||
pxa_timer,timer3,32k_counter,timer0_1
|
pxa_timer,timer3,32k_counter,timer0_1
|
||||||
[AVR32] avr32
|
|
||||||
[X86-32] pit,hpet,tsc;
|
[X86-32] pit,hpet,tsc;
|
||||||
scx200_hrt on Geode; cyclone on IBM x440
|
scx200_hrt on Geode; cyclone on IBM x440
|
||||||
[MIPS] MIPS
|
[MIPS] MIPS
|
||||||
@@ -867,6 +866,15 @@
|
|||||||
|
|
||||||
dscc4.setup= [NET]
|
dscc4.setup= [NET]
|
||||||
|
|
||||||
|
dt_cpu_ftrs= [PPC]
|
||||||
|
Format: {"off" | "known"}
|
||||||
|
Control how the dt_cpu_ftrs device-tree binding is
|
||||||
|
used for CPU feature discovery and setup (if it
|
||||||
|
exists).
|
||||||
|
off: Do not use it, fall back to legacy cpu table.
|
||||||
|
known: Do not pass through unknown features to guests
|
||||||
|
or userspace, only those that the kernel is aware of.
|
||||||
|
|
||||||
dump_apple_properties [X86]
|
dump_apple_properties [X86]
|
||||||
Dump name and content of EFI device properties on
|
Dump name and content of EFI device properties on
|
||||||
x86 Macs. Useful for driver authors to determine
|
x86 Macs. Useful for driver authors to determine
|
||||||
@@ -973,7 +981,7 @@
|
|||||||
A valid base address must be provided, and the serial
|
A valid base address must be provided, and the serial
|
||||||
port must already be setup and configured.
|
port must already be setup and configured.
|
||||||
|
|
||||||
armada3700_uart,<addr>
|
ar3700_uart,<addr>
|
||||||
Start an early, polled-mode console on the
|
Start an early, polled-mode console on the
|
||||||
Armada 3700 serial port at the specified
|
Armada 3700 serial port at the specified
|
||||||
address. The serial port must already be setup
|
address. The serial port must already be setup
|
||||||
@@ -989,6 +997,7 @@
|
|||||||
earlyprintk=ttySn[,baudrate]
|
earlyprintk=ttySn[,baudrate]
|
||||||
earlyprintk=dbgp[debugController#]
|
earlyprintk=dbgp[debugController#]
|
||||||
earlyprintk=pciserial,bus:device.function[,baudrate]
|
earlyprintk=pciserial,bus:device.function[,baudrate]
|
||||||
|
earlyprintk=xdbc[xhciController#]
|
||||||
|
|
||||||
earlyprintk is useful when the kernel crashes before
|
earlyprintk is useful when the kernel crashes before
|
||||||
the normal console is initialized. It is not enabled by
|
the normal console is initialized. It is not enabled by
|
||||||
@@ -1578,6 +1587,15 @@
|
|||||||
extended tables themselves, and also PASID support. With
|
extended tables themselves, and also PASID support. With
|
||||||
this option set, extended tables will not be used even
|
this option set, extended tables will not be used even
|
||||||
on hardware which claims to support them.
|
on hardware which claims to support them.
|
||||||
|
tboot_noforce [Default Off]
|
||||||
|
Do not force the Intel IOMMU enabled under tboot.
|
||||||
|
By default, tboot will force Intel IOMMU on, which
|
||||||
|
could harm performance of some high-throughput
|
||||||
|
devices like 40GBit network cards, even if identity
|
||||||
|
mapping is enabled.
|
||||||
|
Note that using this option lowers the security
|
||||||
|
provided by tboot because it makes the system
|
||||||
|
vulnerable to DMA attacks.
|
||||||
|
|
||||||
intel_idle.max_cstate= [KNL,HW,ACPI,X86]
|
intel_idle.max_cstate= [KNL,HW,ACPI,X86]
|
||||||
0 disables intel_idle and falls back on acpi_idle.
|
0 disables intel_idle and falls back on acpi_idle.
|
||||||
@@ -1644,6 +1662,12 @@
|
|||||||
nobypass [PPC/POWERNV]
|
nobypass [PPC/POWERNV]
|
||||||
Disable IOMMU bypass, using IOMMU for PCI devices.
|
Disable IOMMU bypass, using IOMMU for PCI devices.
|
||||||
|
|
||||||
|
iommu.passthrough=
|
||||||
|
[ARM64] Configure DMA to bypass the IOMMU by default.
|
||||||
|
Format: { "0" | "1" }
|
||||||
|
0 - Use IOMMU translation for DMA.
|
||||||
|
1 - Bypass the IOMMU for DMA.
|
||||||
|
unset - Use IOMMU translation for DMA.
|
||||||
|
|
||||||
io7= [HW] IO7 for Marvel based alpha systems
|
io7= [HW] IO7 for Marvel based alpha systems
|
||||||
See comment before marvel_specify_io7 in
|
See comment before marvel_specify_io7 in
|
||||||
@@ -2419,13 +2443,7 @@
|
|||||||
and gids from such clients. This is intended to ease
|
and gids from such clients. This is intended to ease
|
||||||
migration from NFSv2/v3.
|
migration from NFSv2/v3.
|
||||||
|
|
||||||
objlayoutdriver.osd_login_prog=
|
nmi_debug= [KNL,SH] Specify one or more actions to take
|
||||||
[NFS] [OBJLAYOUT] sets the pathname to the program which
|
|
||||||
is used to automatically discover and login into new
|
|
||||||
osd-targets. Please see:
|
|
||||||
Documentation/filesystems/pnfs.txt for more explanations
|
|
||||||
|
|
||||||
nmi_debug= [KNL,AVR32,SH] Specify one or more actions to take
|
|
||||||
when a NMI is triggered.
|
when a NMI is triggered.
|
||||||
Format: [state][,regs][,debounce][,die]
|
Format: [state][,regs][,debounce][,die]
|
||||||
|
|
||||||
@@ -3178,6 +3196,12 @@
|
|||||||
ramdisk_size= [RAM] Sizes of RAM disks in kilobytes
|
ramdisk_size= [RAM] Sizes of RAM disks in kilobytes
|
||||||
See Documentation/blockdev/ramdisk.txt.
|
See Documentation/blockdev/ramdisk.txt.
|
||||||
|
|
||||||
|
ras=option[,option,...] [KNL] RAS-specific options
|
||||||
|
|
||||||
|
cec_disable [X86]
|
||||||
|
Disable the Correctable Errors Collector,
|
||||||
|
see CONFIG_RAS_CEC help text.
|
||||||
|
|
||||||
rcu_nocbs= [KNL]
|
rcu_nocbs= [KNL]
|
||||||
The argument is a cpu list, as described above.
|
The argument is a cpu list, as described above.
|
||||||
|
|
||||||
@@ -3779,6 +3803,21 @@
|
|||||||
spia_pedr=
|
spia_pedr=
|
||||||
spia_peddr=
|
spia_peddr=
|
||||||
|
|
||||||
|
srcutree.exp_holdoff [KNL]
|
||||||
|
Specifies how many nanoseconds must elapse
|
||||||
|
since the end of the last SRCU grace period for
|
||||||
|
a given srcu_struct until the next normal SRCU
|
||||||
|
grace period will be considered for automatic
|
||||||
|
expediting. Set to zero to disable automatic
|
||||||
|
expediting.
|
||||||
|
|
||||||
|
stack_guard_gap= [MM]
|
||||||
|
override the default stack gap protection. The value
|
||||||
|
is in page units and it defines how many pages prior
|
||||||
|
to (for stacks growing down) resp. after (for stacks
|
||||||
|
growing up) the main stack are reserved for no other
|
||||||
|
mapping. Default value is 256 pages.
|
||||||
|
|
||||||
stacktrace [FTRACE]
|
stacktrace [FTRACE]
|
||||||
Enabled the stack tracer on boot up.
|
Enabled the stack tracer on boot up.
|
||||||
|
|
||||||
@@ -4121,6 +4160,9 @@
|
|||||||
usbhid.mousepoll=
|
usbhid.mousepoll=
|
||||||
[USBHID] The interval which mice are to be polled at.
|
[USBHID] The interval which mice are to be polled at.
|
||||||
|
|
||||||
|
usbhid.jspoll=
|
||||||
|
[USBHID] The interval which joysticks are to be polled at.
|
||||||
|
|
||||||
usb-storage.delay_use=
|
usb-storage.delay_use=
|
||||||
[UMS] The delay in seconds before a new device is
|
[UMS] The delay in seconds before a new device is
|
||||||
scanned for Logical Units (default 1).
|
scanned for Logical Units (default 1).
|
||||||
|
@@ -276,14 +276,14 @@ All md devices contain:
|
|||||||
array creation it will default to 0, though starting the array as
|
array creation it will default to 0, though starting the array as
|
||||||
``clean`` will set it much larger.
|
``clean`` will set it much larger.
|
||||||
|
|
||||||
new_dev
|
new_dev
|
||||||
This file can be written but not read. The value written should
|
This file can be written but not read. The value written should
|
||||||
be a block device number as major:minor. e.g. 8:0
|
be a block device number as major:minor. e.g. 8:0
|
||||||
This will cause that device to be attached to the array, if it is
|
This will cause that device to be attached to the array, if it is
|
||||||
available. It will then appear at md/dev-XXX (depending on the
|
available. It will then appear at md/dev-XXX (depending on the
|
||||||
name of the device) and further configuration is then possible.
|
name of the device) and further configuration is then possible.
|
||||||
|
|
||||||
safe_mode_delay
|
safe_mode_delay
|
||||||
When an md array has seen no write requests for a certain period
|
When an md array has seen no write requests for a certain period
|
||||||
of time, it will be marked as ``clean``. When another write
|
of time, it will be marked as ``clean``. When another write
|
||||||
request arrives, the array is marked as ``dirty`` before the write
|
request arrives, the array is marked as ``dirty`` before the write
|
||||||
@@ -292,7 +292,7 @@ All md devices contain:
|
|||||||
period as a number of seconds. The default is 200msec (0.200).
|
period as a number of seconds. The default is 200msec (0.200).
|
||||||
Writing a value of 0 disables safemode.
|
Writing a value of 0 disables safemode.
|
||||||
|
|
||||||
array_state
|
array_state
|
||||||
This file contains a single word which describes the current
|
This file contains a single word which describes the current
|
||||||
state of the array. In many cases, the state can be set by
|
state of the array. In many cases, the state can be set by
|
||||||
writing the word for the desired state, however some states
|
writing the word for the desired state, however some states
|
||||||
@@ -401,7 +401,30 @@ All md devices contain:
|
|||||||
once the array becomes non-degraded, and this fact has been
|
once the array becomes non-degraded, and this fact has been
|
||||||
recorded in the metadata.
|
recorded in the metadata.
|
||||||
|
|
||||||
|
consistency_policy
|
||||||
|
This indicates how the array maintains consistency in case of unexpected
|
||||||
|
shutdown. It can be:
|
||||||
|
|
||||||
|
none
|
||||||
|
Array has no redundancy information, e.g. raid0, linear.
|
||||||
|
|
||||||
|
resync
|
||||||
|
Full resync is performed and all redundancy is regenerated when the
|
||||||
|
array is started after unclean shutdown.
|
||||||
|
|
||||||
|
bitmap
|
||||||
|
Resync assisted by a write-intent bitmap.
|
||||||
|
|
||||||
|
journal
|
||||||
|
For raid4/5/6, journal device is used to log transactions and replay
|
||||||
|
after unclean shutdown.
|
||||||
|
|
||||||
|
ppl
|
||||||
|
For raid5 only, Partial Parity Log is used to close the write hole and
|
||||||
|
eliminate resync.
|
||||||
|
|
||||||
|
The accepted values when writing to this file are ``ppl`` and ``resync``,
|
||||||
|
used to enable and disable PPL.
|
||||||
|
|
||||||
|
|
||||||
As component devices are added to an md array, they appear in the ``md``
|
As component devices are added to an md array, they appear in the ``md``
|
||||||
@@ -563,6 +586,9 @@ Each directory contains:
|
|||||||
adds bad blocks without acknowledging them. This is largely
|
adds bad blocks without acknowledging them. This is largely
|
||||||
for testing.
|
for testing.
|
||||||
|
|
||||||
|
ppl_sector, ppl_size
|
||||||
|
Location and size (in sectors) of the space used for Partial Parity Log
|
||||||
|
on this device.
|
||||||
|
|
||||||
|
|
||||||
An active md device will also contain an entry for each active device
|
An active md device will also contain an entry for each active device
|
||||||
|
701
Documentation/admin-guide/pm/cpufreq.rst
Normal file
701
Documentation/admin-guide/pm/cpufreq.rst
Normal file
@@ -0,0 +1,701 @@
|
|||||||
|
.. |struct cpufreq_policy| replace:: :c:type:`struct cpufreq_policy <cpufreq_policy>`
|
||||||
|
.. |intel_pstate| replace:: :doc:`intel_pstate <intel_pstate>`
|
||||||
|
|
||||||
|
=======================
|
||||||
|
CPU Performance Scaling
|
||||||
|
=======================
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
Copyright (c) 2017 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||||
|
|
||||||
|
The Concept of CPU Performance Scaling
|
||||||
|
======================================
|
||||||
|
|
||||||
|
The majority of modern processors are capable of operating in a number of
|
||||||
|
different clock frequency and voltage configurations, often referred to as
|
||||||
|
Operating Performance Points or P-states (in ACPI terminology). As a rule,
|
||||||
|
the higher the clock frequency and the higher the voltage, the more instructions
|
||||||
|
can be retired by the CPU over a unit of time, but also the higher the clock
|
||||||
|
frequency and the higher the voltage, the more energy is consumed over a unit of
|
||||||
|
time (or the more power is drawn) by the CPU in the given P-state. Therefore
|
||||||
|
there is a natural tradeoff between the CPU capacity (the number of instructions
|
||||||
|
that can be executed over a unit of time) and the power drawn by the CPU.
|
||||||
|
|
||||||
|
In some situations it is desirable or even necessary to run the program as fast
|
||||||
|
as possible and then there is no reason to use any P-states different from the
|
||||||
|
highest one (i.e. the highest-performance frequency/voltage configuration
|
||||||
|
available). In some other cases, however, it may not be necessary to execute
|
||||||
|
instructions so quickly and maintaining the highest available CPU capacity for a
|
||||||
|
relatively long time without utilizing it entirely may be regarded as wasteful.
|
||||||
|
It also may not be physically possible to maintain maximum CPU capacity for too
|
||||||
|
long for thermal or power supply capacity reasons or similar. To cover those
|
||||||
|
cases, there are hardware interfaces allowing CPUs to be switched between
|
||||||
|
different frequency/voltage configurations or (in the ACPI terminology) to be
|
||||||
|
put into different P-states.
|
||||||
|
|
||||||
|
Typically, they are used along with algorithms to estimate the required CPU
|
||||||
|
capacity, so as to decide which P-states to put the CPUs into. Of course, since
|
||||||
|
the utilization of the system generally changes over time, that has to be done
|
||||||
|
repeatedly on a regular basis. The activity by which this happens is referred
|
||||||
|
to as CPU performance scaling or CPU frequency scaling (because it involves
|
||||||
|
adjusting the CPU clock frequency).
|
||||||
|
|
||||||
|
|
||||||
|
CPU Performance Scaling in Linux
|
||||||
|
================================
|
||||||
|
|
||||||
|
The Linux kernel supports CPU performance scaling by means of the ``CPUFreq``
|
||||||
|
(CPU Frequency scaling) subsystem that consists of three layers of code: the
|
||||||
|
core, scaling governors and scaling drivers.
|
||||||
|
|
||||||
|
The ``CPUFreq`` core provides the common code infrastructure and user space
|
||||||
|
interfaces for all platforms that support CPU performance scaling. It defines
|
||||||
|
the basic framework in which the other components operate.
|
||||||
|
|
||||||
|
Scaling governors implement algorithms to estimate the required CPU capacity.
|
||||||
|
As a rule, each governor implements one, possibly parametrized, scaling
|
||||||
|
algorithm.
|
||||||
|
|
||||||
|
Scaling drivers talk to the hardware. They provide scaling governors with
|
||||||
|
information on the available P-states (or P-state ranges in some cases) and
|
||||||
|
access platform-specific hardware interfaces to change CPU P-states as requested
|
||||||
|
by scaling governors.
|
||||||
|
|
||||||
|
In principle, all available scaling governors can be used with every scaling
|
||||||
|
driver. That design is based on the observation that the information used by
|
||||||
|
performance scaling algorithms for P-state selection can be represented in a
|
||||||
|
platform-independent form in the majority of cases, so it should be possible
|
||||||
|
to use the same performance scaling algorithm implemented in exactly the same
|
||||||
|
way regardless of which scaling driver is used. Consequently, the same set of
|
||||||
|
scaling governors should be suitable for every supported platform.
|
||||||
|
|
||||||
|
However, that observation may not hold for performance scaling algorithms
|
||||||
|
based on information provided by the hardware itself, for example through
|
||||||
|
feedback registers, as that information is typically specific to the hardware
|
||||||
|
interface it comes from and may not be easily represented in an abstract,
|
||||||
|
platform-independent way. For this reason, ``CPUFreq`` allows scaling drivers
|
||||||
|
to bypass the governor layer and implement their own performance scaling
|
||||||
|
algorithms. That is done by the |intel_pstate| scaling driver.
|
||||||
|
|
||||||
|
|
||||||
|
``CPUFreq`` Policy Objects
|
||||||
|
==========================
|
||||||
|
|
||||||
|
In some cases the hardware interface for P-state control is shared by multiple
|
||||||
|
CPUs. That is, for example, the same register (or set of registers) is used to
|
||||||
|
control the P-state of multiple CPUs at the same time and writing to it affects
|
||||||
|
all of those CPUs simultaneously.
|
||||||
|
|
||||||
|
Sets of CPUs sharing hardware P-state control interfaces are represented by
|
||||||
|
``CPUFreq`` as |struct cpufreq_policy| objects. For consistency,
|
||||||
|
|struct cpufreq_policy| is also used when there is only one CPU in the given
|
||||||
|
set.
|
||||||
|
|
||||||
|
The ``CPUFreq`` core maintains a pointer to a |struct cpufreq_policy| object for
|
||||||
|
every CPU in the system, including CPUs that are currently offline. If multiple
|
||||||
|
CPUs share the same hardware P-state control interface, all of the pointers
|
||||||
|
corresponding to them point to the same |struct cpufreq_policy| object.
|
||||||
|
|
||||||
|
``CPUFreq`` uses |struct cpufreq_policy| as its basic data type and the design
|
||||||
|
of its user space interface is based on the policy concept.
|
||||||
|
|
||||||
|
|
||||||
|
CPU Initialization
|
||||||
|
==================
|
||||||
|
|
||||||
|
First of all, a scaling driver has to be registered for ``CPUFreq`` to work.
|
||||||
|
It is only possible to register one scaling driver at a time, so the scaling
|
||||||
|
driver is expected to be able to handle all CPUs in the system.
|
||||||
|
|
||||||
|
The scaling driver may be registered before or after CPU registration. If
|
||||||
|
CPUs are registered earlier, the driver core invokes the ``CPUFreq`` core to
|
||||||
|
take a note of all of the already registered CPUs during the registration of the
|
||||||
|
scaling driver. In turn, if any CPUs are registered after the registration of
|
||||||
|
the scaling driver, the ``CPUFreq`` core will be invoked to take note of them
|
||||||
|
at their registration time.
|
||||||
|
|
||||||
|
In any case, the ``CPUFreq`` core is invoked to take note of any logical CPU it
|
||||||
|
has not seen so far as soon as it is ready to handle that CPU. [Note that the
|
||||||
|
logical CPU may be a physical single-core processor, or a single core in a
|
||||||
|
multicore processor, or a hardware thread in a physical processor or processor
|
||||||
|
core. In what follows "CPU" always means "logical CPU" unless explicitly stated
|
||||||
|
otherwise and the word "processor" is used to refer to the physical part
|
||||||
|
possibly including multiple logical CPUs.]
|
||||||
|
|
||||||
|
Once invoked, the ``CPUFreq`` core checks if the policy pointer is already set
|
||||||
|
for the given CPU and if so, it skips the policy object creation. Otherwise,
|
||||||
|
a new policy object is created and initialized, which involves the creation of
|
||||||
|
a new policy directory in ``sysfs``, and the policy pointer corresponding to
|
||||||
|
the given CPU is set to the new policy object's address in memory.
|
||||||
|
|
||||||
|
Next, the scaling driver's ``->init()`` callback is invoked with the policy
|
||||||
|
pointer of the new CPU passed to it as the argument. That callback is expected
|
||||||
|
to initialize the performance scaling hardware interface for the given CPU (or,
|
||||||
|
more precisely, for the set of CPUs sharing the hardware interface it belongs
|
||||||
|
to, represented by its policy object) and, if the policy object it has been
|
||||||
|
called for is new, to set parameters of the policy, like the minimum and maximum
|
||||||
|
frequencies supported by the hardware, the table of available frequencies (if
|
||||||
|
the set of supported P-states is not a continuous range), and the mask of CPUs
|
||||||
|
that belong to the same policy (including both online and offline CPUs). That
|
||||||
|
mask is then used by the core to populate the policy pointers for all of the
|
||||||
|
CPUs in it.
|
||||||
|
|
||||||
|
The next major initialization step for a new policy object is to attach a
|
||||||
|
scaling governor to it (to begin with, that is the default scaling governor
|
||||||
|
determined by the kernel configuration, but it may be changed later
|
||||||
|
via ``sysfs``). First, a pointer to the new policy object is passed to the
|
||||||
|
governor's ``->init()`` callback which is expected to initialize all of the
|
||||||
|
data structures necessary to handle the given policy and, possibly, to add
|
||||||
|
a governor ``sysfs`` interface to it. Next, the governor is started by
|
||||||
|
invoking its ``->start()`` callback.
|
||||||
|
|
||||||
|
That callback is expected to register per-CPU utilization update callbacks for
|
||||||
|
all of the online CPUs belonging to the given policy with the CPU scheduler.
|
||||||
|
The utilization update callbacks will be invoked by the CPU scheduler on
|
||||||
|
important events, like task enqueue and dequeue, on every iteration of the
|
||||||
|
scheduler tick or generally whenever the CPU utilization may change (from the
|
||||||
|
scheduler's perspective). They are expected to carry out computations needed
|
||||||
|
to determine the P-state to use for the given policy going forward and to
|
||||||
|
invoke the scaling driver to make changes to the hardware in accordance with
|
||||||
|
the P-state selection. The scaling driver may be invoked directly from
|
||||||
|
scheduler context or asynchronously, via a kernel thread or workqueue, depending
|
||||||
|
on the configuration and capabilities of the scaling driver and the governor.
|
||||||
|
|
||||||
|
Similar steps are taken for policy objects that are not new, but were "inactive"
|
||||||
|
previously, meaning that all of the CPUs belonging to them were offline. The
|
||||||
|
only practical difference in that case is that the ``CPUFreq`` core will attempt
|
||||||
|
to use the scaling governor previously used with the policy that became
|
||||||
|
"inactive" (and is re-initialized now) instead of the default governor.
|
||||||
|
|
||||||
|
In turn, if a previously offline CPU is being brought back online, but some
|
||||||
|
other CPUs sharing the policy object with it are online already, there is no
|
||||||
|
need to re-initialize the policy object at all. In that case, it only is
|
||||||
|
necessary to restart the scaling governor so that it can take the new online CPU
|
||||||
|
into account. That is achieved by invoking the governor's ``->stop()`` and
|
||||||
|
``->start()`` callbacks, in this order, for the entire policy.
|
||||||
|
|
||||||
|
As mentioned before, the |intel_pstate| scaling driver bypasses the scaling
|
||||||
|
governor layer of ``CPUFreq`` and provides its own P-state selection algorithms.
|
||||||
|
Consequently, if |intel_pstate| is used, scaling governors are not attached to
|
||||||
|
new policy objects. Instead, the driver's ``->setpolicy()`` callback is invoked
|
||||||
|
to register per-CPU utilization update callbacks for each policy. These
|
||||||
|
callbacks are invoked by the CPU scheduler in the same way as for scaling
|
||||||
|
governors, but in the |intel_pstate| case they both determine the P-state to
|
||||||
|
use and change the hardware configuration accordingly in one go from scheduler
|
||||||
|
context.
|
||||||
|
|
||||||
|
The policy objects created during CPU initialization and other data structures
|
||||||
|
associated with them are torn down when the scaling driver is unregistered
|
||||||
|
(which happens when the kernel module containing it is unloaded, for example) or
|
||||||
|
when the last CPU belonging to the given policy is unregistered.
|
||||||
|
|
||||||
|
|
||||||
|
Policy Interface in ``sysfs``
|
||||||
|
=============================
|
||||||
|
|
||||||
|
During the initialization of the kernel, the ``CPUFreq`` core creates a
|
||||||
|
``sysfs`` directory (kobject) called ``cpufreq`` under
|
||||||
|
:file:`/sys/devices/system/cpu/`.
|
||||||
|
|
||||||
|
That directory contains a ``policyX`` subdirectory (where ``X`` represents an
|
||||||
|
integer number) for every policy object maintained by the ``CPUFreq`` core.
|
||||||
|
Each ``policyX`` directory is pointed to by ``cpufreq`` symbolic links
|
||||||
|
under :file:`/sys/devices/system/cpu/cpuY/` (where ``Y`` represents an integer
|
||||||
|
that may be different from the one represented by ``X``) for all of the CPUs
|
||||||
|
associated with (or belonging to) the given policy. The ``policyX`` directories
|
||||||
|
in :file:`/sys/devices/system/cpu/cpufreq` each contain policy-specific
|
||||||
|
attributes (files) to control ``CPUFreq`` behavior for the corresponding policy
|
||||||
|
objects (that is, for all of the CPUs associated with them).
|
||||||
|
|
||||||
|
Some of those attributes are generic. They are created by the ``CPUFreq`` core
|
||||||
|
and their behavior generally does not depend on what scaling driver is in use
|
||||||
|
and what scaling governor is attached to the given policy. Some scaling drivers
|
||||||
|
also add driver-specific attributes to the policy directories in ``sysfs`` to
|
||||||
|
control policy-specific aspects of driver behavior.
|
||||||
|
|
||||||
|
The generic attributes under :file:`/sys/devices/system/cpu/cpufreq/policyX/`
|
||||||
|
are the following:
|
||||||
|
|
||||||
|
``affected_cpus``
|
||||||
|
List of online CPUs belonging to this policy (i.e. sharing the hardware
|
||||||
|
performance scaling interface represented by the ``policyX`` policy
|
||||||
|
object).
|
||||||
|
|
||||||
|
``bios_limit``
|
||||||
|
If the platform firmware (BIOS) tells the OS to apply an upper limit to
|
||||||
|
CPU frequencies, that limit will be reported through this attribute (if
|
||||||
|
present).
|
||||||
|
|
||||||
|
The existence of the limit may be a result of some (often unintentional)
|
||||||
|
BIOS settings, restrictions coming from a service processor or other
|
||||||
|
BIOS/HW-based mechanisms.
|
||||||
|
|
||||||
|
This does not cover ACPI thermal limitations which can be discovered
|
||||||
|
through a generic thermal driver.
|
||||||
|
|
||||||
|
This attribute is not present if the scaling driver in use does not
|
||||||
|
support it.
|
||||||
|
|
||||||
|
``cpuinfo_max_freq``
|
||||||
|
Maximum possible operating frequency the CPUs belonging to this policy
|
||||||
|
can run at (in kHz).
|
||||||
|
|
||||||
|
``cpuinfo_min_freq``
|
||||||
|
Minimum possible operating frequency the CPUs belonging to this policy
|
||||||
|
can run at (in kHz).
|
||||||
|
|
||||||
|
``cpuinfo_transition_latency``
|
||||||
|
The time it takes to switch the CPUs belonging to this policy from one
|
||||||
|
P-state to another, in nanoseconds.
|
||||||
|
|
||||||
|
If unknown or if known to be so high that the scaling driver does not
|
||||||
|
work with the `ondemand`_ governor, -1 (:c:macro:`CPUFREQ_ETERNAL`)
|
||||||
|
will be returned by reads from this attribute.
|
||||||
|
|
||||||
|
``related_cpus``
|
||||||
|
List of all (online and offline) CPUs belonging to this policy.
|
||||||
|
|
||||||
|
``scaling_available_governors``
|
||||||
|
List of ``CPUFreq`` scaling governors present in the kernel that can
|
||||||
|
be attached to this policy or (if the |intel_pstate| scaling driver is
|
||||||
|
in use) list of scaling algorithms provided by the driver that can be
|
||||||
|
applied to this policy.
|
||||||
|
|
||||||
|
[Note that some governors are modular and it may be necessary to load a
|
||||||
|
kernel module for the governor held by it to become available and be
|
||||||
|
listed by this attribute.]
|
||||||
|
|
||||||
|
``scaling_cur_freq``
|
||||||
|
Current frequency of all of the CPUs belonging to this policy (in kHz).
|
||||||
|
|
||||||
|
For the majority of scaling drivers, this is the frequency of the last
|
||||||
|
P-state requested by the driver from the hardware using the scaling
|
||||||
|
interface provided by it, which may or may not reflect the frequency
|
||||||
|
the CPU is actually running at (due to hardware design and other
|
||||||
|
limitations).
|
||||||
|
|
||||||
|
Some scaling drivers (e.g. |intel_pstate|) attempt to provide
|
||||||
|
information more precisely reflecting the current CPU frequency through
|
||||||
|
this attribute, but that still may not be the exact current CPU
|
||||||
|
frequency as seen by the hardware at the moment.
|
||||||
|
|
||||||
|
``scaling_driver``
|
||||||
|
The scaling driver currently in use.
|
||||||
|
|
||||||
|
``scaling_governor``
|
||||||
|
The scaling governor currently attached to this policy or (if the
|
||||||
|
|intel_pstate| scaling driver is in use) the scaling algorithm
|
||||||
|
provided by the driver that is currently applied to this policy.
|
||||||
|
|
||||||
|
This attribute is read-write and writing to it will cause a new scaling
|
||||||
|
governor to be attached to this policy or a new scaling algorithm
|
||||||
|
provided by the scaling driver to be applied to it (in the
|
||||||
|
|intel_pstate| case), as indicated by the string written to this
|
||||||
|
attribute (which must be one of the names listed by the
|
||||||
|
``scaling_available_governors`` attribute described above).
|
||||||
|
|
||||||
|
``scaling_max_freq``
|
||||||
|
Maximum frequency the CPUs belonging to this policy are allowed to be
|
||||||
|
running at (in kHz).
|
||||||
|
|
||||||
|
This attribute is read-write and writing a string representing an
|
||||||
|
integer to it will cause a new limit to be set (it must not be lower
|
||||||
|
than the value of the ``scaling_min_freq`` attribute).
|
||||||
|
|
||||||
|
``scaling_min_freq``
|
||||||
|
Minimum frequency the CPUs belonging to this policy are allowed to be
|
||||||
|
running at (in kHz).
|
||||||
|
|
||||||
|
This attribute is read-write and writing a string representing a
|
||||||
|
non-negative integer to it will cause a new limit to be set (it must not
|
||||||
|
be higher than the value of the ``scaling_max_freq`` attribute).
|
||||||
|
|
||||||
|
``scaling_setspeed``
|
||||||
|
This attribute is functional only if the `userspace`_ scaling governor
|
||||||
|
is attached to the given policy.
|
||||||
|
|
||||||
|
It returns the last frequency requested by the governor (in kHz) or can
|
||||||
|
be written to in order to set a new frequency for the policy.
|
||||||
|
|
||||||
|
|
||||||
|
Generic Scaling Governors
|
||||||
|
=========================
|
||||||
|
|
||||||
|
``CPUFreq`` provides generic scaling governors that can be used with all
|
||||||
|
scaling drivers. As stated before, each of them implements a single, possibly
|
||||||
|
parametrized, performance scaling algorithm.
|
||||||
|
|
||||||
|
Scaling governors are attached to policy objects and different policy objects
|
||||||
|
can be handled by different scaling governors at the same time (although that
|
||||||
|
may lead to suboptimal results in some cases).
|
||||||
|
|
||||||
|
The scaling governor for a given policy object can be changed at any time with
|
||||||
|
the help of the ``scaling_governor`` policy attribute in ``sysfs``.
|
||||||
|
|
||||||
|
Some governors expose ``sysfs`` attributes to control or fine-tune the scaling
|
||||||
|
algorithms implemented by them. Those attributes, referred to as governor
|
||||||
|
tunables, can be either global (system-wide) or per-policy, depending on the
|
||||||
|
scaling driver in use. If the driver requires governor tunables to be
|
||||||
|
per-policy, they are located in a subdirectory of each policy directory.
|
||||||
|
Otherwise, they are located in a subdirectory under
|
||||||
|
:file:`/sys/devices/system/cpu/cpufreq/`. In either case the name of the
|
||||||
|
subdirectory containing the governor tunables is the name of the governor
|
||||||
|
providing them.
|
||||||
|
|
||||||
|
``performance``
|
||||||
|
---------------
|
||||||
|
|
||||||
|
When attached to a policy object, this governor causes the highest frequency,
|
||||||
|
within the ``scaling_max_freq`` policy limit, to be requested for that policy.
|
||||||
|
|
||||||
|
The request is made once at the time the governor for the policy is set to
|
||||||
|
``performance`` and whenever the ``scaling_max_freq`` or ``scaling_min_freq``
|
||||||
|
policy limits change after that.
|
||||||
|
|
||||||
|
``powersave``
|
||||||
|
-------------
|
||||||
|
|
||||||
|
When attached to a policy object, this governor causes the lowest frequency,
|
||||||
|
within the ``scaling_min_freq`` policy limit, to be requested for that policy.
|
||||||
|
|
||||||
|
The request is made once at the time the governor for the policy is set to
|
||||||
|
``powersave`` and whenever the ``scaling_max_freq`` or ``scaling_min_freq``
|
||||||
|
policy limits change after that.
|
||||||
|
|
||||||
|
``userspace``
|
||||||
|
-------------
|
||||||
|
|
||||||
|
This governor does not do anything by itself. Instead, it allows user space
|
||||||
|
to set the CPU frequency for the policy it is attached to by writing to the
|
||||||
|
``scaling_setspeed`` attribute of that policy.
|
||||||
|
|
||||||
|
``schedutil``
|
||||||
|
-------------
|
||||||
|
|
||||||
|
This governor uses CPU utilization data available from the CPU scheduler. It
|
||||||
|
generally is regarded as a part of the CPU scheduler, so it can access the
|
||||||
|
scheduler's internal data structures directly.
|
||||||
|
|
||||||
|
It runs entirely in scheduler context, although in some cases it may need to
|
||||||
|
invoke the scaling driver asynchronously when it decides that the CPU frequency
|
||||||
|
should be changed for a given policy (that depends on whether or not the driver
|
||||||
|
is capable of changing the CPU frequency from scheduler context).
|
||||||
|
|
||||||
|
The actions of this governor for a particular CPU depend on the scheduling class
|
||||||
|
invoking its utilization update callback for that CPU. If it is invoked by the
|
||||||
|
RT or deadline scheduling classes, the governor will increase the frequency to
|
||||||
|
the allowed maximum (that is, the ``scaling_max_freq`` policy limit). In turn,
|
||||||
|
if it is invoked by the CFS scheduling class, the governor will use the
|
||||||
|
Per-Entity Load Tracking (PELT) metric for the root control group of the
|
||||||
|
given CPU as the CPU utilization estimate (see the `Per-entity load tracking`_
|
||||||
|
LWN.net article for a description of the PELT mechanism). Then, the new
|
||||||
|
CPU frequency to apply is computed in accordance with the formula
|
||||||
|
|
||||||
|
f = 1.25 * ``f_0`` * ``util`` / ``max``
|
||||||
|
|
||||||
|
where ``util`` is the PELT number, ``max`` is the theoretical maximum of
|
||||||
|
``util``, and ``f_0`` is either the maximum possible CPU frequency for the given
|
||||||
|
policy (if the PELT number is frequency-invariant), or the current CPU frequency
|
||||||
|
(otherwise).
|
||||||
|
|
||||||
|
This governor also employs a mechanism allowing it to temporarily bump up the
|
||||||
|
CPU frequency for tasks that have been waiting on I/O most recently, called
|
||||||
|
"IO-wait boosting". That happens when the :c:macro:`SCHED_CPUFREQ_IOWAIT` flag
|
||||||
|
is passed by the scheduler to the governor callback which causes the frequency
|
||||||
|
to go up to the allowed maximum immediately and then draw back to the value
|
||||||
|
returned by the above formula over time.
|
||||||
|
|
||||||
|
This governor exposes only one tunable:
|
||||||
|
|
||||||
|
``rate_limit_us``
|
||||||
|
Minimum time (in microseconds) that has to pass between two consecutive
|
||||||
|
runs of governor computations (default: 1000 times the scaling driver's
|
||||||
|
transition latency).
|
||||||
|
|
||||||
|
The purpose of this tunable is to reduce the scheduler context overhead
|
||||||
|
of the governor which might be excessive without it.
|
||||||
|
|
||||||
|
This governor generally is regarded as a replacement for the older `ondemand`_
|
||||||
|
and `conservative`_ governors (described below), as it is simpler and more
|
||||||
|
tightly integrated with the CPU scheduler, its overhead in terms of CPU context
|
||||||
|
switches and similar is less significant, and it uses the scheduler's own CPU
|
||||||
|
utilization metric, so in principle its decisions should not contradict the
|
||||||
|
decisions made by the other parts of the scheduler.
|
||||||
|
|
||||||
|
``ondemand``
|
||||||
|
------------
|
||||||
|
|
||||||
|
This governor uses CPU load as a CPU frequency selection metric.
|
||||||
|
|
||||||
|
In order to estimate the current CPU load, it measures the time elapsed between
|
||||||
|
consecutive invocations of its worker routine and computes the fraction of that
|
||||||
|
time in which the given CPU was not idle. The ratio of the non-idle (active)
|
||||||
|
time to the total CPU time is taken as an estimate of the load.
|
||||||
|
|
||||||
|
If this governor is attached to a policy shared by multiple CPUs, the load is
|
||||||
|
estimated for all of them and the greatest result is taken as the load estimate
|
||||||
|
for the entire policy.
|
||||||
|
|
||||||
|
The worker routine of this governor has to run in process context, so it is
|
||||||
|
invoked asynchronously (via a workqueue) and CPU P-states are updated from
|
||||||
|
there if necessary. As a result, the scheduler context overhead from this
|
||||||
|
governor is minimal, but it causes additional CPU context switches to happen
|
||||||
|
relatively often and the CPU P-state updates triggered by it can be relatively
|
||||||
|
irregular. Also, it affects its own CPU load metric by running code that
|
||||||
|
reduces the CPU idle time (even though the CPU idle time is only reduced very
|
||||||
|
slightly by it).
|
||||||
|
|
||||||
|
It generally selects CPU frequencies proportional to the estimated load, so that
|
||||||
|
the value of the ``cpuinfo_max_freq`` policy attribute corresponds to the load of
|
||||||
|
1 (or 100%), and the value of the ``cpuinfo_min_freq`` policy attribute
|
||||||
|
corresponds to the load of 0, unless the load exceeds a (configurable)
|
||||||
|
speedup threshold, in which case it will go straight for the highest frequency
|
||||||
|
it is allowed to use (the ``scaling_max_freq`` policy limit).
|
||||||
|
|
||||||
|
This governor exposes the following tunables:
|
||||||
|
|
||||||
|
``sampling_rate``
|
||||||
|
This is how often the governor's worker routine should run, in
|
||||||
|
microseconds.
|
||||||
|
|
||||||
|
Typically, it is set to values of the order of 10000 (10 ms). Its
|
||||||
|
default value is equal to the value of ``cpuinfo_transition_latency``
|
||||||
|
for each policy this governor is attached to (but since the unit here
|
||||||
|
is greater by 1000, this means that the time represented by
|
||||||
|
``sampling_rate`` is 1000 times greater than the transition latency by
|
||||||
|
default).
|
||||||
|
|
||||||
|
If this tunable is per-policy, the following shell command sets the time
|
||||||
|
represented by it to be 750 times as high as the transition latency::
|
||||||
|
|
||||||
|
# echo $(($(cat cpuinfo_transition_latency) * 750 / 1000)) > ondemand/sampling_rate
|
||||||
|
|
||||||
|
|
||||||
|
``min_sampling_rate``
|
||||||
|
The minimum value of ``sampling_rate``.
|
||||||
|
|
||||||
|
Equal to 10000 (10 ms) if :c:macro:`CONFIG_NO_HZ_COMMON` and
|
||||||
|
:c:data:`tick_nohz_active` are both set or to 20 times the value of
|
||||||
|
:c:data:`jiffies` in microseconds otherwise.
|
||||||
|
|
||||||
|
``up_threshold``
|
||||||
|
If the estimated CPU load is above this value (in percent), the governor
|
||||||
|
will set the frequency to the maximum value allowed for the policy.
|
||||||
|
Otherwise, the selected frequency will be proportional to the estimated
|
||||||
|
CPU load.
|
||||||
|
|
||||||
|
``ignore_nice_load``
|
||||||
|
If set to 1 (default 0), it will cause the CPU load estimation code to
|
||||||
|
treat the CPU time spent on executing tasks with "nice" levels greater
|
||||||
|
than 0 as CPU idle time.
|
||||||
|
|
||||||
|
This may be useful if there are tasks in the system that should not be
|
||||||
|
taken into account when deciding what frequency to run the CPUs at.
|
||||||
|
Then, to make that happen it is sufficient to increase the "nice" level
|
||||||
|
of those tasks above 0 and set this attribute to 1.
|
||||||
|
|
||||||
|
``sampling_down_factor``
|
||||||
|
Temporary multiplier, between 1 (default) and 100 inclusive, to apply to
|
||||||
|
the ``sampling_rate`` value if the CPU load goes above ``up_threshold``.
|
||||||
|
|
||||||
|
This causes the next execution of the governor's worker routine (after
|
||||||
|
setting the frequency to the allowed maximum) to be delayed, so the
|
||||||
|
frequency stays at the maximum level for a longer time.
|
||||||
|
|
||||||
|
Frequency fluctuations in some bursty workloads may be avoided this way
|
||||||
|
at the cost of additional energy spent on maintaining the maximum CPU
|
||||||
|
capacity.
|
||||||
|
|
||||||
|
``powersave_bias``
|
||||||
|
Reduction factor to apply to the original frequency target of the
|
||||||
|
governor (including the maximum value used when the ``up_threshold``
|
||||||
|
value is exceeded by the estimated CPU load) or sensitivity threshold
|
||||||
|
for the AMD frequency sensitivity powersave bias driver
|
||||||
|
(:file:`drivers/cpufreq/amd_freq_sensitivity.c`), between 0 and 1000
|
||||||
|
inclusive.
|
||||||
|
|
||||||
|
If the AMD frequency sensitivity powersave bias driver is not loaded,
|
||||||
|
the effective frequency to apply is given by
|
||||||
|
|
||||||
|
f * (1 - ``powersave_bias`` / 1000)
|
||||||
|
|
||||||
|
where f is the governor's original frequency target. The default value
|
||||||
|
of this attribute is 0 in that case.
|
||||||
|
|
||||||
|
If the AMD frequency sensitivity powersave bias driver is loaded, the
|
||||||
|
value of this attribute is 400 by default and it is used in a different
|
||||||
|
way.
|
||||||
|
|
||||||
|
On Family 16h (and later) AMD processors there is a mechanism to get a
|
||||||
|
measured workload sensitivity, between 0 and 100% inclusive, from the
|
||||||
|
hardware. That value can be used to estimate how the performance of the
|
||||||
|
workload running on a CPU will change in response to frequency changes.
|
||||||
|
|
||||||
|
The performance of a workload with the sensitivity of 0 (memory-bound or
|
||||||
|
IO-bound) is not expected to increase at all as a result of increasing
|
||||||
|
the CPU frequency, whereas workloads with the sensitivity of 100%
|
||||||
|
(CPU-bound) are expected to perform much better if the CPU frequency is
|
||||||
|
increased.
|
||||||
|
|
||||||
|
If the workload sensitivity is less than the threshold represented by
|
||||||
|
the ``powersave_bias`` value, the sensitivity powersave bias driver
|
||||||
|
will cause the governor to select a frequency lower than its original
|
||||||
|
target, so as to avoid over-provisioning workloads that will not benefit
|
||||||
|
from running at higher CPU frequencies.
|
||||||
|
|
||||||
|
``conservative``
|
||||||
|
----------------
|
||||||
|
|
||||||
|
This governor uses CPU load as a CPU frequency selection metric.
|
||||||
|
|
||||||
|
It estimates the CPU load in the same way as the `ondemand`_ governor described
|
||||||
|
above, but the CPU frequency selection algorithm implemented by it is different.
|
||||||
|
|
||||||
|
Namely, it avoids changing the frequency significantly over short time intervals
|
||||||
|
which may not be suitable for systems with limited power supply capacity (e.g.
|
||||||
|
battery-powered). To achieve that, it changes the frequency in relatively
|
||||||
|
small steps, one step at a time, up or down - depending on whether or not a
|
||||||
|
(configurable) threshold has been exceeded by the estimated CPU load.
|
||||||
|
|
||||||
|
This governor exposes the following tunables:
|
||||||
|
|
||||||
|
``freq_step``
|
||||||
|
Frequency step in percent of the maximum frequency the governor is
|
||||||
|
allowed to set (the ``scaling_max_freq`` policy limit), between 0 and
|
||||||
|
100 (5 by default).
|
||||||
|
|
||||||
|
This is how much the frequency is allowed to change in one go. Setting
|
||||||
|
it to 0 will cause the default frequency step (5 percent) to be used
|
||||||
|
and setting it to 100 effectively causes the governor to periodically
|
||||||
|
switch the frequency between the ``scaling_min_freq`` and
|
||||||
|
``scaling_max_freq`` policy limits.
|
||||||
|
|
||||||
|
``down_threshold``
|
||||||
|
Threshold value (in percent, 20 by default) used to determine the
|
||||||
|
frequency change direction.
|
||||||
|
|
||||||
|
If the estimated CPU load is greater than this value, the frequency will
|
||||||
|
go up (by ``freq_step``). If the load is less than this value (and the
|
||||||
|
``sampling_down_factor`` mechanism is not in effect), the frequency will
|
||||||
|
go down. Otherwise, the frequency will not be changed.
|
||||||
|
|
||||||
|
``sampling_down_factor``
|
||||||
|
Frequency decrease deferral factor, between 1 (default) and 10
|
||||||
|
inclusive.
|
||||||
|
|
||||||
|
It effectively causes the frequency to go down ``sampling_down_factor``
|
||||||
|
times slower than it ramps up.
|
||||||
|
|
||||||
|
|
||||||
|
Frequency Boost Support
|
||||||
|
=======================
|
||||||
|
|
||||||
|
Background
|
||||||
|
----------
|
||||||
|
|
||||||
|
Some processors support a mechanism to raise the operating frequency of some
|
||||||
|
cores in a multicore package temporarily (and above the sustainable frequency
|
||||||
|
threshold for the whole package) under certain conditions, for example if the
|
||||||
|
whole chip is not fully utilized and below its intended thermal or power budget.
|
||||||
|
|
||||||
|
Different names are used by different vendors to refer to this functionality.
|
||||||
|
For Intel processors it is referred to as "Turbo Boost", AMD calls it
|
||||||
|
"Turbo-Core" or (in technical documentation) "Core Performance Boost" and so on.
|
||||||
|
As a rule, it also is implemented differently by different vendors. The simple
|
||||||
|
term "frequency boost" is used here for brevity to refer to all of those
|
||||||
|
implementations.
|
||||||
|
|
||||||
|
The frequency boost mechanism may be either hardware-based or software-based.
|
||||||
|
If it is hardware-based (e.g. on x86), the decision to trigger the boosting is
|
||||||
|
made by the hardware (although in general it requires the hardware to be put
|
||||||
|
into a special state in which it can control the CPU frequency within certain
|
||||||
|
limits). If it is software-based (e.g. on ARM), the scaling driver decides
|
||||||
|
whether or not to trigger boosting and when to do that.
|
||||||
|
|
||||||
|
The ``boost`` File in ``sysfs``
|
||||||
|
-------------------------------
|
||||||
|
|
||||||
|
This file is located under :file:`/sys/devices/system/cpu/cpufreq/` and controls
|
||||||
|
the "boost" setting for the whole system. It is not present if the underlying
|
||||||
|
scaling driver does not support the frequency boost mechanism (or supports it,
|
||||||
|
but provides a driver-specific interface for controlling it, like
|
||||||
|
|intel_pstate|).
|
||||||
|
|
||||||
|
If the value in this file is 1, the frequency boost mechanism is enabled. This
|
||||||
|
means that either the hardware can be put into states in which it is able to
|
||||||
|
trigger boosting (in the hardware-based case), or the software is allowed to
|
||||||
|
trigger boosting (in the software-based case). It does not mean that boosting
|
||||||
|
is actually in use at the moment on any CPUs in the system. It only means a
|
||||||
|
permission to use the frequency boost mechanism (which still may never be used
|
||||||
|
for other reasons).
|
||||||
|
|
||||||
|
If the value in this file is 0, the frequency boost mechanism is disabled and
|
||||||
|
cannot be used at all.
|
||||||
|
|
||||||
|
The only values that can be written to this file are 0 and 1.
|
||||||
|
|
||||||
|
Rationale for Boost Control Knob
|
||||||
|
--------------------------------
|
||||||
|
|
||||||
|
The frequency boost mechanism is generally intended to help to achieve optimum
|
||||||
|
CPU performance on time scales below software resolution (e.g. below the
|
||||||
|
scheduler tick interval) and it is demonstrably suitable for many workloads, but
|
||||||
|
it may lead to problems in certain situations.
|
||||||
|
|
||||||
|
For this reason, many systems make it possible to disable the frequency boost
|
||||||
|
mechanism in the platform firmware (BIOS) setup, but that requires the system to
|
||||||
|
be restarted for the setting to be adjusted as desired, which may not be
|
||||||
|
practical at least in some cases. For example:
|
||||||
|
|
||||||
|
1. Boosting means overclocking the processor, although under controlled
|
||||||
|
conditions. Generally, the processor's energy consumption increases
|
||||||
|
as a result of increasing its frequency and voltage, even temporarily.
|
||||||
|
That may not be desirable on systems that switch to power sources of
|
||||||
|
limited capacity, such as batteries, so the ability to disable the boost
|
||||||
|
mechanism while the system is running may help there (but that depends on
|
||||||
|
the workload too).
|
||||||
|
|
||||||
|
2. In some situations deterministic behavior is more important than
|
||||||
|
performance or energy consumption (or both) and the ability to disable
|
||||||
|
boosting while the system is running may be useful then.
|
||||||
|
|
||||||
|
3. To examine the impact of the frequency boost mechanism itself, it is useful
|
||||||
|
to be able to run tests with and without boosting, preferably without
|
||||||
|
restarting the system in the meantime.
|
||||||
|
|
||||||
|
4. Reproducible results are important when running benchmarks. Since
|
||||||
|
the boosting functionality depends on the load of the whole package,
|
||||||
|
single-thread performance may vary because of it, which may lead to
|
||||||
|
unreproducible results sometimes. That can be avoided by disabling the
|
||||||
|
frequency boost mechanism before running benchmarks sensitive to that
|
||||||
|
issue.
|
||||||
|
|
||||||
|
Legacy AMD ``cpb`` Knob
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
The AMD powernow-k8 scaling driver supports a ``sysfs`` knob very similar to
|
||||||
|
the global ``boost`` one. It is used for disabling/enabling the "Core
|
||||||
|
Performance Boost" feature of some AMD processors.
|
||||||
|
|
||||||
|
If present, that knob is located in every ``CPUFreq`` policy directory in
|
||||||
|
``sysfs`` (:file:`/sys/devices/system/cpu/cpufreq/policyX/`) and is called
|
||||||
|
``cpb``, which indicates a more fine-grained control interface. The actual
|
||||||
|
implementation, however, works on the system-wide basis and setting that knob
|
||||||
|
for one policy causes the same value of it to be set for all of the other
|
||||||
|
policies at the same time.
|
||||||
|
|
||||||
|
That knob is still supported on AMD processors that support its underlying
|
||||||
|
hardware feature, but it may be configured out of the kernel (via the
|
||||||
|
:c:macro:`CONFIG_X86_ACPI_CPUFREQ_CPB` configuration option) and the global
|
||||||
|
``boost`` knob is present regardless. Thus it is always possible to use the
|
||||||
|
``boost`` knob instead of the ``cpb`` one which is highly recommended, as that
|
||||||
|
is more consistent with what all of the other systems do (and the ``cpb`` knob
|
||||||
|
may not be supported any more in the future).
|
||||||
|
|
||||||
|
The ``cpb`` knob is never present for any processors without the underlying
|
||||||
|
hardware feature (e.g. all Intel ones), even if the
|
||||||
|
:c:macro:`CONFIG_X86_ACPI_CPUFREQ_CPB` configuration option is set.
|
||||||
|
|
||||||
|
|
||||||
|
.. _Per-entity load tracking: https://lwn.net/Articles/531853/
|
16
Documentation/admin-guide/pm/index.rst
Normal file
16
Documentation/admin-guide/pm/index.rst
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
================
|
||||||
|
Power Management
|
||||||
|
================
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 2
|
||||||
|
|
||||||
|
cpufreq
|
||||||
|
intel_pstate
|
||||||
|
|
||||||
|
.. only:: subproject and html
|
||||||
|
|
||||||
|
Indices
|
||||||
|
=======
|
||||||
|
|
||||||
|
* :ref:`genindex`
|
755
Documentation/admin-guide/pm/intel_pstate.rst
Normal file
755
Documentation/admin-guide/pm/intel_pstate.rst
Normal file
@@ -0,0 +1,755 @@
|
|||||||
|
===============================================
|
||||||
|
``intel_pstate`` CPU Performance Scaling Driver
|
||||||
|
===============================================
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
Copyright (c) 2017 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||||
|
|
||||||
|
|
||||||
|
General Information
|
||||||
|
===================
|
||||||
|
|
||||||
|
``intel_pstate`` is a part of the
|
||||||
|
:doc:`CPU performance scaling subsystem <cpufreq>` in the Linux kernel
|
||||||
|
(``CPUFreq``). It is a scaling driver for the Sandy Bridge and later
|
||||||
|
generations of Intel processors. Note, however, that some of those processors
|
||||||
|
may not be supported. [To understand ``intel_pstate`` it is necessary to know
|
||||||
|
how ``CPUFreq`` works in general, so this is the time to read :doc:`cpufreq` if
|
||||||
|
you have not done that yet.]
|
||||||
|
|
||||||
|
For the processors supported by ``intel_pstate``, the P-state concept is broader
|
||||||
|
than just an operating frequency or an operating performance point (see the
|
||||||
|
`LinuxCon Europe 2015 presentation by Kristen Accardi <LCEU2015_>`_ for more
|
||||||
|
information about that). For this reason, the representation of P-states used
|
||||||
|
by ``intel_pstate`` internally follows the hardware specification (for details
|
||||||
|
refer to `Intel® 64 and IA-32 Architectures Software Developer’s Manual
|
||||||
|
Volume 3: System Programming Guide <SDM_>`_). However, the ``CPUFreq`` core
|
||||||
|
uses frequencies for identifying operating performance points of CPUs and
|
||||||
|
frequencies are involved in the user space interface exposed by it, so
|
||||||
|
``intel_pstate`` maps its internal representation of P-states to frequencies too
|
||||||
|
(fortunately, that mapping is unambiguous). At the same time, it would not be
|
||||||
|
practical for ``intel_pstate`` to supply the ``CPUFreq`` core with a table of
|
||||||
|
available frequencies due to the possible size of it, so the driver does not do
|
||||||
|
that. Some functionality of the core is limited by that.
|
||||||
|
|
||||||
|
Since the hardware P-state selection interface used by ``intel_pstate`` is
|
||||||
|
available at the logical CPU level, the driver always works with individual
|
||||||
|
CPUs. Consequently, if ``intel_pstate`` is in use, every ``CPUFreq`` policy
|
||||||
|
object corresponds to one logical CPU and ``CPUFreq`` policies are effectively
|
||||||
|
equivalent to CPUs. In particular, this means that they become "inactive" every
|
||||||
|
time the corresponding CPU is taken offline and need to be re-initialized when
|
||||||
|
it goes back online.
|
||||||
|
|
||||||
|
``intel_pstate`` is not modular, so it cannot be unloaded, which means that the
|
||||||
|
only way to pass early-configuration-time parameters to it is via the kernel
|
||||||
|
command line. However, its configuration can be adjusted via ``sysfs`` to a
|
||||||
|
great extent. In some configurations it even is possible to unregister it via
|
||||||
|
``sysfs`` which allows another ``CPUFreq`` scaling driver to be loaded and
|
||||||
|
registered (see `below <status_attr_>`_).
|
||||||
|
|
||||||
|
|
||||||
|
Operation Modes
|
||||||
|
===============
|
||||||
|
|
||||||
|
``intel_pstate`` can operate in three different modes: in the active mode with
|
||||||
|
or without hardware-managed P-states support and in the passive mode. Which of
|
||||||
|
them will be in effect depends on what kernel command line options are used and
|
||||||
|
on the capabilities of the processor.
|
||||||
|
|
||||||
|
Active Mode
|
||||||
|
-----------
|
||||||
|
|
||||||
|
This is the default operation mode of ``intel_pstate``. If it works in this
|
||||||
|
mode, the ``scaling_driver`` policy attribute in ``sysfs`` for all ``CPUFreq``
|
||||||
|
policies contains the string "intel_pstate".
|
||||||
|
|
||||||
|
In this mode the driver bypasses the scaling governors layer of ``CPUFreq`` and
|
||||||
|
provides its own scaling algorithms for P-state selection. Those algorithms
|
||||||
|
can be applied to ``CPUFreq`` policies in the same way as generic scaling
|
||||||
|
governors (that is, through the ``scaling_governor`` policy attribute in
|
||||||
|
``sysfs``). [Note that different P-state selection algorithms may be chosen for
|
||||||
|
different policies, but that is not recommended.]
|
||||||
|
|
||||||
|
They are not generic scaling governors, but their names are the same as the
|
||||||
|
names of some of those governors. Moreover, confusingly enough, they generally
|
||||||
|
do not work in the same way as the generic governors they share the names with.
|
||||||
|
For example, the ``powersave`` P-state selection algorithm provided by
|
||||||
|
``intel_pstate`` is not a counterpart of the generic ``powersave`` governor
|
||||||
|
(roughly, it corresponds to the ``schedutil`` and ``ondemand`` governors).
|
||||||
|
|
||||||
|
There are two P-state selection algorithms provided by ``intel_pstate`` in the
|
||||||
|
active mode: ``powersave`` and ``performance``. The way they both operate
|
||||||
|
depends on whether or not the hardware-managed P-states (HWP) feature has been
|
||||||
|
enabled in the processor and possibly on the processor model.
|
||||||
|
|
||||||
|
Which of the P-state selection algorithms is used by default depends on the
|
||||||
|
:c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option.
|
||||||
|
Namely, if that option is set, the ``performance`` algorithm will be used by
|
||||||
|
default, and the other one will be used by default if it is not set.
|
||||||
|
|
||||||
|
Active Mode With HWP
|
||||||
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
If the processor supports the HWP feature, it will be enabled during the
|
||||||
|
processor initialization and cannot be disabled after that. It is possible
|
||||||
|
to avoid enabling it by passing the ``intel_pstate=no_hwp`` argument to the
|
||||||
|
kernel in the command line.
|
||||||
|
|
||||||
|
If the HWP feature has been enabled, ``intel_pstate`` relies on the processor to
|
||||||
|
select P-states by itself, but still it can give hints to the processor's
|
||||||
|
internal P-state selection logic. What those hints are depends on which P-state
|
||||||
|
selection algorithm has been applied to the given policy (or to the CPU it
|
||||||
|
corresponds to).
|
||||||
|
|
||||||
|
Even though the P-state selection is carried out by the processor automatically,
|
||||||
|
``intel_pstate`` registers utilization update callbacks with the CPU scheduler
|
||||||
|
in this mode. However, they are not used for running a P-state selection
|
||||||
|
algorithm, but for periodic updates of the current CPU frequency information to
|
||||||
|
be made available from the ``scaling_cur_freq`` policy attribute in ``sysfs``.
|
||||||
|
|
||||||
|
HWP + ``performance``
|
||||||
|
.....................
|
||||||
|
|
||||||
|
In this configuration ``intel_pstate`` will write 0 to the processor's
|
||||||
|
Energy-Performance Preference (EPP) knob (if supported) or its
|
||||||
|
Energy-Performance Bias (EPB) knob (otherwise), which means that the processor's
|
||||||
|
internal P-state selection logic is expected to focus entirely on performance.
|
||||||
|
|
||||||
|
This will override the EPP/EPB setting coming from the ``sysfs`` interface
|
||||||
|
(see `Energy vs Performance Hints`_ below).
|
||||||
|
|
||||||
|
Also, in this configuration the range of P-states available to the processor's
|
||||||
|
internal P-state selection logic is always restricted to the upper boundary
|
||||||
|
(that is, the maximum P-state that the driver is allowed to use).
|
||||||
|
|
||||||
|
HWP + ``powersave``
|
||||||
|
...................
|
||||||
|
|
||||||
|
In this configuration ``intel_pstate`` will set the processor's
|
||||||
|
Energy-Performance Preference (EPP) knob (if supported) or its
|
||||||
|
Energy-Performance Bias (EPB) knob (otherwise) to whatever value it was
|
||||||
|
previously set to via ``sysfs`` (or whatever default value it was
|
||||||
|
set to by the platform firmware). This usually causes the processor's
|
||||||
|
internal P-state selection logic to be less performance-focused.
|
||||||
|
|
||||||
|
Active Mode Without HWP
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
This is the default operation mode for processors that do not support the HWP
|
||||||
|
feature. It also is used by default with the ``intel_pstate=no_hwp`` argument
|
||||||
|
in the kernel command line. However, in this mode ``intel_pstate`` may refuse
|
||||||
|
to work with the given processor if it does not recognize it. [Note that
|
||||||
|
``intel_pstate`` will never refuse to work with any processor with the HWP
|
||||||
|
feature enabled.]
|
||||||
|
|
||||||
|
In this mode ``intel_pstate`` registers utilization update callbacks with the
|
||||||
|
CPU scheduler in order to run a P-state selection algorithm, either
|
||||||
|
``powersave`` or ``performance``, depending on the ``scaling_governor`` policy
|
||||||
|
setting in ``sysfs``. The current CPU frequency information to be made
|
||||||
|
available from the ``scaling_cur_freq`` policy attribute in ``sysfs`` is
|
||||||
|
periodically updated by those utilization update callbacks too.
|
||||||
|
|
||||||
|
``performance``
|
||||||
|
...............
|
||||||
|
|
||||||
|
Without HWP, this P-state selection algorithm is always the same regardless of
|
||||||
|
the processor model and platform configuration.
|
||||||
|
|
||||||
|
It selects the maximum P-state it is allowed to use, subject to limits set via
|
||||||
|
``sysfs``, every time the P-state selection computations are carried out by the
|
||||||
|
driver's utilization update callback for the given CPU (that does not happen
|
||||||
|
more often than every 10 ms), but the hardware configuration will not be changed
|
||||||
|
if the new P-state is the same as the current one.
|
||||||
|
|
||||||
|
This is the default P-state selection algorithm if the
|
||||||
|
:c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option
|
||||||
|
is set.
|
||||||
|
|
||||||
|
``powersave``
|
||||||
|
.............
|
||||||
|
|
||||||
|
Without HWP, this P-state selection algorithm generally depends on the
|
||||||
|
processor model and/or the system profile setting in the ACPI tables and there
|
||||||
|
are two variants of it.
|
||||||
|
|
||||||
|
One of them is used with processors from the Atom line and (regardless of the
|
||||||
|
processor model) on platforms with the system profile in the ACPI tables set to
|
||||||
|
"mobile" (laptops mostly), "tablet", "appliance PC", "desktop", or
|
||||||
|
"workstation". It is also used with processors supporting the HWP feature if
|
||||||
|
that feature has not been enabled (that is, with the ``intel_pstate=no_hwp``
|
||||||
|
argument in the kernel command line). It is similar to the algorithm
|
||||||
|
implemented by the generic ``schedutil`` scaling governor except that the
|
||||||
|
utilization metric used by it is based on numbers coming from feedback
|
||||||
|
registers of the CPU. It generally selects P-states proportional to the
|
||||||
|
current CPU utilization, so it is referred to as the "proportional" algorithm.
|
||||||
|
|
||||||
|
The second variant of the ``powersave`` P-state selection algorithm, used in all
|
||||||
|
of the other cases (generally, on processors from the Core line, so it is
|
||||||
|
referred to as the "Core" algorithm), is based on the values read from the APERF
|
||||||
|
and MPERF feedback registers and the previously requested target P-state.
|
||||||
|
It does not really take CPU utilization into account explicitly, but as a rule
|
||||||
|
it causes the CPU P-state to ramp up very quickly in response to increased
|
||||||
|
utilization which is generally desirable in server environments.
|
||||||
|
|
||||||
|
Regardless of the variant, this algorithm is run by the driver's utilization
|
||||||
|
update callback for the given CPU when it is invoked by the CPU scheduler, but
|
||||||
|
not more often than every 10 ms (that can be tweaked via ``debugfs`` in `this
|
||||||
|
particular case <Tuning Interface in debugfs_>`_). Like in the ``performance``
|
||||||
|
case, the hardware configuration is not touched if the new P-state turns out to
|
||||||
|
be the same as the current one.
|
||||||
|
|
||||||
|
This is the default P-state selection algorithm if the
|
||||||
|
:c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option
|
||||||
|
is not set.
|
||||||
|
|
||||||
|
Passive Mode
|
||||||
|
------------
|
||||||
|
|
||||||
|
This mode is used if the ``intel_pstate=passive`` argument is passed to the
|
||||||
|
kernel in the command line (it implies the ``intel_pstate=no_hwp`` setting too).
|
||||||
|
Like in the active mode without HWP support, in this mode ``intel_pstate`` may
|
||||||
|
refuse to work with the given processor if it does not recognize it.
|
||||||
|
|
||||||
|
If the driver works in this mode, the ``scaling_driver`` policy attribute in
|
||||||
|
``sysfs`` for all ``CPUFreq`` policies contains the string "intel_cpufreq".
|
||||||
|
Then, the driver behaves like a regular ``CPUFreq`` scaling driver. That is,
|
||||||
|
it is invoked by generic scaling governors when necessary to talk to the
|
||||||
|
hardware in order to change the P-state of a CPU (in particular, the
|
||||||
|
``schedutil`` governor can invoke it directly from scheduler context).
|
||||||
|
|
||||||
|
While in this mode, ``intel_pstate`` can be used with all of the (generic)
|
||||||
|
scaling governors listed by the ``scaling_available_governors`` policy attribute
|
||||||
|
in ``sysfs`` (and the P-state selection algorithms described above are not
|
||||||
|
used). Then, it is responsible for the configuration of policy objects
|
||||||
|
corresponding to CPUs and provides the ``CPUFreq`` core (and the scaling
|
||||||
|
governors attached to the policy objects) with accurate information on the
|
||||||
|
maximum and minimum operating frequencies supported by the hardware (including
|
||||||
|
the so-called "turbo" frequency ranges). In other words, in the passive mode
|
||||||
|
the entire range of available P-states is exposed by ``intel_pstate`` to the
|
||||||
|
``CPUFreq`` core. However, in this mode the driver does not register
|
||||||
|
utilization update callbacks with the CPU scheduler and the ``scaling_cur_freq``
|
||||||
|
information comes from the ``CPUFreq`` core (and is the last frequency selected
|
||||||
|
by the current scaling governor for the given policy).
|
||||||
|
|
||||||
|
|
||||||
|
.. _turbo:
|
||||||
|
|
||||||
|
Turbo P-states Support
|
||||||
|
======================
|
||||||
|
|
||||||
|
In the majority of cases, the entire range of P-states available to
|
||||||
|
``intel_pstate`` can be divided into two sub-ranges that correspond to
|
||||||
|
different types of processor behavior, above and below a boundary that
|
||||||
|
will be referred to as the "turbo threshold" in what follows.
|
||||||
|
|
||||||
|
The P-states above the turbo threshold are referred to as "turbo P-states" and
|
||||||
|
the whole sub-range of P-states they belong to is referred to as the "turbo
|
||||||
|
range". These names are related to the Turbo Boost technology allowing a
|
||||||
|
multicore processor to opportunistically increase the P-state of one or more
|
||||||
|
cores if there is enough power to do that and if that is not going to cause the
|
||||||
|
thermal envelope of the processor package to be exceeded.
|
||||||
|
|
||||||
|
Specifically, if software sets the P-state of a CPU core within the turbo range
|
||||||
|
(that is, above the turbo threshold), the processor is permitted to take over
|
||||||
|
performance scaling control for that core and put it into turbo P-states of its
|
||||||
|
choice going forward. However, that permission is interpreted differently by
|
||||||
|
different processor generations. Namely, the Sandy Bridge generation of
|
||||||
|
processors will never use any P-states above the last one set by software for
|
||||||
|
the given core, even if it is within the turbo range, whereas all of the later
|
||||||
|
processor generations will take it as a license to use any P-states from the
|
||||||
|
turbo range, even above the one set by software. In other words, on those
|
||||||
|
processors setting any P-state from the turbo range will enable the processor
|
||||||
|
to put the given core into all turbo P-states up to and including the maximum
|
||||||
|
supported one as it sees fit.
|
||||||
|
|
||||||
|
One important property of turbo P-states is that they are not sustainable. More
|
||||||
|
precisely, there is no guarantee that any CPUs will be able to stay in any of
|
||||||
|
those states indefinitely, because the power distribution within the processor
|
||||||
|
package may change over time or the thermal envelope it was designed for might
|
||||||
|
be exceeded if a turbo P-state was used for too long.
|
||||||
|
|
||||||
|
In turn, the P-states below the turbo threshold generally are sustainable. In
|
||||||
|
fact, if one of them is set by software, the processor is not expected to change
|
||||||
|
it to a lower one unless in a thermal stress or a power limit violation
|
||||||
|
situation (a higher P-state may still be used if it is set for another CPU in
|
||||||
|
the same package at the same time, for example).
|
||||||
|
|
||||||
|
Some processors allow multiple cores to be in turbo P-states at the same time,
|
||||||
|
but the maximum P-state that can be set for them generally depends on the number
|
||||||
|
of cores running concurrently. The maximum turbo P-state that can be set for 3
|
||||||
|
cores at the same time usually is lower than the analogous maximum P-state for
|
||||||
|
2 cores, which in turn usually is lower than the maximum turbo P-state that can
|
||||||
|
be set for 1 core. The one-core maximum turbo P-state is thus the maximum
|
||||||
|
supported one overall.
|
||||||
|
|
||||||
|
The maximum supported turbo P-state, the turbo threshold (the maximum supported
|
||||||
|
non-turbo P-state) and the minimum supported P-state are specific to the
|
||||||
|
processor model and can be determined by reading the processor's model-specific
|
||||||
|
registers (MSRs). Moreover, some processors support the Configurable TDP
|
||||||
|
(Thermal Design Power) feature and, when that feature is enabled, the turbo
|
||||||
|
threshold effectively becomes a configurable value that can be set by the
|
||||||
|
platform firmware.
|
||||||
|
|
||||||
|
Unlike ``_PSS`` objects in the ACPI tables, ``intel_pstate`` always exposes
|
||||||
|
the entire range of available P-states, including the whole turbo range, to the
|
||||||
|
``CPUFreq`` core and (in the passive mode) to generic scaling governors. This
|
||||||
|
generally causes turbo P-states to be set more often when ``intel_pstate`` is
|
||||||
|
used relative to ACPI-based CPU performance scaling (see `below <acpi-cpufreq_>`_
|
||||||
|
for more information).
|
||||||
|
|
||||||
|
Moreover, since ``intel_pstate`` always knows what the real turbo threshold is
|
||||||
|
(even if the Configurable TDP feature is enabled in the processor), its
|
||||||
|
``no_turbo`` attribute in ``sysfs`` (described `below <no_turbo_attr_>`_) should
|
||||||
|
work as expected in all cases (that is, if set to disable turbo P-states, it
|
||||||
|
always should prevent ``intel_pstate`` from using them).
|
||||||
|
|
||||||
|
|
||||||
|
Processor Support
|
||||||
|
=================
|
||||||
|
|
||||||
|
To handle a given processor ``intel_pstate`` requires a number of different
|
||||||
|
pieces of information on it to be known, including:
|
||||||
|
|
||||||
|
* The minimum supported P-state.
|
||||||
|
|
||||||
|
* The maximum supported `non-turbo P-state <turbo_>`_.
|
||||||
|
|
||||||
|
* Whether or not turbo P-states are supported at all.
|
||||||
|
|
||||||
|
* The maximum supported `one-core turbo P-state <turbo_>`_ (if turbo P-states
|
||||||
|
are supported).
|
||||||
|
|
||||||
|
* The scaling formula to translate the driver's internal representation
|
||||||
|
of P-states into frequencies and the other way around.
|
||||||
|
|
||||||
|
Generally, ways to obtain that information are specific to the processor model
|
||||||
|
or family. Although it often is possible to obtain all of it from the processor
|
||||||
|
itself (using model-specific registers), there are cases in which hardware
|
||||||
|
manuals need to be consulted to get to it too.
|
||||||
|
|
||||||
|
For this reason, there is a list of supported processors in ``intel_pstate`` and
|
||||||
|
the driver initialization will fail if the detected processor is not in that
|
||||||
|
list, unless it supports the `HWP feature <Active Mode_>`_. [The interface to
|
||||||
|
obtain all of the information listed above is the same for all of the processors
|
||||||
|
supporting the HWP feature, which is why they all are supported by
|
||||||
|
``intel_pstate``.]
|
||||||
|
|
||||||
|
|
||||||
|
User Space Interface in ``sysfs``
|
||||||
|
=================================
|
||||||
|
|
||||||
|
Global Attributes
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
``intel_pstate`` exposes several global attributes (files) in ``sysfs`` to
|
||||||
|
control its functionality at the system level. They are located in the
|
||||||
|
``/sys/devices/system/cpu/cpufreq/intel_pstate/`` directory and affect all
|
||||||
|
CPUs.
|
||||||
|
|
||||||
|
Some of them are not present if the ``intel_pstate=per_cpu_perf_limits``
|
||||||
|
argument is passed to the kernel in the command line.
|
||||||
|
|
||||||
|
``max_perf_pct``
|
||||||
|
Maximum P-state the driver is allowed to set in percent of the
|
||||||
|
maximum supported performance level (the highest supported `turbo
|
||||||
|
P-state <turbo_>`_).
|
||||||
|
|
||||||
|
This attribute will not be exposed if the
|
||||||
|
``intel_pstate=per_cpu_perf_limits`` argument is present in the kernel
|
||||||
|
command line.
|
||||||
|
|
||||||
|
``min_perf_pct``
|
||||||
|
Minimum P-state the driver is allowed to set in percent of the
|
||||||
|
maximum supported performance level (the highest supported `turbo
|
||||||
|
P-state <turbo_>`_).
|
||||||
|
|
||||||
|
This attribute will not be exposed if the
|
||||||
|
``intel_pstate=per_cpu_perf_limits`` argument is present in the kernel
|
||||||
|
command line.
|
||||||
|
|
||||||
|
``num_pstates``
|
||||||
|
Number of P-states supported by the processor (between 0 and 255
|
||||||
|
inclusive) including both turbo and non-turbo P-states (see
|
||||||
|
`Turbo P-states Support`_).
|
||||||
|
|
||||||
|
The value of this attribute is not affected by the ``no_turbo``
|
||||||
|
setting described `below <no_turbo_attr_>`_.
|
||||||
|
|
||||||
|
This attribute is read-only.
|
||||||
|
|
||||||
|
``turbo_pct``
|
||||||
|
Ratio of the `turbo range <turbo_>`_ size to the size of the entire
|
||||||
|
range of supported P-states, in percent.
|
||||||
|
|
||||||
|
This attribute is read-only.
|
||||||
|
|
||||||
|
.. _no_turbo_attr:
|
||||||
|
|
||||||
|
``no_turbo``
|
||||||
|
If set (equal to 1), the driver is not allowed to set any turbo P-states
|
||||||
|
(see `Turbo P-states Support`_). If unset (equal to 0, which is the
|
||||||
|
default), turbo P-states can be set by the driver.
|
||||||
|
[Note that ``intel_pstate`` does not support the general ``boost``
|
||||||
|
attribute (supported by some other scaling drivers) which is replaced
|
||||||
|
by this one.]
|
||||||
|
|
||||||
|
This attribute does not affect the maximum supported frequency value
|
||||||
|
supplied to the ``CPUFreq`` core and exposed via the policy interface,
|
||||||
|
but it affects the maximum possible value of per-policy P-state limits
|
||||||
|
(see `Interpretation of Policy Attributes`_ below for details).
|
||||||
|
|
||||||
|
.. _status_attr:
|
||||||
|
|
||||||
|
``status``
|
||||||
|
Operation mode of the driver: "active", "passive" or "off".
|
||||||
|
|
||||||
|
"active"
|
||||||
|
The driver is functional and in the `active mode
|
||||||
|
<Active Mode_>`_.
|
||||||
|
|
||||||
|
"passive"
|
||||||
|
The driver is functional and in the `passive mode
|
||||||
|
<Passive Mode_>`_.
|
||||||
|
|
||||||
|
"off"
|
||||||
|
The driver is not functional (it is not registered as a scaling
|
||||||
|
driver with the ``CPUFreq`` core).
|
||||||
|
|
||||||
|
This attribute can be written to in order to change the driver's
|
||||||
|
operation mode or to unregister it. The string written to it must be
|
||||||
|
one of the possible values of it and, if successful, the write will
|
||||||
|
cause the driver to switch over to the operation mode represented by
|
||||||
|
that string - or to be unregistered in the "off" case. [Actually,
|
||||||
|
switching over from the active mode to the passive mode or the other
|
||||||
|
way around causes the driver to be unregistered and registered again
|
||||||
|
with a different set of callbacks, so all of its settings (the global
|
||||||
|
as well as the per-policy ones) are then reset to their default
|
||||||
|
values, possibly depending on the target operation mode.]
|
||||||
|
|
||||||
|
That only is supported in some configurations, though (for example, if
|
||||||
|
the `HWP feature is enabled in the processor <Active Mode With HWP_>`_,
|
||||||
|
the operation mode of the driver cannot be changed), and if it is not
|
||||||
|
supported in the current configuration, writes to this attribute will
|
||||||
|
fail with an appropriate error.
|
||||||
|
|
||||||
|
Interpretation of Policy Attributes
|
||||||
|
-----------------------------------
|
||||||
|
|
||||||
|
The interpretation of some ``CPUFreq`` policy attributes described in
|
||||||
|
:doc:`cpufreq` is special with ``intel_pstate`` as the current scaling driver
|
||||||
|
and it generally depends on the driver's `operation mode <Operation Modes_>`_.
|
||||||
|
|
||||||
|
First of all, the values of the ``cpuinfo_max_freq``, ``cpuinfo_min_freq`` and
|
||||||
|
``scaling_cur_freq`` attributes are produced by applying a processor-specific
|
||||||
|
multiplier to the internal P-state representation used by ``intel_pstate``.
|
||||||
|
Also, the values of the ``scaling_max_freq`` and ``scaling_min_freq``
|
||||||
|
attributes are capped by the frequency corresponding to the maximum P-state that
|
||||||
|
the driver is allowed to set.
|
||||||
|
|
||||||
|
If the ``no_turbo`` `global attribute <no_turbo_attr_>`_ is set, the driver is
|
||||||
|
not allowed to use turbo P-states, so the maximum value of ``scaling_max_freq``
|
||||||
|
and ``scaling_min_freq`` is limited to the maximum non-turbo P-state frequency.
|
||||||
|
Accordingly, setting ``no_turbo`` causes ``scaling_max_freq`` and
|
||||||
|
``scaling_min_freq`` to go down to that value if they were above it before.
|
||||||
|
However, the old values of ``scaling_max_freq`` and ``scaling_min_freq`` will be
|
||||||
|
restored after unsetting ``no_turbo``, unless these attributes have been written
|
||||||
|
to after ``no_turbo`` was set.
|
||||||
|
|
||||||
|
If ``no_turbo`` is not set, the maximum possible value of ``scaling_max_freq``
|
||||||
|
and ``scaling_min_freq`` corresponds to the maximum supported turbo P-state,
|
||||||
|
which also is the value of ``cpuinfo_max_freq`` in either case.
|
||||||
|
|
||||||
|
Next, the following policy attributes have special meaning if
|
||||||
|
``intel_pstate`` works in the `active mode <Active Mode_>`_:
|
||||||
|
|
||||||
|
``scaling_available_governors``
|
||||||
|
List of P-state selection algorithms provided by ``intel_pstate``.
|
||||||
|
|
||||||
|
``scaling_governor``
|
||||||
|
P-state selection algorithm provided by ``intel_pstate`` currently in
|
||||||
|
use with the given policy.
|
||||||
|
|
||||||
|
``scaling_cur_freq``
|
||||||
|
Frequency of the average P-state of the CPU represented by the given
|
||||||
|
policy for the time interval between the last two invocations of the
|
||||||
|
driver's utilization update callback by the CPU scheduler for that CPU.
|
||||||
|
|
||||||
|
The meaning of these attributes in the `passive mode <Passive Mode_>`_ is the
|
||||||
|
same as for other scaling drivers.
|
||||||
|
|
||||||
|
Additionally, the value of the ``scaling_driver`` attribute for ``intel_pstate``
|
||||||
|
depends on the operation mode of the driver. Namely, it is either
|
||||||
|
"intel_pstate" (in the `active mode <Active Mode_>`_) or "intel_cpufreq" (in the
|
||||||
|
`passive mode <Passive Mode_>`_).
|
||||||
|
|
||||||
|
Coordination of P-State Limits
|
||||||
|
------------------------------
|
||||||
|
|
||||||
|
``intel_pstate`` allows P-state limits to be set in two ways: with the help of
|
||||||
|
the ``max_perf_pct`` and ``min_perf_pct`` `global attributes
|
||||||
|
<Global Attributes_>`_ or via the ``scaling_max_freq`` and ``scaling_min_freq``
|
||||||
|
``CPUFreq`` policy attributes. The coordination between those limits is based
|
||||||
|
on the following rules, regardless of the current operation mode of the driver:
|
||||||
|
|
||||||
|
1. All CPUs are affected by the global limits (that is, none of them can be
|
||||||
|
requested to run faster than the global maximum and none of them can be
|
||||||
|
requested to run slower than the global minimum).
|
||||||
|
|
||||||
|
2. Each individual CPU is affected by its own per-policy limits (that is, it
|
||||||
|
cannot be requested to run faster than its own per-policy maximum and it
|
||||||
|
cannot be requested to run slower than its own per-policy minimum).
|
||||||
|
|
||||||
|
3. The global and per-policy limits can be set independently.
|
||||||
|
|
||||||
|
If the `HWP feature is enabled in the processor <Active Mode With HWP_>`_, the
|
||||||
|
resulting effective values are written into its registers whenever the limits
|
||||||
|
change in order to request its internal P-state selection logic to always set
|
||||||
|
P-states within these limits. Otherwise, the limits are taken into account by
|
||||||
|
scaling governors (in the `passive mode <Passive Mode_>`_) and by the driver
|
||||||
|
every time before setting a new P-state for a CPU.
|
||||||
|
|
||||||
|
Additionally, if the ``intel_pstate=per_cpu_perf_limits`` command line argument
|
||||||
|
is passed to the kernel, ``max_perf_pct`` and ``min_perf_pct`` are not exposed
|
||||||
|
at all and the only way to set the limits is by using the policy attributes.
|
||||||
|
|
||||||
|
|
||||||
|
Energy vs Performance Hints
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
If ``intel_pstate`` works in the `active mode with the HWP feature enabled
|
||||||
|
<Active Mode With HWP_>`_ in the processor, additional attributes are present
|
||||||
|
in every ``CPUFreq`` policy directory in ``sysfs``. They are intended to allow
|
||||||
|
user space to help ``intel_pstate`` to adjust the processor's internal P-state
|
||||||
|
selection logic by focusing it on performance or on energy-efficiency, or
|
||||||
|
somewhere between the two extremes:
|
||||||
|
|
||||||
|
``energy_performance_preference``
|
||||||
|
Current value of the energy vs performance hint for the given policy
|
||||||
|
(or the CPU represented by it).
|
||||||
|
|
||||||
|
The hint can be changed by writing to this attribute.
|
||||||
|
|
||||||
|
``energy_performance_available_preferences``
|
||||||
|
List of strings that can be written to the
|
||||||
|
``energy_performance_preference`` attribute.
|
||||||
|
|
||||||
|
They represent different energy vs performance hints and should be
|
||||||
|
self-explanatory, except that ``default`` represents whatever hint
|
||||||
|
value was set by the platform firmware.
|
||||||
|
|
||||||
|
Strings written to the ``energy_performance_preference`` attribute are
|
||||||
|
internally translated to integer values written to the processor's
|
||||||
|
Energy-Performance Preference (EPP) knob (if supported) or its
|
||||||
|
Energy-Performance Bias (EPB) knob.
|
||||||
|
|
||||||
|
[Note that tasks may be migrated from one CPU to another by the scheduler's
|
||||||
|
load-balancing algorithm and if different energy vs performance hints are
|
||||||
|
set for those CPUs, that may lead to undesirable outcomes. To avoid such
|
||||||
|
issues it is better to set the same energy vs performance hint for all CPUs
|
||||||
|
or to pin every task potentially sensitive to them to a specific CPU.]
|
||||||
|
|
||||||
|
.. _acpi-cpufreq:
|
||||||
|
|
||||||
|
``intel_pstate`` vs ``acpi-cpufreq``
|
||||||
|
====================================
|
||||||
|
|
||||||
|
On the majority of systems supported by ``intel_pstate``, the ACPI tables
|
||||||
|
provided by the platform firmware contain ``_PSS`` objects returning information
|
||||||
|
that can be used for CPU performance scaling (refer to the `ACPI specification`_
|
||||||
|
for details on the ``_PSS`` objects and the format of the information returned
|
||||||
|
by them).
|
||||||
|
|
||||||
|
The information returned by the ACPI ``_PSS`` objects is used by the
|
||||||
|
``acpi-cpufreq`` scaling driver. On systems supported by ``intel_pstate``
|
||||||
|
the ``acpi-cpufreq`` driver uses the same hardware CPU performance scaling
|
||||||
|
interface, but the set of P-states it can use is limited by the ``_PSS``
|
||||||
|
output.
|
||||||
|
|
||||||
|
On those systems each ``_PSS`` object returns a list of P-states supported by
|
||||||
|
the corresponding CPU which basically is a subset of the P-states range that can
|
||||||
|
be used by ``intel_pstate`` on the same system, with one exception: the whole
|
||||||
|
`turbo range <turbo_>`_ is represented by one item in it (the topmost one). By
|
||||||
|
convention, the frequency returned by ``_PSS`` for that item is greater by 1 MHz
|
||||||
|
than the frequency of the highest non-turbo P-state listed by it, but the
|
||||||
|
corresponding P-state representation (following the hardware specification)
|
||||||
|
returned for it matches the maximum supported turbo P-state (or is the
|
||||||
|
special value 255 meaning essentially "go as high as you can get").
|
||||||
|
|
||||||
|
The list of P-states returned by ``_PSS`` is reflected by the table of
|
||||||
|
available frequencies supplied by ``acpi-cpufreq`` to the ``CPUFreq`` core and
|
||||||
|
scaling governors and the minimum and maximum supported frequencies reported by
|
||||||
|
it come from that list as well. In particular, given the special representation
|
||||||
|
of the turbo range described above, this means that the maximum supported
|
||||||
|
frequency reported by ``acpi-cpufreq`` is higher by 1 MHz than the frequency
|
||||||
|
of the highest supported non-turbo P-state listed by ``_PSS`` which, of course,
|
||||||
|
affects decisions made by the scaling governors, except for ``powersave`` and
|
||||||
|
``performance``.
|
||||||
|
|
||||||
|
For example, if a given governor attempts to select a frequency proportional to
|
||||||
|
estimated CPU load and maps the load of 100% to the maximum supported frequency
|
||||||
|
(possibly multiplied by a constant), then it will tend to choose P-states below
|
||||||
|
the turbo threshold if ``acpi-cpufreq`` is used as the scaling driver, because
|
||||||
|
in that case the turbo range corresponds to a small fraction of the frequency
|
||||||
|
band it can use (1 MHz vs 1 GHz or more). In consequence, it will only go to
|
||||||
|
the turbo range for the highest loads and the other loads above 50% that might
|
||||||
|
benefit from running at turbo frequencies will be given non-turbo P-states
|
||||||
|
instead.
|
||||||
|
|
||||||
|
One more issue related to that may appear on systems supporting the
|
||||||
|
`Configurable TDP feature <turbo_>`_ allowing the platform firmware to set the
|
||||||
|
turbo threshold. Namely, if that is not coordinated with the lists of P-states
|
||||||
|
returned by ``_PSS`` properly, there may be more than one item corresponding to
|
||||||
|
a turbo P-state in those lists and there may be a problem with avoiding the
|
||||||
|
turbo range (if desirable or necessary). Usually, to avoid using turbo
|
||||||
|
P-states overall, ``acpi-cpufreq`` simply avoids using the topmost state listed
|
||||||
|
by ``_PSS``, but that is not sufficient when there are other turbo P-states in
|
||||||
|
the list returned by it.
|
||||||
|
|
||||||
|
Apart from the above, ``acpi-cpufreq`` works like ``intel_pstate`` in the
|
||||||
|
`passive mode <Passive Mode_>`_, except that the number of P-states it can set
|
||||||
|
is limited to the ones listed by the ACPI ``_PSS`` objects.
|
||||||
|
|
||||||
|
|
||||||
|
Kernel Command Line Options for ``intel_pstate``
|
||||||
|
================================================
|
||||||
|
|
||||||
|
Several kernel command line options can be used to pass early-configuration-time
|
||||||
|
parameters to ``intel_pstate`` in order to enforce specific behavior of it. All
|
||||||
|
of them have to be prepended with the ``intel_pstate=`` prefix.
|
||||||
|
|
||||||
|
``disable``
|
||||||
|
Do not register ``intel_pstate`` as the scaling driver even if the
|
||||||
|
processor is supported by it.
|
||||||
|
|
||||||
|
``passive``
|
||||||
|
Register ``intel_pstate`` in the `passive mode <Passive Mode_>`_ to
|
||||||
|
start with.
|
||||||
|
|
||||||
|
This option implies the ``no_hwp`` one described below.
|
||||||
|
|
||||||
|
``force``
|
||||||
|
Register ``intel_pstate`` as the scaling driver instead of
|
||||||
|
``acpi-cpufreq`` even if the latter is preferred on the given system.
|
||||||
|
|
||||||
|
This may prevent some platform features (such as thermal controls and
|
||||||
|
power capping) that rely on the availability of ACPI P-states
|
||||||
|
information from functioning as expected, so it should be used with
|
||||||
|
caution.
|
||||||
|
|
||||||
|
This option does not work with processors that are not supported by
|
||||||
|
``intel_pstate`` and on platforms where the ``pcc-cpufreq`` scaling
|
||||||
|
driver is used instead of ``acpi-cpufreq``.
|
||||||
|
|
||||||
|
``no_hwp``
|
||||||
|
Do not enable the `hardware-managed P-states (HWP) feature
|
||||||
|
<Active Mode With HWP_>`_ even if it is supported by the processor.
|
||||||
|
|
||||||
|
``hwp_only``
|
||||||
|
Register ``intel_pstate`` as the scaling driver only if the
|
||||||
|
`hardware-managed P-states (HWP) feature <Active Mode With HWP_>`_ is
|
||||||
|
supported by the processor.
|
||||||
|
|
||||||
|
``support_acpi_ppc``
|
||||||
|
Take ACPI ``_PPC`` performance limits into account.
|
||||||
|
|
||||||
|
If the preferred power management profile in the FADT (Fixed ACPI
|
||||||
|
Description Table) is set to "Enterprise Server" or "Performance
|
||||||
|
Server", the ACPI ``_PPC`` limits are taken into account by default
|
||||||
|
and this option has no effect.
|
||||||
|
|
||||||
|
``per_cpu_perf_limits``
|
||||||
|
Use per-logical-CPU P-State limits (see `Coordination of P-state
|
||||||
|
Limits`_ for details).
|
||||||
|
|
||||||
|
|
||||||
|
Diagnostics and Tuning
|
||||||
|
======================
|
||||||
|
|
||||||
|
Trace Events
|
||||||
|
------------
|
||||||
|
|
||||||
|
There are two static trace events that can be used for ``intel_pstate``
|
||||||
|
diagnostics. One of them is the ``cpu_frequency`` trace event generally used
|
||||||
|
by ``CPUFreq``, and the other one is the ``pstate_sample`` trace event specific
|
||||||
|
to ``intel_pstate``. Both of them are triggered by ``intel_pstate`` only if
|
||||||
|
it works in the `active mode <Active Mode_>`_.
|
||||||
|
|
||||||
|
The following sequence of shell commands can be used to enable them and see
|
||||||
|
their output (if the kernel is generally configured to support event tracing)::
|
||||||
|
|
||||||
|
# cd /sys/kernel/debug/tracing/
|
||||||
|
# echo 1 > events/power/pstate_sample/enable
|
||||||
|
# echo 1 > events/power/cpu_frequency/enable
|
||||||
|
# cat trace
|
||||||
|
gnome-terminal--4510 [001] ..s. 1177.680733: pstate_sample: core_busy=107 scaled=94 from=26 to=26 mperf=1143818 aperf=1230607 tsc=29838618 freq=2474476
|
||||||
|
cat-5235 [002] ..s. 1177.681723: cpu_frequency: state=2900000 cpu_id=2
|
||||||
|
|
||||||
|
If ``intel_pstate`` works in the `passive mode <Passive Mode_>`_, the
|
||||||
|
``cpu_frequency`` trace event will be triggered either by the ``schedutil``
|
||||||
|
scaling governor (for the policies it is attached to), or by the ``CPUFreq``
|
||||||
|
core (for the policies with other scaling governors).
|
||||||
|
|
||||||
|
``ftrace``
|
||||||
|
----------
|
||||||
|
|
||||||
|
The ``ftrace`` interface can be used for low-level diagnostics of
|
||||||
|
``intel_pstate``. For example, to check how often the function to set a
|
||||||
|
P-state is called, the ``ftrace`` filter can be set to
|
||||||
|
:c:func:`intel_pstate_set_pstate`::
|
||||||
|
|
||||||
|
# cd /sys/kernel/debug/tracing/
|
||||||
|
# cat available_filter_functions | grep -i pstate
|
||||||
|
intel_pstate_set_pstate
|
||||||
|
intel_pstate_cpu_init
|
||||||
|
...
|
||||||
|
# echo intel_pstate_set_pstate > set_ftrace_filter
|
||||||
|
# echo function > current_tracer
|
||||||
|
# cat trace | head -15
|
||||||
|
# tracer: function
|
||||||
|
#
|
||||||
|
# entries-in-buffer/entries-written: 80/80 #P:4
|
||||||
|
#
|
||||||
|
# _-----=> irqs-off
|
||||||
|
# / _----=> need-resched
|
||||||
|
# | / _---=> hardirq/softirq
|
||||||
|
# || / _--=> preempt-depth
|
||||||
|
# ||| / delay
|
||||||
|
# TASK-PID CPU# |||| TIMESTAMP FUNCTION
|
||||||
|
# | | | |||| | |
|
||||||
|
Xorg-3129 [000] ..s. 2537.644844: intel_pstate_set_pstate <-intel_pstate_timer_func
|
||||||
|
gnome-terminal--4510 [002] ..s. 2537.649844: intel_pstate_set_pstate <-intel_pstate_timer_func
|
||||||
|
gnome-shell-3409 [001] ..s. 2537.650850: intel_pstate_set_pstate <-intel_pstate_timer_func
|
||||||
|
<idle>-0 [000] ..s. 2537.654843: intel_pstate_set_pstate <-intel_pstate_timer_func
|
||||||
|
|
||||||
|
Tuning Interface in ``debugfs``
|
||||||
|
-------------------------------
|
||||||
|
|
||||||
|
The ``powersave`` algorithm provided by ``intel_pstate`` for `the Core line of
|
||||||
|
processors in the active mode <powersave_>`_ is based on a `PID controller`_
|
||||||
|
whose parameters were chosen to address a number of different use cases at the
|
||||||
|
same time. However, it still is possible to fine-tune it to a specific workload
|
||||||
|
and the ``debugfs`` interface under ``/sys/kernel/debug/pstate_snb/`` is
|
||||||
|
provided for this purpose. [Note that the ``pstate_snb`` directory will be
|
||||||
|
present only if the specific P-state selection algorithm matching the interface
|
||||||
|
in it actually is in use.]
|
||||||
|
|
||||||
|
The following files present in that directory can be used to modify the PID
|
||||||
|
controller parameters at run time:
|
||||||
|
|
||||||
|
| ``deadband``
|
||||||
|
| ``d_gain_pct``
|
||||||
|
| ``i_gain_pct``
|
||||||
|
| ``p_gain_pct``
|
||||||
|
| ``sample_rate_ms``
|
||||||
|
| ``setpoint``
|
||||||
|
|
||||||
|
Note, however, that achieving desirable results this way generally requires
|
||||||
|
expert-level understanding of the power vs performance tradeoff, so extra care
|
||||||
|
is recommended when attempting to do that.
|
||||||
|
|
||||||
|
|
||||||
|
.. _LCEU2015: http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf
|
||||||
|
.. _SDM: http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-system-programming-manual-325384.html
|
||||||
|
.. _ACPI specification: http://www.uefi.org/sites/default/files/resources/ACPI_6_1.pdf
|
||||||
|
.. _PID controller: https://en.wikipedia.org/wiki/PID_controller
|
@@ -8,7 +8,7 @@ RAS concepts
|
|||||||
************
|
************
|
||||||
|
|
||||||
Reliability, Availability and Serviceability (RAS) is a concept used on
|
Reliability, Availability and Serviceability (RAS) is a concept used on
|
||||||
servers meant to measure their robusteness.
|
servers meant to measure their robustness.
|
||||||
|
|
||||||
Reliability
|
Reliability
|
||||||
is the probability that a system will produce correct outputs.
|
is the probability that a system will produce correct outputs.
|
||||||
@@ -42,13 +42,13 @@ Among the monitoring measures, the most usual ones include:
|
|||||||
|
|
||||||
* CPU – detect errors at instruction execution and at L1/L2/L3 caches;
|
* CPU – detect errors at instruction execution and at L1/L2/L3 caches;
|
||||||
* Memory – add error correction logic (ECC) to detect and correct errors;
|
* Memory – add error correction logic (ECC) to detect and correct errors;
|
||||||
* I/O – add CRC checksums for tranfered data;
|
* I/O – add CRC checksums for transferred data;
|
||||||
* Storage – RAID, journal file systems, checksums,
|
* Storage – RAID, journal file systems, checksums,
|
||||||
Self-Monitoring, Analysis and Reporting Technology (SMART).
|
Self-Monitoring, Analysis and Reporting Technology (SMART).
|
||||||
|
|
||||||
By monitoring the number of occurrences of error detections, it is possible
|
By monitoring the number of occurrences of error detections, it is possible
|
||||||
to identify if the probability of hardware errors is increasing, and, on such
|
to identify if the probability of hardware errors is increasing, and, on such
|
||||||
case, do a preventive maintainance to replace a degrated component while
|
case, do a preventive maintenance to replace a degraded component while
|
||||||
those errors are correctable.
|
those errors are correctable.
|
||||||
|
|
||||||
Types of errors
|
Types of errors
|
||||||
@@ -121,7 +121,7 @@ using the ``dmidecode`` tool. For example, on a desktop machine, it shows::
|
|||||||
On the above example, a DDR4 SO-DIMM memory module is located at the
|
On the above example, a DDR4 SO-DIMM memory module is located at the
|
||||||
system's memory labeled as "BANK 0", as given by the *bank locator* field.
|
system's memory labeled as "BANK 0", as given by the *bank locator* field.
|
||||||
Please notice that, on such system, the *total width* is equal to the
|
Please notice that, on such system, the *total width* is equal to the
|
||||||
*data witdh*. It means that such memory module doesn't have error
|
*data width*. It means that such memory module doesn't have error
|
||||||
detection/correction mechanisms.
|
detection/correction mechanisms.
|
||||||
|
|
||||||
Unfortunately, not all systems use the same field to specify the memory
|
Unfortunately, not all systems use the same field to specify the memory
|
||||||
@@ -145,7 +145,7 @@ bank. On this example, from an older server, ``dmidecode`` shows::
|
|||||||
|
|
||||||
There, the DDR3 RDIMM memory module is located at the system's memory labeled
|
There, the DDR3 RDIMM memory module is located at the system's memory labeled
|
||||||
as "DIMM_A1", as given by the *locator* field. Please notice that this
|
as "DIMM_A1", as given by the *locator* field. Please notice that this
|
||||||
memory module has 64 bits of *data witdh* and 72 bits of *total width*. So,
|
memory module has 64 bits of *data width* and 72 bits of *total width*. So,
|
||||||
it has 8 extra bits to be used by error detection and correction mechanisms.
|
it has 8 extra bits to be used by error detection and correction mechanisms.
|
||||||
Such kind of memory is called Error-correcting code memory (ECC memory).
|
Such kind of memory is called Error-correcting code memory (ECC memory).
|
||||||
|
|
||||||
@@ -186,7 +186,7 @@ Architecture (MCA)\ [#f3]_.
|
|||||||
.. [#f1] Please notice that several memory controllers allow operation on a
|
.. [#f1] Please notice that several memory controllers allow operation on a
|
||||||
mode called "Lock-Step", where it groups two memory modules together,
|
mode called "Lock-Step", where it groups two memory modules together,
|
||||||
doing 128-bit reads/writes. That gives 16 bits for error correction, with
|
doing 128-bit reads/writes. That gives 16 bits for error correction, with
|
||||||
significatively improves the error correction mechanism, at the expense
|
significantly improves the error correction mechanism, at the expense
|
||||||
that, when an error happens, there's no way to know what memory module is
|
that, when an error happens, there's no way to know what memory module is
|
||||||
to blame. So, it has to blame both memory modules.
|
to blame. So, it has to blame both memory modules.
|
||||||
|
|
||||||
|
@@ -14,14 +14,17 @@ Contact
|
|||||||
The Linux kernel security team can be contacted by email at
|
The Linux kernel security team can be contacted by email at
|
||||||
<security@kernel.org>. This is a private list of security officers
|
<security@kernel.org>. This is a private list of security officers
|
||||||
who will help verify the bug report and develop and release a fix.
|
who will help verify the bug report and develop and release a fix.
|
||||||
It is possible that the security team will bring in extra help from
|
If you already have a fix, please include it with your report, as
|
||||||
area maintainers to understand and fix the security vulnerability.
|
that can speed up the process considerably. It is possible that the
|
||||||
|
security team will bring in extra help from area maintainers to
|
||||||
|
understand and fix the security vulnerability.
|
||||||
|
|
||||||
As it is with any bug, the more information provided the easier it
|
As it is with any bug, the more information provided the easier it
|
||||||
will be to diagnose and fix. Please review the procedure outlined in
|
will be to diagnose and fix. Please review the procedure outlined in
|
||||||
admin-guide/reporting-bugs.rst if you are unclear about what information is helpful.
|
admin-guide/reporting-bugs.rst if you are unclear about what
|
||||||
Any exploit code is very helpful and will not be released without
|
information is helpful. Any exploit code is very helpful and will not
|
||||||
consent from the reporter unless it has already been made public.
|
be released without consent from the reporter unless it has already been
|
||||||
|
made public.
|
||||||
|
|
||||||
Disclosure
|
Disclosure
|
||||||
----------
|
----------
|
||||||
@@ -39,6 +42,32 @@ disclosure is from immediate (esp. if it's already publicly known)
|
|||||||
to a few weeks. As a basic default policy, we expect report date to
|
to a few weeks. As a basic default policy, we expect report date to
|
||||||
disclosure date to be on the order of 7 days.
|
disclosure date to be on the order of 7 days.
|
||||||
|
|
||||||
|
Coordination
|
||||||
|
------------
|
||||||
|
|
||||||
|
Fixes for sensitive bugs, such as those that might lead to privilege
|
||||||
|
escalations, may need to be coordinated with the private
|
||||||
|
<linux-distros@vs.openwall.org> mailing list so that distribution vendors
|
||||||
|
are well prepared to issue a fixed kernel upon public disclosure of the
|
||||||
|
upstream fix. Distros will need some time to test the proposed patch and
|
||||||
|
will generally request at least a few days of embargo, and vendor update
|
||||||
|
publication is preferably scheduled Tuesday through Thursday. When appropriate,
|
||||||
|
the security team can assist with this coordination, or the reporter can
|
||||||
|
include linux-distros from the start. In this case, remember to prefix
|
||||||
|
the email Subject line with "[vs]" as described in the linux-distros wiki:
|
||||||
|
<http://oss-security.openwall.org/wiki/mailing-lists/distros#how-to-use-the-lists>
|
||||||
|
|
||||||
|
CVE assignment
|
||||||
|
--------------
|
||||||
|
|
||||||
|
The security team does not normally assign CVEs, nor do we require them
|
||||||
|
for reports or fixes, as this can needlessly complicate the process and
|
||||||
|
may delay the bug handling. If a reporter wishes to have a CVE identifier
|
||||||
|
assigned ahead of public disclosure, they will need to contact the private
|
||||||
|
linux-distros list, described above. When such a CVE identifier is known
|
||||||
|
before a patch is provided, it is desirable to mention it in the commit
|
||||||
|
message, though.
|
||||||
|
|
||||||
Non-disclosure agreements
|
Non-disclosure agreements
|
||||||
-------------------------
|
-------------------------
|
||||||
|
|
||||||
|
@@ -212,7 +212,8 @@ I hit SysRq, but nothing seems to happen, what's wrong?
|
|||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
There are some keyboards that produce a different keycode for SysRq than the
|
There are some keyboards that produce a different keycode for SysRq than the
|
||||||
pre-defined value of 99 (see ``KEY_SYSRQ`` in ``include/linux/input.h``), or
|
pre-defined value of 99
|
||||||
|
(see ``KEY_SYSRQ`` in ``include/uapi/linux/input-event-codes.h``), or
|
||||||
which don't have a SysRq key at all. In these cases, run ``showkey -s`` to find
|
which don't have a SysRq key at all. In these cases, run ``showkey -s`` to find
|
||||||
an appropriate scancode sequence, and use ``setkeycodes <sequence> 99`` to map
|
an appropriate scancode sequence, and use ``setkeycodes <sequence> 99`` to map
|
||||||
this sequence to the usual SysRq code (e.g., ``setkeycodes e05b 99``). It's
|
this sequence to the usual SysRq code (e.g., ``setkeycodes e05b 99``). It's
|
||||||
|
@@ -48,7 +48,7 @@ Note that not all combinations are supported - only values 0 through 5.
|
|||||||
For example, the following will turn on the warnings, but without
|
For example, the following will turn on the warnings, but without
|
||||||
fixing up or sending SIGBUS signals:
|
fixing up or sending SIGBUS signals:
|
||||||
|
|
||||||
echo 1 > /proc/sys/debug/alignment
|
echo 1 > /proc/cpu/alignment
|
||||||
|
|
||||||
You can also read the content of the same file to get statistical
|
You can also read the content of the same file to get statistical
|
||||||
information on unaligned access occurrences plus the current mode of
|
information on unaligned access occurrences plus the current mode of
|
||||||
|
30
Documentation/arm/stm32/stm32h743-overview.txt
Normal file
30
Documentation/arm/stm32/stm32h743-overview.txt
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
STM32H743 Overview
|
||||||
|
==================
|
||||||
|
|
||||||
|
Introduction
|
||||||
|
------------
|
||||||
|
The STM32H743 is a Cortex-M7 MCU aimed at various applications.
|
||||||
|
It features:
|
||||||
|
- Cortex-M7 core running at up to 400MHz
|
||||||
|
- 2MB internal flash, 1MBytes internal RAM
|
||||||
|
- FMC controller to connect SDRAM, NOR and NAND memories
|
||||||
|
- Dual mode QSPI
|
||||||
|
- SD/MMC/SDIO support
|
||||||
|
- Ethernet controller
|
||||||
|
- USB OTG FS & HS controllers
|
||||||
|
- I2C, SPI, CAN busses support
|
||||||
|
- Several 16 & 32 bits general purpose timers
|
||||||
|
- Serial Audio interface
|
||||||
|
- LCD controller
|
||||||
|
- HDMI-CEC
|
||||||
|
- SPDIFRX
|
||||||
|
- DFSDM
|
||||||
|
|
||||||
|
Resources
|
||||||
|
---------
|
||||||
|
Datasheet and reference manual are publicly available on ST website:
|
||||||
|
- http://www.st.com/en/microcontrollers/stm32h7x3.html?querycriteria=productId=LN2033
|
||||||
|
|
||||||
|
Document Author
|
||||||
|
---------------
|
||||||
|
Alexandre Torgue <alexandre.torgue@st.com>
|
@@ -169,6 +169,18 @@ infrastructure:
|
|||||||
as available on the CPU where it is fetched and is not a system
|
as available on the CPU where it is fetched and is not a system
|
||||||
wide safe value.
|
wide safe value.
|
||||||
|
|
||||||
|
4) ID_AA64ISAR1_EL1 - Instruction set attribute register 1
|
||||||
|
|
||||||
|
x--------------------------------------------------x
|
||||||
|
| Name | bits | visible |
|
||||||
|
|--------------------------------------------------|
|
||||||
|
| LRCPC | [23-20] | y |
|
||||||
|
|--------------------------------------------------|
|
||||||
|
| FCMA | [19-16] | y |
|
||||||
|
|--------------------------------------------------|
|
||||||
|
| JSCVT | [15-12] | y |
|
||||||
|
x--------------------------------------------------x
|
||||||
|
|
||||||
Appendix I: Example
|
Appendix I: Example
|
||||||
---------------------------
|
---------------------------
|
||||||
|
|
||||||
|
@@ -54,6 +54,7 @@ stable kernels.
|
|||||||
| ARM | Cortex-A57 | #852523 | N/A |
|
| ARM | Cortex-A57 | #852523 | N/A |
|
||||||
| ARM | Cortex-A57 | #834220 | ARM64_ERRATUM_834220 |
|
| ARM | Cortex-A57 | #834220 | ARM64_ERRATUM_834220 |
|
||||||
| ARM | Cortex-A72 | #853709 | N/A |
|
| ARM | Cortex-A72 | #853709 | N/A |
|
||||||
|
| ARM | Cortex-A73 | #858921 | ARM64_ERRATUM_858921 |
|
||||||
| ARM | MMU-500 | #841119,#826419 | N/A |
|
| ARM | MMU-500 | #841119,#826419 | N/A |
|
||||||
| | | | |
|
| | | | |
|
||||||
| Cavium | ThunderX ITS | #22375, #24313 | CAVIUM_ERRATUM_22375 |
|
| Cavium | ThunderX ITS | #22375, #24313 | CAVIUM_ERRATUM_22375 |
|
||||||
|
@@ -11,24 +11,56 @@ in AArch64 Linux.
|
|||||||
The kernel configures the translation tables so that translations made
|
The kernel configures the translation tables so that translations made
|
||||||
via TTBR0 (i.e. userspace mappings) have the top byte (bits 63:56) of
|
via TTBR0 (i.e. userspace mappings) have the top byte (bits 63:56) of
|
||||||
the virtual address ignored by the translation hardware. This frees up
|
the virtual address ignored by the translation hardware. This frees up
|
||||||
this byte for application use, with the following caveats:
|
this byte for application use.
|
||||||
|
|
||||||
(1) The kernel requires that all user addresses passed to EL1
|
|
||||||
are tagged with tag 0x00. This means that any syscall
|
|
||||||
parameters containing user virtual addresses *must* have
|
|
||||||
their top byte cleared before trapping to the kernel.
|
|
||||||
|
|
||||||
(2) Non-zero tags are not preserved when delivering signals.
|
Passing tagged addresses to the kernel
|
||||||
This means that signal handlers in applications making use
|
--------------------------------------
|
||||||
of tags cannot rely on the tag information for user virtual
|
|
||||||
addresses being maintained for fields inside siginfo_t.
|
|
||||||
One exception to this rule is for signals raised in response
|
|
||||||
to watchpoint debug exceptions, where the tag information
|
|
||||||
will be preserved.
|
|
||||||
|
|
||||||
(3) Special care should be taken when using tagged pointers,
|
All interpretation of userspace memory addresses by the kernel assumes
|
||||||
since it is likely that C compilers will not hazard two
|
an address tag of 0x00.
|
||||||
virtual addresses differing only in the upper byte.
|
|
||||||
|
This includes, but is not limited to, addresses found in:
|
||||||
|
|
||||||
|
- pointer arguments to system calls, including pointers in structures
|
||||||
|
passed to system calls,
|
||||||
|
|
||||||
|
- the stack pointer (sp), e.g. when interpreting it to deliver a
|
||||||
|
signal,
|
||||||
|
|
||||||
|
- the frame pointer (x29) and frame records, e.g. when interpreting
|
||||||
|
them to generate a backtrace or call graph.
|
||||||
|
|
||||||
|
Using non-zero address tags in any of these locations may result in an
|
||||||
|
error code being returned, a (fatal) signal being raised, or other modes
|
||||||
|
of failure.
|
||||||
|
|
||||||
|
For these reasons, passing non-zero address tags to the kernel via
|
||||||
|
system calls is forbidden, and using a non-zero address tag for sp is
|
||||||
|
strongly discouraged.
|
||||||
|
|
||||||
|
Programs maintaining a frame pointer and frame records that use non-zero
|
||||||
|
address tags may suffer impaired or inaccurate debug and profiling
|
||||||
|
visibility.
|
||||||
|
|
||||||
|
|
||||||
|
Preserving tags
|
||||||
|
---------------
|
||||||
|
|
||||||
|
Non-zero tags are not preserved when delivering signals. This means that
|
||||||
|
signal handlers in applications making use of tags cannot rely on the
|
||||||
|
tag information for user virtual addresses being maintained for fields
|
||||||
|
inside siginfo_t. One exception to this rule is for signals raised in
|
||||||
|
response to watchpoint debug exceptions, where the tag information will
|
||||||
|
be preserved.
|
||||||
|
|
||||||
The architecture prevents the use of a tagged PC, so the upper byte will
|
The architecture prevents the use of a tagged PC, so the upper byte will
|
||||||
be set to a sign-extension of bit 55 on exception return.
|
be set to a sign-extension of bit 55 on exception return.
|
||||||
|
|
||||||
|
|
||||||
|
Other considerations
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
Special care should be taken when using tagged pointers, since it is
|
||||||
|
likely that C compilers will not hazard two virtual addresses differing
|
||||||
|
only in the upper byte.
|
||||||
|
@@ -1,5 +1,7 @@
|
|||||||
00-INDEX
|
00-INDEX
|
||||||
- This file
|
- This file
|
||||||
|
bfq-iosched.txt
|
||||||
|
- BFQ IO scheduler and its tunables
|
||||||
biodoc.txt
|
biodoc.txt
|
||||||
- Notes on the Generic Block Layer Rewrite in Linux 2.5
|
- Notes on the Generic Block Layer Rewrite in Linux 2.5
|
||||||
biovecs.txt
|
biovecs.txt
|
||||||
|
546
Documentation/block/bfq-iosched.txt
Normal file
546
Documentation/block/bfq-iosched.txt
Normal file
@@ -0,0 +1,546 @@
|
|||||||
|
BFQ (Budget Fair Queueing)
|
||||||
|
==========================
|
||||||
|
|
||||||
|
BFQ is a proportional-share I/O scheduler, with some extra
|
||||||
|
low-latency capabilities. In addition to cgroups support (blkio or io
|
||||||
|
controllers), BFQ's main features are:
|
||||||
|
- BFQ guarantees a high system and application responsiveness, and a
|
||||||
|
low latency for time-sensitive applications, such as audio or video
|
||||||
|
players;
|
||||||
|
- BFQ distributes bandwidth, and not just time, among processes or
|
||||||
|
groups (switching back to time distribution when needed to keep
|
||||||
|
throughput high).
|
||||||
|
|
||||||
|
In its default configuration, BFQ privileges latency over
|
||||||
|
throughput. So, when needed for achieving a lower latency, BFQ builds
|
||||||
|
schedules that may lead to a lower throughput. If your main or only
|
||||||
|
goal, for a given device, is to achieve the maximum-possible
|
||||||
|
throughput at all times, then do switch off all low-latency heuristics
|
||||||
|
for that device, by setting low_latency to 0. Full details in Section 3.
|
||||||
|
|
||||||
|
On average CPUs, the current version of BFQ can handle devices
|
||||||
|
performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a
|
||||||
|
reference, 30-50 KIOPS correspond to very high bandwidths with
|
||||||
|
sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and
|
||||||
|
to 120-200 MB/s with 4KB random I/O. BFQ has not yet been tested on
|
||||||
|
multi-queue devices.
|
||||||
|
|
||||||
|
The table of contents follows. Impatient readers can just jump to Section 3.
|
||||||
|
|
||||||
|
CONTENTS
|
||||||
|
|
||||||
|
1. When may BFQ be useful?
|
||||||
|
1-1 Personal systems
|
||||||
|
1-2 Server systems
|
||||||
|
2. How does BFQ work?
|
||||||
|
3. What are BFQ's tunables?
|
||||||
|
4. BFQ group scheduling
|
||||||
|
4-1 Service guarantees provided
|
||||||
|
4-2 Interface
|
||||||
|
|
||||||
|
1. When may BFQ be useful?
|
||||||
|
==========================
|
||||||
|
|
||||||
|
BFQ provides the following benefits on personal and server systems.
|
||||||
|
|
||||||
|
1-1 Personal systems
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
Low latency for interactive applications
|
||||||
|
|
||||||
|
Regardless of the actual background workload, BFQ guarantees that, for
|
||||||
|
interactive tasks, the storage device is virtually as responsive as if
|
||||||
|
it was idle. For example, even if one or more of the following
|
||||||
|
background workloads are being executed:
|
||||||
|
- one or more large files are being read, written or copied,
|
||||||
|
- a tree of source files is being compiled,
|
||||||
|
- one or more virtual machines are performing I/O,
|
||||||
|
- a software update is in progress,
|
||||||
|
- indexing daemons are scanning filesystems and updating their
|
||||||
|
databases,
|
||||||
|
starting an application or loading a file from within an application
|
||||||
|
takes about the same time as if the storage device was idle. As a
|
||||||
|
comparison, with CFQ, NOOP or DEADLINE, and in the same conditions,
|
||||||
|
applications experience high latencies, or even become unresponsive
|
||||||
|
until the background workload terminates (also on SSDs).
|
||||||
|
|
||||||
|
Low latency for soft real-time applications
|
||||||
|
|
||||||
|
Also soft real-time applications, such as audio and video
|
||||||
|
players/streamers, enjoy a low latency and a low drop rate, regardless
|
||||||
|
of the background I/O workload. As a consequence, these applications
|
||||||
|
suffer almost no glitches due to the background workload.
|
||||||
|
|
||||||
|
Higher speed for code-development tasks
|
||||||
|
|
||||||
|
If some additional workload happens to be executed in parallel, then
|
||||||
|
BFQ executes the I/O-related components of typical code-development
|
||||||
|
tasks (compilation, checkout, merge, ...) much more quickly than CFQ,
|
||||||
|
NOOP or DEADLINE.
|
||||||
|
|
||||||
|
High throughput
|
||||||
|
|
||||||
|
On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and
|
||||||
|
up to 150% higher throughput than DEADLINE and NOOP, with all the
|
||||||
|
sequential workloads considered in our tests. With random workloads,
|
||||||
|
and with all the workloads on flash-based devices, BFQ achieves,
|
||||||
|
instead, about the same throughput as the other schedulers.
|
||||||
|
|
||||||
|
Strong fairness, bandwidth and delay guarantees
|
||||||
|
|
||||||
|
BFQ distributes the device throughput, and not just the device time,
|
||||||
|
among I/O-bound applications in proportion to their weights, with any
|
||||||
|
workload and regardless of the device parameters. From these bandwidth
|
||||||
|
guarantees, it is possible to compute tight per-I/O-request delay
|
||||||
|
guarantees by a simple formula. If not configured for strict service
|
||||||
|
guarantees, BFQ switches to time-based resource sharing (only) for
|
||||||
|
applications that would otherwise cause a throughput loss.
|
||||||
|
|
||||||
|
1-2 Server systems
|
||||||
|
------------------
|
||||||
|
|
||||||
|
Most benefits for server systems follow from the same service
|
||||||
|
properties as above. In particular, regardless of whether additional,
|
||||||
|
possibly heavy workloads are being served, BFQ guarantees:
|
||||||
|
|
||||||
|
. audio and video-streaming with zero or very low jitter and drop
|
||||||
|
rate;
|
||||||
|
|
||||||
|
. fast retrieval of WEB pages and embedded objects;
|
||||||
|
|
||||||
|
. real-time recording of data in live-dumping applications (e.g.,
|
||||||
|
packet logging);
|
||||||
|
|
||||||
|
. responsiveness in local and remote access to a server.
|
||||||
|
|
||||||
|
|
||||||
|
2. How does BFQ work?
|
||||||
|
=====================
|
||||||
|
|
||||||
|
BFQ is a proportional-share I/O scheduler, whose general structure,
|
||||||
|
plus a lot of code, are borrowed from CFQ.
|
||||||
|
|
||||||
|
- Each process doing I/O on a device is associated with a weight and a
|
||||||
|
(bfq_)queue.
|
||||||
|
|
||||||
|
- BFQ grants exclusive access to the device, for a while, to one queue
|
||||||
|
(process) at a time, and implements this service model by
|
||||||
|
associating every queue with a budget, measured in number of
|
||||||
|
sectors.
|
||||||
|
|
||||||
|
- After a queue is granted access to the device, the budget of the
|
||||||
|
queue is decremented, on each request dispatch, by the size of the
|
||||||
|
request.
|
||||||
|
|
||||||
|
- The in-service queue is expired, i.e., its service is suspended,
|
||||||
|
only if one of the following events occurs: 1) the queue finishes
|
||||||
|
its budget, 2) the queue empties, 3) a "budget timeout" fires.
|
||||||
|
|
||||||
|
- The budget timeout prevents processes doing random I/O from
|
||||||
|
holding the device for too long and dramatically reducing
|
||||||
|
throughput.
|
||||||
|
|
||||||
|
- Actually, as in CFQ, a queue associated with a process issuing
|
||||||
|
sync requests may not be expired immediately when it empties. In
|
||||||
|
contrast, BFQ may idle the device for a short time interval,
|
||||||
|
giving the process the chance to go on being served if it issues
|
||||||
|
a new request in time. Device idling typically boosts the
|
||||||
|
throughput on rotational devices, if processes do synchronous
|
||||||
|
and sequential I/O. In addition, under BFQ, device idling is
|
||||||
|
also instrumental in guaranteeing the desired throughput
|
||||||
|
fraction to processes issuing sync requests (see the description
|
||||||
|
of the slice_idle tunable in this document, or [1, 2], for more
|
||||||
|
details).
|
||||||
|
|
||||||
|
- With respect to idling for service guarantees, if several
|
||||||
|
processes are competing for the device at the same time, but
|
||||||
|
all processes (and groups, after the following commit) have
|
||||||
|
the same weight, then BFQ guarantees the expected throughput
|
||||||
|
distribution without ever idling the device. Throughput is
|
||||||
|
thus as high as possible in this common scenario.
|
||||||
|
|
||||||
|
- If low-latency mode is enabled (default configuration), BFQ
|
||||||
|
executes some special heuristics to detect interactive and soft
|
||||||
|
real-time applications (e.g., video or audio players/streamers),
|
||||||
|
and to reduce their latency. The most important action taken to
|
||||||
|
achieve this goal is to give to the queues associated with these
|
||||||
|
applications more than their fair share of the device
|
||||||
|
throughput. For brevity, we call just "weight-raising" the whole
|
||||||
|
sets of actions taken by BFQ to privilege these queues. In
|
||||||
|
particular, BFQ provides a milder form of weight-raising for
|
||||||
|
interactive applications, and a stronger form for soft real-time
|
||||||
|
applications.
|
||||||
|
|
||||||
|
- BFQ automatically deactivates idling for queues born in a burst of
|
||||||
|
queue creations. In fact, these queues are usually associated with
|
||||||
|
the processes of applications and services that benefit mostly
|
||||||
|
from a high throughput. Examples are systemd during boot, or git
|
||||||
|
grep.
|
||||||
|
|
||||||
|
- Like CFQ, BFQ merges queues performing interleaved I/O, i.e.,
|
||||||
|
performing random I/O that becomes mostly sequential if
|
||||||
|
merged. Differently from CFQ, BFQ achieves this goal with a more
|
||||||
|
reactive mechanism, called Early Queue Merge (EQM). EQM is so
|
||||||
|
responsive in detecting interleaved I/O (cooperating processes),
|
||||||
|
that it enables BFQ to achieve a high throughput, by queue
|
||||||
|
merging, even for queues for which CFQ needs a different
|
||||||
|
mechanism, preemption, to get a high throughput. As such EQM is a
|
||||||
|
unified mechanism to achieve a high throughput with interleaved
|
||||||
|
I/O.
|
||||||
|
|
||||||
|
- Queues are scheduled according to a variant of WF2Q+, named
|
||||||
|
B-WF2Q+, and implemented using an augmented rb-tree to preserve an
|
||||||
|
O(log N) overall complexity. See [2] for more details. B-WF2Q+ is
|
||||||
|
also ready for hierarchical scheduling. However, for a cleaner
|
||||||
|
logical breakdown, the code that enables and completes
|
||||||
|
hierarchical support is provided in the next commit, which focuses
|
||||||
|
exactly on this feature.
|
||||||
|
|
||||||
|
- B-WF2Q+ guarantees a tight deviation with respect to an ideal,
|
||||||
|
perfectly fair, and smooth service. In particular, B-WF2Q+
|
||||||
|
guarantees that each queue receives a fraction of the device
|
||||||
|
throughput proportional to its weight, even if the throughput
|
||||||
|
fluctuates, and regardless of: the device parameters, the current
|
||||||
|
workload and the budgets assigned to the queue.
|
||||||
|
|
||||||
|
- The last, budget-independence, property (although probably
|
||||||
|
counterintuitive in the first place) is definitely beneficial, for
|
||||||
|
the following reasons:
|
||||||
|
|
||||||
|
- First, with any proportional-share scheduler, the maximum
|
||||||
|
deviation with respect to an ideal service is proportional to
|
||||||
|
the maximum budget (slice) assigned to queues. As a consequence,
|
||||||
|
BFQ can keep this deviation tight not only because of the
|
||||||
|
accurate service of B-WF2Q+, but also because BFQ *does not*
|
||||||
|
need to assign a larger budget to a queue to let the queue
|
||||||
|
receive a higher fraction of the device throughput.
|
||||||
|
|
||||||
|
- Second, BFQ is free to choose, for every process (queue), the
|
||||||
|
budget that best fits the needs of the process, or best
|
||||||
|
leverages the I/O pattern of the process. In particular, BFQ
|
||||||
|
updates queue budgets with a simple feedback-loop algorithm that
|
||||||
|
allows a high throughput to be achieved, while still providing
|
||||||
|
tight latency guarantees to time-sensitive applications. When
|
||||||
|
the in-service queue expires, this algorithm computes the next
|
||||||
|
budget of the queue so as to:
|
||||||
|
|
||||||
|
- Let large budgets be eventually assigned to the queues
|
||||||
|
associated with I/O-bound applications performing sequential
|
||||||
|
I/O: in fact, the longer these applications are served once
|
||||||
|
they get access to the device, the higher the throughput is.
|
||||||
|
|
||||||
|
- Let small budgets be eventually assigned to the queues
|
||||||
|
associated with time-sensitive applications (which typically
|
||||||
|
perform sporadic and short I/O), because, the smaller the
|
||||||
|
budget assigned to a queue waiting for service is, the sooner
|
||||||
|
B-WF2Q+ will serve that queue (Subsec 3.3 in [2]).
|
||||||
|
|
||||||
|
- If several processes are competing for the device at the same time,
|
||||||
|
but all processes and groups have the same weight, then BFQ
|
||||||
|
guarantees the expected throughput distribution without ever idling
|
||||||
|
the device. It uses preemption instead. Throughput is then much
|
||||||
|
higher in this common scenario.
|
||||||
|
|
||||||
|
- ioprio classes are served in strict priority order, i.e.,
|
||||||
|
lower-priority queues are not served as long as there are
|
||||||
|
higher-priority queues. Among queues in the same class, the
|
||||||
|
bandwidth is distributed in proportion to the weight of each
|
||||||
|
queue. A very thin extra bandwidth is however guaranteed to
|
||||||
|
the Idle class, to prevent it from starving.
|
||||||
|
|
||||||
|
|
||||||
|
3. What are BFQ's tunables?
|
||||||
|
===========================
|
||||||
|
|
||||||
|
The tunables back_seek_max, back_seek_penalty, fifo_expire_async and
|
||||||
|
fifo_expire_sync below are the same as in CFQ. Their description is
|
||||||
|
just copied from that for CFQ. Some considerations in the description
|
||||||
|
of slice_idle are copied from CFQ too.
|
||||||
|
|
||||||
|
per-process ioprio and weight
|
||||||
|
-----------------------------
|
||||||
|
|
||||||
|
Unless the cgroups interface is used (see "4. BFQ group scheduling"),
|
||||||
|
weights can be assigned to processes only indirectly, through I/O
|
||||||
|
priorities, and according to the relation:
|
||||||
|
weight = (IOPRIO_BE_NR - ioprio) * 10.
|
||||||
|
|
||||||
|
Beware that, if low-latency is set, then BFQ automatically raises the
|
||||||
|
weight of the queues associated with interactive and soft real-time
|
||||||
|
applications. Unset this tunable if you need/want to control weights.
|
||||||
|
|
||||||
|
slice_idle
|
||||||
|
----------
|
||||||
|
|
||||||
|
This parameter specifies how long BFQ should idle for next I/O
|
||||||
|
request, when certain sync BFQ queues become empty. By default
|
||||||
|
slice_idle is a non-zero value. Idling has a double purpose: boosting
|
||||||
|
throughput and making sure that the desired throughput distribution is
|
||||||
|
respected (see the description of how BFQ works, and, if needed, the
|
||||||
|
papers referred to there).
|
||||||
|
|
||||||
|
As for throughput, idling can be very helpful on highly seeky media
|
||||||
|
like single spindle SATA/SAS disks where we can cut down on overall
|
||||||
|
number of seeks and see improved throughput.
|
||||||
|
|
||||||
|
Setting slice_idle to 0 will remove all the idling on queues and one
|
||||||
|
should see an overall improved throughput on faster storage devices
|
||||||
|
like multiple SATA/SAS disks in hardware RAID configuration.
|
||||||
|
|
||||||
|
So depending on storage and workload, it might be useful to set
|
||||||
|
slice_idle=0. In general for SATA/SAS disks and software RAID of
|
||||||
|
SATA/SAS disks keeping slice_idle enabled should be useful. For any
|
||||||
|
configurations where there are multiple spindles behind single LUN
|
||||||
|
(Host based hardware RAID controller or for storage arrays), setting
|
||||||
|
slice_idle=0 might end up in better throughput and acceptable
|
||||||
|
latencies.
|
||||||
|
|
||||||
|
Idling is however necessary to have service guarantees enforced in
|
||||||
|
case of differentiated weights or differentiated I/O-request lengths.
|
||||||
|
To see why, suppose that a given BFQ queue A must get several I/O
|
||||||
|
requests served for each request served for another queue B. Idling
|
||||||
|
ensures that, if A makes a new I/O request slightly after becoming
|
||||||
|
empty, then no request of B is dispatched in the middle, and thus A
|
||||||
|
does not lose the possibility to get more than one request dispatched
|
||||||
|
before the next request of B is dispatched. Note that idling
|
||||||
|
guarantees the desired differentiated treatment of queues only in
|
||||||
|
terms of I/O-request dispatches. To guarantee that the actual service
|
||||||
|
order then corresponds to the dispatch order, the strict_guarantees
|
||||||
|
tunable must be set too.
|
||||||
|
|
||||||
|
There is an important flipside for idling: apart from the above cases
|
||||||
|
where it is beneficial also for throughput, idling can severely impact
|
||||||
|
throughput. One important case is random workload. Because of this
|
||||||
|
issue, BFQ tends to avoid idling as much as possible, when it is not
|
||||||
|
beneficial also for throughput. As a consequence of this behavior, and
|
||||||
|
of further issues described for the strict_guarantees tunable,
|
||||||
|
short-term service guarantees may be occasionally violated. And, in
|
||||||
|
some cases, these guarantees may be more important than guaranteeing
|
||||||
|
maximum throughput. For example, in video playing/streaming, a very
|
||||||
|
low drop rate may be more important than maximum throughput. In these
|
||||||
|
cases, consider setting the strict_guarantees parameter.
|
||||||
|
|
||||||
|
strict_guarantees
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
If this parameter is set (default: unset), then BFQ
|
||||||
|
|
||||||
|
- always performs idling when the in-service queue becomes empty;
|
||||||
|
|
||||||
|
- forces the device to serve one I/O request at a time, by dispatching a
|
||||||
|
new request only if there is no outstanding request.
|
||||||
|
|
||||||
|
In the presence of differentiated weights or I/O-request sizes, both
|
||||||
|
the above conditions are needed to guarantee that every BFQ queue
|
||||||
|
receives its allotted share of the bandwidth. The first condition is
|
||||||
|
needed for the reasons explained in the description of the slice_idle
|
||||||
|
tunable. The second condition is needed because all modern storage
|
||||||
|
devices reorder internally-queued requests, which may trivially break
|
||||||
|
the service guarantees enforced by the I/O scheduler.
|
||||||
|
|
||||||
|
Setting strict_guarantees may evidently affect throughput.
|
||||||
|
|
||||||
|
back_seek_max
|
||||||
|
-------------
|
||||||
|
|
||||||
|
This specifies, given in Kbytes, the maximum "distance" for backward seeking.
|
||||||
|
The distance is the amount of space from the current head location to the
|
||||||
|
sectors that are backward in terms of distance.
|
||||||
|
|
||||||
|
This parameter allows the scheduler to anticipate requests in the "backward"
|
||||||
|
direction and consider them as being the "next" if they are within this
|
||||||
|
distance from the current head location.
|
||||||
|
|
||||||
|
back_seek_penalty
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
This parameter is used to compute the cost of backward seeking. If the
|
||||||
|
backward distance of a request is just 1/back_seek_penalty from a "front"
|
||||||
|
request, then the seeking cost of two requests is considered equivalent.
|
||||||
|
|
||||||
|
So the scheduler will not bias toward one or the other request (otherwise the scheduler
|
||||||
|
will bias toward the front request). Default value of back_seek_penalty is 2.
|
||||||
|
|
||||||
|
fifo_expire_async
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
This parameter is used to set the timeout of asynchronous requests. Default
|
||||||
|
value of this is 248ms.
|
||||||
|
|
||||||
|
fifo_expire_sync
|
||||||
|
----------------
|
||||||
|
|
||||||
|
This parameter is used to set the timeout of synchronous requests. Default
|
||||||
|
value of this is 124ms. To favor synchronous requests over asynchronous
|
||||||
|
ones, this value should be decreased relative to fifo_expire_async.
|
||||||
|
|
||||||
|
low_latency
|
||||||
|
-----------
|
||||||
|
|
||||||
|
This parameter is used to enable/disable BFQ's low latency mode. By
|
||||||
|
default, low latency mode is enabled. If enabled, interactive and soft
|
||||||
|
real-time applications are privileged and experience a lower latency,
|
||||||
|
as explained in more detail in the description of how BFQ works.
|
||||||
|
|
||||||
|
DISABLE this mode if you need full control on bandwidth
|
||||||
|
distribution. In fact, if it is enabled, then BFQ automatically
|
||||||
|
increases the bandwidth share of privileged applications, as the main
|
||||||
|
means to guarantee a lower latency to them.
|
||||||
|
|
||||||
|
In addition, as already highlighted at the beginning of this document,
|
||||||
|
DISABLE this mode if your only goal is to achieve a high throughput.
|
||||||
|
In fact, privileging the I/O of some application over the rest may
|
||||||
|
entail a lower throughput. To achieve the highest-possible throughput
|
||||||
|
on a non-rotational device, setting slice_idle to 0 may be needed too
|
||||||
|
(at the cost of giving up any strong guarantee on fairness and low
|
||||||
|
latency).
|
||||||
|
|
||||||
|
timeout_sync
|
||||||
|
------------
|
||||||
|
|
||||||
|
Maximum amount of device time that can be given to a task (queue) once
|
||||||
|
it has been selected for service. On devices with costly seeks,
|
||||||
|
increasing this time usually increases maximum throughput. On the
|
||||||
|
opposite end, increasing this time coarsens the granularity of the
|
||||||
|
short-term bandwidth and latency guarantees, especially if the
|
||||||
|
following parameter is set to zero.
|
||||||
|
|
||||||
|
max_budget
|
||||||
|
----------
|
||||||
|
|
||||||
|
Maximum amount of service, measured in sectors, that can be provided
|
||||||
|
to a BFQ queue once it is set in service (of course within the limits
|
||||||
|
of the above timeout). According to what is said in the description of
|
||||||
|
the algorithm, larger values increase the throughput in proportion to
|
||||||
|
the percentage of sequential I/O requests issued. The price of larger
|
||||||
|
values is that they coarsen the granularity of short-term bandwidth
|
||||||
|
and latency guarantees.
|
||||||
|
|
||||||
|
The default value is 0, which enables auto-tuning: BFQ sets max_budget
|
||||||
|
to the maximum number of sectors that can be served during
|
||||||
|
timeout_sync, according to the estimated peak rate.
|
||||||
|
|
||||||
|
weights
|
||||||
|
-------
|
||||||
|
|
||||||
|
Read-only parameter, used to show the weights of the currently active
|
||||||
|
BFQ queues.
|
||||||
|
|
||||||
|
|
||||||
|
wr_ tunables
|
||||||
|
------------
|
||||||
|
|
||||||
|
BFQ exports a few parameters to control/tune the behavior of
|
||||||
|
low-latency heuristics.
|
||||||
|
|
||||||
|
wr_coeff
|
||||||
|
|
||||||
|
Factor by which the weight of a weight-raised queue is multiplied. If
|
||||||
|
the queue is deemed soft real-time, then the weight is further
|
||||||
|
multiplied by an additional, constant factor.
|
||||||
|
|
||||||
|
wr_max_time
|
||||||
|
|
||||||
|
Maximum duration of a weight-raising period for an interactive task
|
||||||
|
(ms). If set to zero (default value), then this value is computed
|
||||||
|
automatically, as a function of the peak rate of the device. In any
|
||||||
|
case, when the value of this parameter is read, it always reports the
|
||||||
|
current duration, regardless of whether it has been set manually or
|
||||||
|
computed automatically.
|
||||||
|
|
||||||
|
wr_max_softrt_rate
|
||||||
|
|
||||||
|
Maximum service rate below which a queue is deemed to be associated
|
||||||
|
with a soft real-time application, and is then weight-raised
|
||||||
|
accordingly (sectors/sec).
|
||||||
|
|
||||||
|
wr_min_idle_time
|
||||||
|
|
||||||
|
Minimum idle period after which interactive weight-raising may be
|
||||||
|
reactivated for a queue (in ms).
|
||||||
|
|
||||||
|
wr_rt_max_time
|
||||||
|
|
||||||
|
Maximum weight-raising duration for soft real-time queues (in ms). The
|
||||||
|
start time from which this duration is considered is automatically
|
||||||
|
moved forward if the queue is detected to be still soft real-time
|
||||||
|
before the current soft real-time weight-raising period finishes.
|
||||||
|
|
||||||
|
wr_min_inter_arr_async
|
||||||
|
|
||||||
|
Minimum period between I/O request arrivals after which weight-raising
|
||||||
|
may be reactivated for an already busy async queue (in ms).
|
||||||
|
|
||||||
|
|
||||||
|
4. Group scheduling with BFQ
|
||||||
|
============================
|
||||||
|
|
||||||
|
BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely
|
||||||
|
blkio and io. In particular, BFQ supports weight-based proportional
|
||||||
|
share. To activate cgroups support, set BFQ_GROUP_IOSCHED.
|
||||||
|
|
||||||
|
4-1 Service guarantees provided
|
||||||
|
-------------------------------
|
||||||
|
|
||||||
|
With BFQ, proportional share means true proportional share of the
|
||||||
|
device bandwidth, according to group weights. For example, a group
|
||||||
|
with weight 200 gets twice the bandwidth, and not just twice the time,
|
||||||
|
of a group with weight 100.
|
||||||
|
|
||||||
|
BFQ supports hierarchies (group trees) of any depth. Bandwidth is
|
||||||
|
distributed among groups and processes in the expected way: for each
|
||||||
|
group, the children of the group share the whole bandwidth of the
|
||||||
|
group in proportion to their weights. In particular, this implies
|
||||||
|
that, for each leaf group, every process of the group receives the
|
||||||
|
same share of the whole group bandwidth, unless the ioprio of the
|
||||||
|
process is modified.
|
||||||
|
|
||||||
|
The resource-sharing guarantee for a group may partially or totally
|
||||||
|
switch from bandwidth to time, if providing bandwidth guarantees to
|
||||||
|
the group lowers the throughput too much. This switch occurs on a
|
||||||
|
per-process basis: if a process of a leaf group causes throughput loss
|
||||||
|
if served in such a way as to receive its share of the bandwidth, then
|
||||||
|
BFQ switches back to just time-based proportional share for that
|
||||||
|
process.
|
||||||
|
|
||||||
|
4-2 Interface
|
||||||
|
-------------
|
||||||
|
|
||||||
|
To get proportional sharing of bandwidth with BFQ for a given device,
|
||||||
|
BFQ must of course be the active scheduler for that device.
|
||||||
|
|
||||||
|
Within each group directory, the names of the files associated with
|
||||||
|
BFQ-specific cgroup parameters and stats begin with the "bfq."
|
||||||
|
prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for
|
||||||
|
BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group
|
||||||
|
parameter to set the weight of a group with BFQ is blkio.bfq.weight
|
||||||
|
or io.bfq.weight.
|
||||||
|
|
||||||
|
Parameters to set
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
For each group, there is only the following parameter to set.
|
||||||
|
|
||||||
|
weight (namely blkio.bfq.weight or io.bfq.weight): the weight of the
|
||||||
|
group inside its parent. Available values: 1..10000 (default 100). The
|
||||||
|
linear mapping between ioprio and weights, described at the beginning
|
||||||
|
of the tunable section, is still valid, but all weights higher than
|
||||||
|
IOPRIO_BE_NR*10 are mapped to ioprio 0.
|
||||||
|
|
||||||
|
Recall that, if low_latency is set, then BFQ automatically raises the
|
||||||
|
weight of the queues associated with interactive and soft real-time
|
||||||
|
applications. Unset this tunable if you need/want to control weights.
|
||||||
|
|
||||||
|
|
||||||
|
[1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
|
||||||
|
Scheduler", Proceedings of the First Workshop on Mobile System
|
||||||
|
Technologies (MST-2015), May 2015.
|
||||||
|
http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
|
||||||
|
|
||||||
|
[2] P. Valente and M. Andreolini, "Improving Application
|
||||||
|
Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of
|
||||||
|
the 5th Annual International Systems and Storage Conference
|
||||||
|
(SYSTOR '12), June 2012.
|
||||||
|
Slightly extended version:
|
||||||
|
http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-
|
||||||
|
results.pdf
|
14
Documentation/block/kyber-iosched.txt
Normal file
14
Documentation/block/kyber-iosched.txt
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
Kyber I/O scheduler tunables
|
||||||
|
============================
|
||||||
|
|
||||||
|
The only two tunables for the Kyber scheduler are the target latencies for
|
||||||
|
reads and synchronous writes. Kyber will throttle requests in order to meet
|
||||||
|
these target latencies.
|
||||||
|
|
||||||
|
read_lat_nsec
|
||||||
|
-------------
|
||||||
|
Target latency for reads (in nanoseconds).
|
||||||
|
|
||||||
|
write_lat_nsec
|
||||||
|
--------------
|
||||||
|
Target latency for synchronous writes (in nanoseconds).
|
@@ -43,11 +43,6 @@ large discards are issued, setting this value lower will make Linux issue
|
|||||||
smaller discards and potentially help reduce latencies induced by large
|
smaller discards and potentially help reduce latencies induced by large
|
||||||
discard operations.
|
discard operations.
|
||||||
|
|
||||||
discard_zeroes_data (RO)
|
|
||||||
------------------------
|
|
||||||
When read, this file will show if the discarded block are zeroed by the
|
|
||||||
device or not. If its value is '1' the blocks are zeroed otherwise not.
|
|
||||||
|
|
||||||
hw_sector_size (RO)
|
hw_sector_size (RO)
|
||||||
-------------------
|
-------------------
|
||||||
This is the hardware sector size of the device, in bytes.
|
This is the hardware sector size of the device, in bytes.
|
||||||
@@ -192,5 +187,11 @@ scaling back writes. Writing a value of '0' to this file disables the
|
|||||||
feature. Writing a value of '-1' to this file resets the value to the
|
feature. Writing a value of '-1' to this file resets the value to the
|
||||||
default setting.
|
default setting.
|
||||||
|
|
||||||
|
throttle_sample_time (RW)
|
||||||
|
-------------------------
|
||||||
|
This is the time window over which blk-throttle samples data, in milliseconds.
|
||||||
|
blk-throttle makes decisions based on these samples. A lower time means cgroups
|
||||||
|
have smoother throughput, but higher CPU overhead. This exists only when
|
||||||
|
CONFIG_BLK_DEV_THROTTLING_LOW is enabled.
|
||||||
|
|
||||||
Jens Axboe <jens.axboe@oracle.com>, February 2009
|
Jens Axboe <jens.axboe@oracle.com>, February 2009
|
||||||
|
@@ -1,84 +0,0 @@
|
|||||||
This document describes m[g]flash support in linux.
|
|
||||||
|
|
||||||
Contents
|
|
||||||
1. Overview
|
|
||||||
2. Reserved area configuration
|
|
||||||
3. Example of mflash platform driver registration
|
|
||||||
|
|
||||||
1. Overview
|
|
||||||
|
|
||||||
Mflash and gflash are embedded flash drive. The only difference is mflash is
|
|
||||||
MCP(Multi Chip Package) device. These two device operate exactly same way.
|
|
||||||
So the rest mflash repersents mflash and gflash altogether.
|
|
||||||
|
|
||||||
Internally, mflash has nand flash and other hardware logics and supports
|
|
||||||
2 different operation (ATA, IO) modes. ATA mode doesn't need any new
|
|
||||||
driver and currently works well under standard IDE subsystem. Actually it's
|
|
||||||
one chip SSD. IO mode is ATA-like custom mode for the host that doesn't have
|
|
||||||
IDE interface.
|
|
||||||
|
|
||||||
Following are brief descriptions about IO mode.
|
|
||||||
A. IO mode based on ATA protocol and uses some custom command. (read confirm,
|
|
||||||
write confirm)
|
|
||||||
B. IO mode uses SRAM bus interface.
|
|
||||||
C. IO mode supports 4kB boot area, so host can boot from mflash.
|
|
||||||
|
|
||||||
2. Reserved area configuration
|
|
||||||
If host boot from mflash, usually needs raw area for boot loader image. All of
|
|
||||||
the mflash's block device operation will be taken this value as start offset.
|
|
||||||
Note that boot loader's size of reserved area and kernel configuration value
|
|
||||||
must be same.
|
|
||||||
|
|
||||||
3. Example of mflash platform driver registration
|
|
||||||
Working mflash is very straight forward. Adding platform device stuff to board
|
|
||||||
configuration file is all. Here is some pseudo example.
|
|
||||||
|
|
||||||
static struct mg_drv_data mflash_drv_data = {
|
|
||||||
/* If you want to polling driver set to 1 */
|
|
||||||
.use_polling = 0,
|
|
||||||
/* device attribution */
|
|
||||||
.dev_attr = MG_BOOT_DEV
|
|
||||||
};
|
|
||||||
|
|
||||||
static struct resource mg_mflash_rsc[] = {
|
|
||||||
/* Base address of mflash */
|
|
||||||
[0] = {
|
|
||||||
.start = 0x08000000,
|
|
||||||
.end = 0x08000000 + SZ_64K - 1,
|
|
||||||
.flags = IORESOURCE_MEM
|
|
||||||
},
|
|
||||||
/* mflash interrupt pin */
|
|
||||||
[1] = {
|
|
||||||
.start = IRQ_GPIO(84),
|
|
||||||
.end = IRQ_GPIO(84),
|
|
||||||
.flags = IORESOURCE_IRQ
|
|
||||||
},
|
|
||||||
/* mflash reset pin */
|
|
||||||
[2] = {
|
|
||||||
.start = 43,
|
|
||||||
.end = 43,
|
|
||||||
.name = MG_RST_PIN,
|
|
||||||
.flags = IORESOURCE_IO
|
|
||||||
},
|
|
||||||
/* mflash reset-out pin
|
|
||||||
* If you use mflash as storage device (i.e. other than MG_BOOT_DEV),
|
|
||||||
* should assign this */
|
|
||||||
[3] = {
|
|
||||||
.start = 51,
|
|
||||||
.end = 51,
|
|
||||||
.name = MG_RSTOUT_PIN,
|
|
||||||
.flags = IORESOURCE_IO
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
static struct platform_device mflash_dev = {
|
|
||||||
.name = MG_DEV_NAME,
|
|
||||||
.id = -1,
|
|
||||||
.dev = {
|
|
||||||
.platform_data = &mflash_drv_data,
|
|
||||||
},
|
|
||||||
.num_resources = ARRAY_SIZE(mg_mflash_rsc),
|
|
||||||
.resource = mg_mflash_rsc
|
|
||||||
};
|
|
||||||
|
|
||||||
platform_device_register(&mflash_dev);
|
|
@@ -871,6 +871,11 @@ PAGE_SIZE multiple when read back.
|
|||||||
|
|
||||||
Amount of memory used in network transmission buffers
|
Amount of memory used in network transmission buffers
|
||||||
|
|
||||||
|
shmem
|
||||||
|
|
||||||
|
Amount of cached filesystem data that is swap-backed,
|
||||||
|
such as tmpfs, shm segments, shared anonymous mmap()s
|
||||||
|
|
||||||
file_mapped
|
file_mapped
|
||||||
|
|
||||||
Amount of cached filesystem data mapped with mmap()
|
Amount of cached filesystem data mapped with mmap()
|
||||||
@@ -913,6 +918,18 @@ PAGE_SIZE multiple when read back.
|
|||||||
|
|
||||||
Number of major page faults incurred
|
Number of major page faults incurred
|
||||||
|
|
||||||
|
workingset_refault
|
||||||
|
|
||||||
|
Number of refaults of previously evicted pages
|
||||||
|
|
||||||
|
workingset_activate
|
||||||
|
|
||||||
|
Number of refaulted pages that were immediately activated
|
||||||
|
|
||||||
|
workingset_nodereclaim
|
||||||
|
|
||||||
|
Number of times a shadow node has been reclaimed
|
||||||
|
|
||||||
memory.swap.current
|
memory.swap.current
|
||||||
|
|
||||||
A read-only single value file which exists on non-root
|
A read-only single value file which exists on non-root
|
||||||
|
@@ -17,7 +17,7 @@ import os
|
|||||||
import sphinx
|
import sphinx
|
||||||
|
|
||||||
# Get Sphinx version
|
# Get Sphinx version
|
||||||
major, minor, patch = map(int, sphinx.__version__.split("."))
|
major, minor, patch = sphinx.version_info[:3]
|
||||||
|
|
||||||
|
|
||||||
# If extensions (or modules to document with autodoc) are in another directory,
|
# If extensions (or modules to document with autodoc) are in another directory,
|
||||||
@@ -29,12 +29,12 @@ from load_config import loadConfig
|
|||||||
# -- General configuration ------------------------------------------------
|
# -- General configuration ------------------------------------------------
|
||||||
|
|
||||||
# If your documentation needs a minimal Sphinx version, state it here.
|
# If your documentation needs a minimal Sphinx version, state it here.
|
||||||
#needs_sphinx = '1.0'
|
needs_sphinx = '1.2'
|
||||||
|
|
||||||
# Add any Sphinx extension module names here, as strings. They can be
|
# Add any Sphinx extension module names here, as strings. They can be
|
||||||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||||
# ones.
|
# ones.
|
||||||
extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain']
|
extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain', 'kfigure']
|
||||||
|
|
||||||
# The name of the math extension changed on Sphinx 1.4
|
# The name of the math extension changed on Sphinx 1.4
|
||||||
if major == 1 and minor > 3:
|
if major == 1 and minor > 3:
|
||||||
@@ -348,6 +348,8 @@ latex_documents = [
|
|||||||
'The kernel development community', 'manual'),
|
'The kernel development community', 'manual'),
|
||||||
('driver-api/index', 'driver-api.tex', 'The kernel driver API manual',
|
('driver-api/index', 'driver-api.tex', 'The kernel driver API manual',
|
||||||
'The kernel development community', 'manual'),
|
'The kernel development community', 'manual'),
|
||||||
|
('input/index', 'linux-input.tex', 'The Linux input driver subsystem',
|
||||||
|
'The kernel development community', 'manual'),
|
||||||
('kernel-documentation', 'kernel-documentation.tex', 'The Linux Kernel Documentation',
|
('kernel-documentation', 'kernel-documentation.tex', 'The Linux Kernel Documentation',
|
||||||
'The kernel development community', 'manual'),
|
'The kernel development community', 'manual'),
|
||||||
('process/index', 'development-process.tex', 'Linux Kernel Development Documentation',
|
('process/index', 'development-process.tex', 'Linux Kernel Development Documentation',
|
||||||
|
130
Documentation/core-api/flexible-arrays.rst
Normal file
130
Documentation/core-api/flexible-arrays.rst
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
|
||||||
|
===================================
|
||||||
|
Using flexible arrays in the kernel
|
||||||
|
===================================
|
||||||
|
|
||||||
|
Large contiguous memory allocations can be unreliable in the Linux kernel.
|
||||||
|
Kernel programmers will sometimes respond to this problem by allocating
|
||||||
|
pages with :c:func:`vmalloc()`. This solution is not ideal, though. On 32-bit
|
||||||
|
systems, memory from vmalloc() must be mapped into a relatively small address
|
||||||
|
space; it's easy to run out. On SMP systems, the page table changes required
|
||||||
|
by vmalloc() allocations can require expensive cross-processor interrupts on
|
||||||
|
all CPUs. And, on all systems, use of space in the vmalloc() range increases
|
||||||
|
pressure on the translation lookaside buffer (TLB), reducing the performance
|
||||||
|
of the system.
|
||||||
|
|
||||||
|
In many cases, the need for memory from vmalloc() can be eliminated by piecing
|
||||||
|
together an array from smaller parts; the flexible array library exists to make
|
||||||
|
this task easier.
|
||||||
|
|
||||||
|
A flexible array holds an arbitrary (within limits) number of fixed-sized
|
||||||
|
objects, accessed via an integer index. Sparse arrays are handled
|
||||||
|
reasonably well. Only single-page allocations are made, so memory
|
||||||
|
allocation failures should be relatively rare. The down sides are that the
|
||||||
|
arrays cannot be indexed directly, individual object size cannot exceed the
|
||||||
|
system page size, and putting data into a flexible array requires a copy
|
||||||
|
operation. It's also worth noting that flexible arrays do no internal
|
||||||
|
locking at all; if concurrent access to an array is possible, then the
|
||||||
|
caller must arrange for appropriate mutual exclusion.
|
||||||
|
|
||||||
|
The creation of a flexible array is done with :c:func:`flex_array_alloc()`::
|
||||||
|
|
||||||
|
#include <linux/flex_array.h>
|
||||||
|
|
||||||
|
struct flex_array *flex_array_alloc(int element_size,
|
||||||
|
unsigned int total,
|
||||||
|
gfp_t flags);
|
||||||
|
|
||||||
|
The individual object size is provided by ``element_size``, while total is the
|
||||||
|
maximum number of objects which can be stored in the array. The flags
|
||||||
|
argument is passed directly to the internal memory allocation calls. With
|
||||||
|
the current code, using flags to ask for high memory is likely to lead to
|
||||||
|
notably unpleasant side effects.
|
||||||
|
|
||||||
|
It is also possible to define flexible arrays at compile time with::
|
||||||
|
|
||||||
|
DEFINE_FLEX_ARRAY(name, element_size, total);
|
||||||
|
|
||||||
|
This macro will result in a definition of an array with the given name; the
|
||||||
|
element size and total will be checked for validity at compile time.
|
||||||
|
|
||||||
|
Storing data into a flexible array is accomplished with a call to
|
||||||
|
:c:func:`flex_array_put()`::
|
||||||
|
|
||||||
|
int flex_array_put(struct flex_array *array, unsigned int element_nr,
|
||||||
|
void *src, gfp_t flags);
|
||||||
|
|
||||||
|
This call will copy the data from src into the array, in the position
|
||||||
|
indicated by ``element_nr`` (which must be less than the maximum specified when
|
||||||
|
the array was created). If any memory allocations must be performed, flags
|
||||||
|
will be used. The return value is zero on success, a negative error code
|
||||||
|
otherwise.
|
||||||
|
|
||||||
|
There might possibly be a need to store data into a flexible array while
|
||||||
|
running in some sort of atomic context; in this situation, sleeping in the
|
||||||
|
memory allocator would be a bad thing. That can be avoided by using
|
||||||
|
``GFP_ATOMIC`` for the flags value, but, often, there is a better way. The
|
||||||
|
trick is to ensure that any needed memory allocations are done before
|
||||||
|
entering atomic context, using :c:func:`flex_array_prealloc()`::
|
||||||
|
|
||||||
|
int flex_array_prealloc(struct flex_array *array, unsigned int start,
|
||||||
|
unsigned int nr_elements, gfp_t flags);
|
||||||
|
|
||||||
|
This function will ensure that memory for the elements indexed in the range
|
||||||
|
defined by ``start`` and ``nr_elements`` has been allocated. Thereafter, a
|
||||||
|
``flex_array_put()`` call on an element in that range is guaranteed not to
|
||||||
|
block.
|
||||||
|
|
||||||
|
Getting data back out of the array is done with :c:func:`flex_array_get()`::
|
||||||
|
|
||||||
|
void *flex_array_get(struct flex_array *fa, unsigned int element_nr);
|
||||||
|
|
||||||
|
The return value is a pointer to the data element, or NULL if that
|
||||||
|
particular element has never been allocated.
|
||||||
|
|
||||||
|
Note that it is possible to get back a valid pointer for an element which
|
||||||
|
has never been stored in the array. Memory for array elements is allocated
|
||||||
|
one page at a time; a single allocation could provide memory for several
|
||||||
|
adjacent elements. Flexible array elements are normally initialized to the
|
||||||
|
value ``FLEX_ARRAY_FREE`` (defined as 0x6c in <linux/poison.h>), so errors
|
||||||
|
involving that number probably result from use of unstored array entries.
|
||||||
|
Note that, if array elements are allocated with ``__GFP_ZERO``, they will be
|
||||||
|
initialized to zero and this poisoning will not happen.
|
||||||
|
|
||||||
|
Individual elements in the array can be cleared with
|
||||||
|
:c:func:`flex_array_clear()`::
|
||||||
|
|
||||||
|
int flex_array_clear(struct flex_array *array, unsigned int element_nr);
|
||||||
|
|
||||||
|
This function will set the given element to ``FLEX_ARRAY_FREE`` and return
|
||||||
|
zero. If storage for the indicated element is not allocated for the array,
|
||||||
|
``flex_array_clear()`` will return ``-EINVAL`` instead. Note that clearing an
|
||||||
|
element does not release the storage associated with it; to reduce the
|
||||||
|
allocated size of an array, call :c:func:`flex_array_shrink()`::
|
||||||
|
|
||||||
|
int flex_array_shrink(struct flex_array *array);
|
||||||
|
|
||||||
|
The return value will be the number of pages of memory actually freed.
|
||||||
|
This function works by scanning the array for pages containing nothing but
|
||||||
|
``FLEX_ARRAY_FREE`` bytes, so (1) it can be expensive, and (2) it will not work
|
||||||
|
if the array's pages are allocated with ``__GFP_ZERO``.
|
||||||
|
|
||||||
|
It is possible to remove all elements of an array with a call to
|
||||||
|
:c:func:`flex_array_free_parts()`::
|
||||||
|
|
||||||
|
void flex_array_free_parts(struct flex_array *array);
|
||||||
|
|
||||||
|
This call frees all elements, but leaves the array itself in place.
|
||||||
|
Freeing the entire array is done with :c:func:`flex_array_free()`::
|
||||||
|
|
||||||
|
void flex_array_free(struct flex_array *array);
|
||||||
|
|
||||||
|
As of this writing, there are no users of flexible arrays in the mainline
|
||||||
|
kernel. The functions described here are also not exported to modules;
|
||||||
|
that will probably be fixed when somebody comes up with a need for it.
|
||||||
|
|
||||||
|
|
||||||
|
Flexible array functions
|
||||||
|
------------------------
|
||||||
|
|
||||||
|
.. kernel-doc:: include/linux/flex_array.h
|
440
Documentation/core-api/genericirq.rst
Normal file
440
Documentation/core-api/genericirq.rst
Normal file
@@ -0,0 +1,440 @@
|
|||||||
|
.. include:: <isonum.txt>
|
||||||
|
|
||||||
|
==========================
|
||||||
|
Linux generic IRQ handling
|
||||||
|
==========================
|
||||||
|
|
||||||
|
:Copyright: |copy| 2005-2010: Thomas Gleixner
|
||||||
|
:Copyright: |copy| 2005-2006: Ingo Molnar
|
||||||
|
|
||||||
|
Introduction
|
||||||
|
============
|
||||||
|
|
||||||
|
The generic interrupt handling layer is designed to provide a complete
|
||||||
|
abstraction of interrupt handling for device drivers. It is able to
|
||||||
|
handle all the different types of interrupt controller hardware. Device
|
||||||
|
drivers use generic API functions to request, enable, disable and free
|
||||||
|
interrupts. The drivers do not have to know anything about interrupt
|
||||||
|
hardware details, so they can be used on different platforms without
|
||||||
|
code changes.
|
||||||
|
|
||||||
|
This documentation is provided to developers who want to implement an
|
||||||
|
interrupt subsystem for their architecture, with the help of the
|
||||||
|
generic IRQ handling layer.
|
||||||
|
|
||||||
|
Rationale
|
||||||
|
=========
|
||||||
|
|
||||||
|
The original implementation of interrupt handling in Linux uses the
|
||||||
|
:c:func:`__do_IRQ` super-handler, which is able to deal with every type of
|
||||||
|
interrupt logic.
|
||||||
|
|
||||||
|
Originally, Russell King identified different types of handlers to build
|
||||||
|
a quite universal set for the ARM interrupt handler implementation in
|
||||||
|
Linux 2.5/2.6. He distinguished between:
|
||||||
|
|
||||||
|
- Level type
|
||||||
|
|
||||||
|
- Edge type
|
||||||
|
|
||||||
|
- Simple type
|
||||||
|
|
||||||
|
During the implementation we identified another type:
|
||||||
|
|
||||||
|
- Fast EOI type
|
||||||
|
|
||||||
|
In the SMP world of the :c:func:`__do_IRQ` super-handler another type was
|
||||||
|
identified:
|
||||||
|
|
||||||
|
- Per CPU type
|
||||||
|
|
||||||
|
This split implementation of high-level IRQ handlers allows us to
|
||||||
|
optimize the flow of the interrupt handling for each specific interrupt
|
||||||
|
type. This reduces complexity in that particular code path and allows
|
||||||
|
the optimized handling of a given type.
|
||||||
|
|
||||||
|
The original general IRQ implementation used hw_interrupt_type
|
||||||
|
structures and their ``->ack``, ``->end`` [etc.] callbacks to differentiate
|
||||||
|
the flow control in the super-handler. This leads to a mix of flow logic
|
||||||
|
and low-level hardware logic, and it also leads to unnecessary code
|
||||||
|
duplication: for example in i386, there is an ``ioapic_level_irq`` and an
|
||||||
|
``ioapic_edge_irq`` IRQ-type which share many of the low-level details but
|
||||||
|
have different flow handling.
|
||||||
|
|
||||||
|
A more natural abstraction is the clean separation of the 'irq flow' and
|
||||||
|
the 'chip details'.
|
||||||
|
|
||||||
|
Analysing a couple of architectures' IRQ subsystem implementations
|
||||||
|
reveals that most of them can use a generic set of 'irq flow' methods
|
||||||
|
and only need to add the chip-level specific code. The separation is
|
||||||
|
also valuable for (sub)architectures which need specific quirks in the
|
||||||
|
IRQ flow itself but not in the chip details - and thus provides a more
|
||||||
|
transparent IRQ subsystem design.
|
||||||
|
|
||||||
|
Each interrupt descriptor is assigned its own high-level flow handler,
|
||||||
|
which is normally one of the generic implementations. (This high-level
|
||||||
|
flow handler implementation also makes it simple to provide
|
||||||
|
demultiplexing handlers which can be found in embedded platforms on
|
||||||
|
various architectures.)
|
||||||
|
|
||||||
|
The separation makes the generic interrupt handling layer more flexible
|
||||||
|
and extensible. For example, an (sub)architecture can use a generic
|
||||||
|
IRQ-flow implementation for 'level type' interrupts and add a
|
||||||
|
(sub)architecture specific 'edge type' implementation.
|
||||||
|
|
||||||
|
To make the transition to the new model easier and prevent the breakage
|
||||||
|
of existing implementations, the :c:func:`__do_IRQ` super-handler is still
|
||||||
|
available. This leads to a kind of duality for the time being. Over time
|
||||||
|
the new model should be used in more and more architectures, as it
|
||||||
|
enables smaller and cleaner IRQ subsystems. It has been deprecated for three
|
||||||
|
years now and is about to be removed.
|
||||||
|
|
||||||
|
Known Bugs And Assumptions
|
||||||
|
==========================
|
||||||
|
|
||||||
|
None (knock on wood).
|
||||||
|
|
||||||
|
Abstraction layers
|
||||||
|
==================
|
||||||
|
|
||||||
|
There are three main levels of abstraction in the interrupt code:
|
||||||
|
|
||||||
|
1. High-level driver API
|
||||||
|
|
||||||
|
2. High-level IRQ flow handlers
|
||||||
|
|
||||||
|
3. Chip-level hardware encapsulation
|
||||||
|
|
||||||
|
Interrupt control flow
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
Each interrupt is described by an interrupt descriptor structure
|
||||||
|
irq_desc. The interrupt is referenced by an 'unsigned int' numeric
|
||||||
|
value which selects the corresponding interrupt description structure in
|
||||||
|
the descriptor structures array. The descriptor structure contains
|
||||||
|
status information and pointers to the interrupt flow method and the
|
||||||
|
interrupt chip structure which are assigned to this interrupt.
|
||||||
|
|
||||||
|
Whenever an interrupt triggers, the low-level architecture code calls
|
||||||
|
into the generic interrupt code by calling :c:func:`desc->handle_irq`. This
|
||||||
|
high-level IRQ handling function only uses desc->irq_data.chip
|
||||||
|
primitives referenced by the assigned chip descriptor structure.
|
||||||
|
|
||||||
|
High-level Driver API
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
The high-level Driver API consists of the following functions:
|
||||||
|
|
||||||
|
- :c:func:`request_irq`
|
||||||
|
|
||||||
|
- :c:func:`free_irq`
|
||||||
|
|
||||||
|
- :c:func:`disable_irq`
|
||||||
|
|
||||||
|
- :c:func:`enable_irq`
|
||||||
|
|
||||||
|
- :c:func:`disable_irq_nosync` (SMP only)
|
||||||
|
|
||||||
|
- :c:func:`synchronize_irq` (SMP only)
|
||||||
|
|
||||||
|
- :c:func:`irq_set_irq_type`
|
||||||
|
|
||||||
|
- :c:func:`irq_set_irq_wake`
|
||||||
|
|
||||||
|
- :c:func:`irq_set_handler_data`
|
||||||
|
|
||||||
|
- :c:func:`irq_set_chip`
|
||||||
|
|
||||||
|
- :c:func:`irq_set_chip_data`
|
||||||
|
|
||||||
|
See the autogenerated function documentation for details.
|
||||||
|
|
||||||
|
High-level IRQ flow handlers
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
The generic layer provides a set of pre-defined irq-flow methods:
|
||||||
|
|
||||||
|
- :c:func:`handle_level_irq`
|
||||||
|
|
||||||
|
- :c:func:`handle_edge_irq`
|
||||||
|
|
||||||
|
- :c:func:`handle_fasteoi_irq`
|
||||||
|
|
||||||
|
- :c:func:`handle_simple_irq`
|
||||||
|
|
||||||
|
- :c:func:`handle_percpu_irq`
|
||||||
|
|
||||||
|
- :c:func:`handle_edge_eoi_irq`
|
||||||
|
|
||||||
|
- :c:func:`handle_bad_irq`
|
||||||
|
|
||||||
|
The interrupt flow handlers (either pre-defined or architecture
|
||||||
|
specific) are assigned to specific interrupts by the architecture either
|
||||||
|
during bootup or during device initialization.
|
||||||
|
|
||||||
|
Default flow implementations
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Helper functions
|
||||||
|
^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The helper functions call the chip primitives and are used by the
|
||||||
|
default flow implementations. The following helper functions are
|
||||||
|
implemented (simplified excerpt)::
|
||||||
|
|
||||||
|
default_enable(struct irq_data *data)
|
||||||
|
{
|
||||||
|
desc->irq_data.chip->irq_unmask(data);
|
||||||
|
}
|
||||||
|
|
||||||
|
default_disable(struct irq_data *data)
|
||||||
|
{
|
||||||
|
if (!delay_disable(data))
|
||||||
|
desc->irq_data.chip->irq_mask(data);
|
||||||
|
}
|
||||||
|
|
||||||
|
default_ack(struct irq_data *data)
|
||||||
|
{
|
||||||
|
chip->irq_ack(data);
|
||||||
|
}
|
||||||
|
|
||||||
|
default_mask_ack(struct irq_data *data)
|
||||||
|
{
|
||||||
|
if (chip->irq_mask_ack) {
|
||||||
|
chip->irq_mask_ack(data);
|
||||||
|
} else {
|
||||||
|
chip->irq_mask(data);
|
||||||
|
chip->irq_ack(data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
noop(struct irq_data *data)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Default flow handler implementations
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Default Level IRQ flow handler
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
handle_level_irq provides a generic implementation for level-triggered
|
||||||
|
interrupts.
|
||||||
|
|
||||||
|
The following control flow is implemented (simplified excerpt)::
|
||||||
|
|
||||||
|
desc->irq_data.chip->irq_mask_ack();
|
||||||
|
handle_irq_event(desc->action);
|
||||||
|
desc->irq_data.chip->irq_unmask();
|
||||||
|
|
||||||
|
|
||||||
|
Default Fast EOI IRQ flow handler
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
handle_fasteoi_irq provides a generic implementation for interrupts,
|
||||||
|
which only need an EOI at the end of the handler.
|
||||||
|
|
||||||
|
The following control flow is implemented (simplified excerpt)::
|
||||||
|
|
||||||
|
handle_irq_event(desc->action);
|
||||||
|
desc->irq_data.chip->irq_eoi();
|
||||||
|
|
||||||
|
|
||||||
|
Default Edge IRQ flow handler
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
handle_edge_irq provides a generic implementation for edge-triggered
|
||||||
|
interrupts.
|
||||||
|
|
||||||
|
The following control flow is implemented (simplified excerpt)::
|
||||||
|
|
||||||
|
if (desc->status & running) {
|
||||||
|
desc->irq_data.chip->irq_mask_ack();
|
||||||
|
desc->status |= pending | masked;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
desc->irq_data.chip->irq_ack();
|
||||||
|
desc->status |= running;
|
||||||
|
do {
|
||||||
|
if (desc->status & masked)
|
||||||
|
desc->irq_data.chip->irq_unmask();
|
||||||
|
desc->status &= ~pending;
|
||||||
|
handle_irq_event(desc->action);
|
||||||
|
} while (desc->status & pending);
|
||||||
|
desc->status &= ~running;
|
||||||
|
|
||||||
|
|
||||||
|
Default simple IRQ flow handler
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
handle_simple_irq provides a generic implementation for simple
|
||||||
|
interrupts.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
The simple flow handler does not call any handler/chip primitives.
|
||||||
|
|
||||||
|
The following control flow is implemented (simplified excerpt)::
|
||||||
|
|
||||||
|
handle_irq_event(desc->action);
|
||||||
|
|
||||||
|
|
||||||
|
Default per CPU flow handler
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
handle_percpu_irq provides a generic implementation for per CPU
|
||||||
|
interrupts.
|
||||||
|
|
||||||
|
Per CPU interrupts are only available on SMP and the handler provides a
|
||||||
|
simplified version without locking.
|
||||||
|
|
||||||
|
The following control flow is implemented (simplified excerpt)::
|
||||||
|
|
||||||
|
if (desc->irq_data.chip->irq_ack)
|
||||||
|
desc->irq_data.chip->irq_ack();
|
||||||
|
handle_irq_event(desc->action);
|
||||||
|
if (desc->irq_data.chip->irq_eoi)
|
||||||
|
desc->irq_data.chip->irq_eoi();
|
||||||
|
|
||||||
|
|
||||||
|
EOI Edge IRQ flow handler
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
handle_edge_eoi_irq provides an abomination of the edge handler
|
||||||
|
which is solely used to tame a badly wrecked irq controller on
|
||||||
|
powerpc/cell.
|
||||||
|
|
||||||
|
Bad IRQ flow handler
|
||||||
|
^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
handle_bad_irq is used for spurious interrupts which have no real
|
||||||
|
handler assigned.
|
||||||
|
|
||||||
|
Quirks and optimizations
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
The generic functions are intended for 'clean' architectures and chips,
|
||||||
|
which have no platform-specific IRQ handling quirks. If an architecture
|
||||||
|
needs to implement quirks on the 'flow' level then it can do so by
|
||||||
|
overriding the high-level irq-flow handler.
|
||||||
|
|
||||||
|
Delayed interrupt disable
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
This per interrupt selectable feature, which was introduced by Russell
|
||||||
|
King in the ARM interrupt implementation, does not mask an interrupt at
|
||||||
|
the hardware level when :c:func:`disable_irq` is called. The interrupt is kept
|
||||||
|
enabled and is masked in the flow handler when an interrupt event
|
||||||
|
happens. This prevents losing edge interrupts on hardware which does not
|
||||||
|
store an edge interrupt event while the interrupt is disabled at the
|
||||||
|
hardware level. When an interrupt arrives while the IRQ_DISABLED flag
|
||||||
|
is set, then the interrupt is masked at the hardware level and the
|
||||||
|
IRQ_PENDING bit is set. When the interrupt is re-enabled by
|
||||||
|
:c:func:`enable_irq` the pending bit is checked and if it is set, the interrupt
|
||||||
|
is resent either via hardware or by a software resend mechanism. (It's
|
||||||
|
necessary to enable CONFIG_HARDIRQS_SW_RESEND when you want to use
|
||||||
|
the delayed interrupt disable feature and your hardware is not capable
|
||||||
|
of retriggering an interrupt.) The delayed interrupt disable is not
|
||||||
|
configurable.
|
||||||
|
|
||||||
|
Chip-level hardware encapsulation
|
||||||
|
---------------------------------
|
||||||
|
|
||||||
|
The chip-level hardware descriptor structure :c:type:`irq_chip` contains all
|
||||||
|
the direct chip relevant functions, which can be utilized by the irq flow
|
||||||
|
implementations.
|
||||||
|
|
||||||
|
- ``irq_ack``
|
||||||
|
|
||||||
|
- ``irq_mask_ack`` - Optional, recommended for performance
|
||||||
|
|
||||||
|
- ``irq_mask``
|
||||||
|
|
||||||
|
- ``irq_unmask``
|
||||||
|
|
||||||
|
- ``irq_eoi`` - Optional, required for EOI flow handlers
|
||||||
|
|
||||||
|
- ``irq_retrigger`` - Optional
|
||||||
|
|
||||||
|
- ``irq_set_type`` - Optional
|
||||||
|
|
||||||
|
- ``irq_set_wake`` - Optional
|
||||||
|
|
||||||
|
These primitives are strictly intended to mean what they say: ack means
|
||||||
|
ACK, masking means masking of an IRQ line, etc. It is up to the flow
|
||||||
|
handler(s) to use these basic units of low-level functionality.
|
||||||
|
|
||||||
|
__do_IRQ entry point
|
||||||
|
====================
|
||||||
|
|
||||||
|
The original implementation :c:func:`__do_IRQ` was an alternative entry point
|
||||||
|
for all types of interrupts. It no longer exists.
|
||||||
|
|
||||||
|
This handler turned out to be not suitable for all interrupt hardware
|
||||||
|
and was therefore reimplemented with split functionality for
|
||||||
|
edge/level/simple/percpu interrupts. This is not only a functional
|
||||||
|
optimization. It also shortens code paths for interrupts.
|
||||||
|
|
||||||
|
Locking on SMP
|
||||||
|
==============
|
||||||
|
|
||||||
|
The locking of chip registers is up to the architecture that defines the
|
||||||
|
chip primitives. The per-irq structure is protected via desc->lock, by
|
||||||
|
the generic layer.
|
||||||
|
|
||||||
|
Generic interrupt chip
|
||||||
|
======================
|
||||||
|
|
||||||
|
To avoid copies of identical implementations of IRQ chips the core
|
||||||
|
provides a configurable generic interrupt chip implementation.
|
||||||
|
Developers should check carefully whether the generic chip fits their
|
||||||
|
needs before implementing the same functionality slightly differently
|
||||||
|
themselves.
|
||||||
|
|
||||||
|
.. kernel-doc:: kernel/irq/generic-chip.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
Structures
|
||||||
|
==========
|
||||||
|
|
||||||
|
This chapter contains the autogenerated documentation of the structures
|
||||||
|
which are used in the generic IRQ layer.
|
||||||
|
|
||||||
|
.. kernel-doc:: include/linux/irq.h
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
.. kernel-doc:: include/linux/interrupt.h
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
Public Functions Provided
|
||||||
|
=========================
|
||||||
|
|
||||||
|
This chapter contains the autogenerated documentation of the kernel API
|
||||||
|
functions which are exported.
|
||||||
|
|
||||||
|
.. kernel-doc:: kernel/irq/manage.c
|
||||||
|
|
||||||
|
.. kernel-doc:: kernel/irq/chip.c
|
||||||
|
|
||||||
|
Internal Functions Provided
|
||||||
|
===========================
|
||||||
|
|
||||||
|
This chapter contains the autogenerated documentation of the internal
|
||||||
|
functions.
|
||||||
|
|
||||||
|
.. kernel-doc:: kernel/irq/irqdesc.c
|
||||||
|
|
||||||
|
.. kernel-doc:: kernel/irq/handle.c
|
||||||
|
|
||||||
|
.. kernel-doc:: kernel/irq/chip.c
|
||||||
|
|
||||||
|
Credits
|
||||||
|
=======
|
||||||
|
|
||||||
|
The following people have contributed to this document:
|
||||||
|
|
||||||
|
1. Thomas Gleixner tglx@linutronix.de
|
||||||
|
|
||||||
|
2. Ingo Molnar mingo@elte.hu
|
@@ -11,11 +11,14 @@ Core utilities
|
|||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
|
|
||||||
|
kernel-api
|
||||||
assoc_array
|
assoc_array
|
||||||
atomic_ops
|
atomic_ops
|
||||||
cpu_hotplug
|
cpu_hotplug
|
||||||
local_ops
|
local_ops
|
||||||
workqueue
|
workqueue
|
||||||
|
genericirq
|
||||||
|
flexible-arrays
|
||||||
|
|
||||||
Interfaces for kernel debugging
|
Interfaces for kernel debugging
|
||||||
===============================
|
===============================
|
||||||
|
346
Documentation/core-api/kernel-api.rst
Normal file
346
Documentation/core-api/kernel-api.rst
Normal file
@@ -0,0 +1,346 @@
|
|||||||
|
====================
|
||||||
|
The Linux Kernel API
|
||||||
|
====================
|
||||||
|
|
||||||
|
Data Types
|
||||||
|
==========
|
||||||
|
|
||||||
|
Doubly Linked Lists
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
.. kernel-doc:: include/linux/list.h
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
Basic C Library Functions
|
||||||
|
=========================
|
||||||
|
|
||||||
|
When writing drivers, you cannot in general use routines which are from
|
||||||
|
the C Library. Some of the functions have been found generally useful
|
||||||
|
and they are listed below. The behaviour of these functions may vary
|
||||||
|
slightly from those defined by ANSI, and these deviations are noted in
|
||||||
|
the text.
|
||||||
|
|
||||||
|
String Conversions
|
||||||
|
------------------
|
||||||
|
|
||||||
|
.. kernel-doc:: lib/vsprintf.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: include/linux/kernel.h
|
||||||
|
:functions: kstrtol
|
||||||
|
|
||||||
|
.. kernel-doc:: include/linux/kernel.h
|
||||||
|
:functions: kstrtoul
|
||||||
|
|
||||||
|
.. kernel-doc:: lib/kstrtox.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
String Manipulation
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
.. kernel-doc:: lib/string.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
Bit Operations
|
||||||
|
--------------
|
||||||
|
|
||||||
|
.. kernel-doc:: arch/x86/include/asm/bitops.h
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
Basic Kernel Library Functions
|
||||||
|
==============================
|
||||||
|
|
||||||
|
The Linux kernel provides more basic utility functions.
|
||||||
|
|
||||||
|
Bitmap Operations
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
.. kernel-doc:: lib/bitmap.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: lib/bitmap.c
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
Command-line Parsing
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
.. kernel-doc:: lib/cmdline.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
CRC Functions
|
||||||
|
-------------
|
||||||
|
|
||||||
|
.. kernel-doc:: lib/crc7.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: lib/crc16.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: lib/crc-itu-t.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: lib/crc32.c
|
||||||
|
|
||||||
|
.. kernel-doc:: lib/crc-ccitt.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
idr/ida Functions
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
.. kernel-doc:: include/linux/idr.h
|
||||||
|
:doc: idr sync
|
||||||
|
|
||||||
|
.. kernel-doc:: lib/idr.c
|
||||||
|
:doc: IDA description
|
||||||
|
|
||||||
|
.. kernel-doc:: lib/idr.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
Memory Management in Linux
|
||||||
|
==========================
|
||||||
|
|
||||||
|
The Slab Cache
|
||||||
|
--------------
|
||||||
|
|
||||||
|
.. kernel-doc:: include/linux/slab.h
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
.. kernel-doc:: mm/slab.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: mm/util.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
User Space Memory Access
|
||||||
|
------------------------
|
||||||
|
|
||||||
|
.. kernel-doc:: arch/x86/include/asm/uaccess_32.h
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
.. kernel-doc:: arch/x86/lib/usercopy_32.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
More Memory Management Functions
|
||||||
|
--------------------------------
|
||||||
|
|
||||||
|
.. kernel-doc:: mm/readahead.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: mm/filemap.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: mm/memory.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: mm/vmalloc.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: mm/page_alloc.c
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
.. kernel-doc:: mm/mempool.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: mm/dmapool.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: mm/page-writeback.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: mm/truncate.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
Kernel IPC facilities
|
||||||
|
=====================
|
||||||
|
|
||||||
|
IPC utilities
|
||||||
|
-------------
|
||||||
|
|
||||||
|
.. kernel-doc:: ipc/util.c
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
FIFO Buffer
|
||||||
|
===========
|
||||||
|
|
||||||
|
kfifo interface
|
||||||
|
---------------
|
||||||
|
|
||||||
|
.. kernel-doc:: include/linux/kfifo.h
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
relay interface support
|
||||||
|
=======================
|
||||||
|
|
||||||
|
Relay interface support is designed to provide an efficient mechanism
|
||||||
|
for tools and facilities to relay large amounts of data from kernel
|
||||||
|
space to user space.
|
||||||
|
|
||||||
|
relay interface
|
||||||
|
---------------
|
||||||
|
|
||||||
|
.. kernel-doc:: kernel/relay.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: kernel/relay.c
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
Module Support
|
||||||
|
==============
|
||||||
|
|
||||||
|
Module Loading
|
||||||
|
--------------
|
||||||
|
|
||||||
|
.. kernel-doc:: kernel/kmod.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
Inter Module support
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
Refer to the file kernel/module.c for more information.
|
||||||
|
|
||||||
|
Hardware Interfaces
|
||||||
|
===================
|
||||||
|
|
||||||
|
Interrupt Handling
|
||||||
|
------------------
|
||||||
|
|
||||||
|
.. kernel-doc:: kernel/irq/manage.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
DMA Channels
|
||||||
|
------------
|
||||||
|
|
||||||
|
.. kernel-doc:: kernel/dma.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
Resources Management
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
.. kernel-doc:: kernel/resource.c
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
.. kernel-doc:: kernel/resource.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
MTRR Handling
|
||||||
|
-------------
|
||||||
|
|
||||||
|
.. kernel-doc:: arch/x86/kernel/cpu/mtrr/main.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
Security Framework
|
||||||
|
==================
|
||||||
|
|
||||||
|
.. kernel-doc:: security/security.c
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
.. kernel-doc:: security/inode.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
Audit Interfaces
|
||||||
|
================
|
||||||
|
|
||||||
|
.. kernel-doc:: kernel/audit.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: kernel/auditsc.c
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
.. kernel-doc:: kernel/auditfilter.c
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
Accounting Framework
|
||||||
|
====================
|
||||||
|
|
||||||
|
.. kernel-doc:: kernel/acct.c
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
Block Devices
|
||||||
|
=============
|
||||||
|
|
||||||
|
.. kernel-doc:: block/blk-core.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: block/blk-core.c
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
.. kernel-doc:: block/blk-map.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: block/blk-sysfs.c
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
.. kernel-doc:: block/blk-settings.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: block/blk-exec.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: block/blk-flush.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: block/blk-lib.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: block/blk-tag.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: block/blk-tag.c
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
.. kernel-doc:: block/blk-integrity.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: kernel/trace/blktrace.c
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
.. kernel-doc:: block/genhd.c
|
||||||
|
:internal:
|
||||||
|
|
||||||
|
.. kernel-doc:: block/genhd.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
Char devices
|
||||||
|
============
|
||||||
|
|
||||||
|
.. kernel-doc:: fs/char_dev.c
|
||||||
|
:export:
|
||||||
|
|
||||||
|
Clock Framework
|
||||||
|
===============
|
||||||
|
|
||||||
|
The clock framework defines programming interfaces to support software
|
||||||
|
management of the system clock tree. This framework is widely used with
|
||||||
|
System-On-Chip (SOC) platforms to support power management and various
|
||||||
|
devices which may need custom clock rates. Note that these "clocks"
|
||||||
|
don't relate to timekeeping or real time clocks (RTCs), each of which
|
||||||
|
has separate frameworks. These :c:type:`struct clk <clk>`
|
||||||
|
instances may be used to manage for example a 96 MHz signal that is used
|
||||||
|
to shift bits into and out of peripherals or busses, or otherwise
|
||||||
|
trigger synchronous state machine transitions in system hardware.
|
||||||
|
|
||||||
|
Power management is supported by explicit software clock gating: unused
|
||||||
|
clocks are disabled, so the system doesn't waste power changing the
|
||||||
|
state of transistors that aren't in active use. On some systems this may
|
||||||
|
be backed by hardware clock gating, where clocks are gated without being
|
||||||
|
disabled in software. Sections of chips that are powered but not clocked
|
||||||
|
may be able to retain their last state. This low power state is often
|
||||||
|
called a *retention mode*. This mode still incurs leakage currents,
|
||||||
|
especially with finer circuit geometries, but for CMOS circuits power is
|
||||||
|
mostly used by clocked state changes.
|
||||||
|
|
||||||
|
Power-aware drivers only enable their clocks when the device they manage
|
||||||
|
is in active use. Also, system sleep states often differ according to
|
||||||
|
which clock domains are active: while a "standby" state may allow wakeup
|
||||||
|
from several active domains, a "mem" (suspend-to-RAM) state may require
|
||||||
|
a more wholesale shutdown of clocks derived from higher speed PLLs and
|
||||||
|
oscillators, limiting the number of possible wakeup event sources. A
|
||||||
|
driver's suspend method may need to be aware of system-specific clock
|
||||||
|
constraints on the target sleep state.
|
||||||
|
|
||||||
|
Some platforms support programmable clock generators. These can be used
|
||||||
|
by external chips of various kinds, such as other CPUs, multimedia
|
||||||
|
codecs, and devices with strict requirements for interface clocking.
|
||||||
|
|
||||||
|
.. kernel-doc:: include/linux/clk.h
|
||||||
|
:internal:
|
@@ -1,93 +0,0 @@
|
|||||||
Processor boosting control
|
|
||||||
|
|
||||||
- information for users -
|
|
||||||
|
|
||||||
Quick guide for the impatient:
|
|
||||||
--------------------
|
|
||||||
/sys/devices/system/cpu/cpufreq/boost
|
|
||||||
controls the boost setting for the whole system. You can read and write
|
|
||||||
that file with either "0" (boosting disabled) or "1" (boosting allowed).
|
|
||||||
Reading or writing 1 does not mean that the system is boosting at this
|
|
||||||
very moment, but only that the CPU _may_ raise the frequency at its
|
|
||||||
discretion.
|
|
||||||
--------------------
|
|
||||||
|
|
||||||
Introduction
|
|
||||||
-------------
|
|
||||||
Some CPUs support a functionality to raise the operating frequency of
|
|
||||||
some cores in a multi-core package if certain conditions apply, mostly
|
|
||||||
if the whole chip is not fully utilized and below its intended thermal
|
|
||||||
budget. The decision about boost disable/enable is made either at hardware
|
|
||||||
(e.g. x86) or software (e.g. ARM).
|
|
||||||
On Intel CPUs this is called "Turbo Boost", AMD calls it "Turbo-Core",
|
|
||||||
in technical documentation "Core performance boost". In Linux we use
|
|
||||||
the term "boost" for convenience.
|
|
||||||
|
|
||||||
Rationale for disable switch
|
|
||||||
----------------------------
|
|
||||||
|
|
||||||
Though the idea is to just give better performance without any user
|
|
||||||
intervention, sometimes the need arises to disable this functionality.
|
|
||||||
Most systems offer a switch in the (BIOS) firmware to disable the
|
|
||||||
functionality at all, but a more fine-grained and dynamic control would
|
|
||||||
be desirable:
|
|
||||||
1. While running benchmarks, reproducible results are important. Since
|
|
||||||
the boosting functionality depends on the load of the whole package,
|
|
||||||
single thread performance can vary. By explicitly disabling the boost
|
|
||||||
functionality at least for the benchmark's run-time the system will run
|
|
||||||
at a fixed frequency and results are reproducible again.
|
|
||||||
2. To examine the impact of the boosting functionality it is helpful
|
|
||||||
to do tests with and without boosting.
|
|
||||||
3. Boosting means overclocking the processor, though under controlled
|
|
||||||
conditions. By raising the frequency and the voltage the processor
|
|
||||||
will consume more power than without the boosting, which may be
|
|
||||||
undesirable for instance for mobile users. Disabling boosting may
|
|
||||||
save power here, though this depends on the workload.
|
|
||||||
|
|
||||||
|
|
||||||
User controlled switch
|
|
||||||
----------------------
|
|
||||||
|
|
||||||
To allow the user to toggle the boosting functionality, the cpufreq core
|
|
||||||
driver exports a sysfs knob to enable or disable it. There is a file:
|
|
||||||
/sys/devices/system/cpu/cpufreq/boost
|
|
||||||
which can either read "0" (boosting disabled) or "1" (boosting enabled).
|
|
||||||
The file is exported only when cpufreq driver supports boosting.
|
|
||||||
Explicitly changing the permissions and writing to that file anyway will
|
|
||||||
return EINVAL.
|
|
||||||
|
|
||||||
On supported CPUs one can write either a "0" or a "1" into this file.
|
|
||||||
This will either disable the boost functionality on all cores in the
|
|
||||||
whole system (0) or will allow the software or hardware to boost at will
|
|
||||||
(1).
|
|
||||||
|
|
||||||
Writing a "1" does not explicitly boost the system, but just allows the
|
|
||||||
CPU to boost at their discretion. Some implementations take external
|
|
||||||
factors like the chip's temperature into account, so boosting once does
|
|
||||||
not necessarily mean that it will occur every time even using the exact
|
|
||||||
same software setup.
|
|
||||||
|
|
||||||
|
|
||||||
AMD legacy cpb switch
|
|
||||||
---------------------
|
|
||||||
The AMD powernow-k8 driver used to support a very similar switch to
|
|
||||||
disable or enable the "Core Performance Boost" feature of some AMD CPUs.
|
|
||||||
This switch was instantiated in each CPU's cpufreq directory
|
|
||||||
(/sys/devices/system/cpu[0-9]*/cpufreq) and was called "cpb".
|
|
||||||
Though the per CPU existence hints at a more fine grained control, the
|
|
||||||
actual implementation only supported a system-global switch semantics,
|
|
||||||
which was simply reflected into each CPU's file. Writing a 0 or 1 into it
|
|
||||||
would pull the other CPUs to the same state.
|
|
||||||
For compatibility reasons this file and its behavior is still supported
|
|
||||||
on AMD CPUs, though it is now protected by a config switch
|
|
||||||
(X86_ACPI_CPUFREQ_CPB). On Intel CPUs this file will never be created,
|
|
||||||
even with the config option set.
|
|
||||||
This functionality is considered legacy and will be removed in some future
|
|
||||||
kernel version.
|
|
||||||
|
|
||||||
More fine grained boosting control
|
|
||||||
----------------------------------
|
|
||||||
|
|
||||||
Technically it is possible to switch the boosting functionality at least
|
|
||||||
on a per package basis, for some CPUs even per core. Currently the driver
|
|
||||||
does not support it, but this may be implemented in the future.
|
|
@@ -231,7 +231,7 @@ the reference implementation in drivers/cpufreq/longrun.c
|
|||||||
Only for drivers with target_index() and CPUFREQ_ASYNC_NOTIFICATION unset.
|
Only for drivers with target_index() and CPUFREQ_ASYNC_NOTIFICATION unset.
|
||||||
|
|
||||||
get_intermediate should return a stable intermediate frequency platform wants to
|
get_intermediate should return a stable intermediate frequency platform wants to
|
||||||
switch to, and target_intermediate() should set CPU to to that frequency, before
|
switch to, and target_intermediate() should set CPU to that frequency, before
|
||||||
jumping to the frequency corresponding to 'index'. Core will take care of
|
jumping to the frequency corresponding to 'index'. Core will take care of
|
||||||
sending notifications and driver doesn't have to handle them in
|
sending notifications and driver doesn't have to handle them in
|
||||||
target_intermediate() or target_index().
|
target_intermediate() or target_index().
|
||||||
|
@@ -1,301 +0,0 @@
|
|||||||
CPU frequency and voltage scaling code in the Linux(TM) kernel
|
|
||||||
|
|
||||||
|
|
||||||
L i n u x C P U F r e q
|
|
||||||
|
|
||||||
C P U F r e q G o v e r n o r s
|
|
||||||
|
|
||||||
- information for users and developers -
|
|
||||||
|
|
||||||
|
|
||||||
Dominik Brodowski <linux@brodo.de>
|
|
||||||
some additions and corrections by Nico Golde <nico@ngolde.de>
|
|
||||||
Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
||||||
Viresh Kumar <viresh.kumar@linaro.org>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Clock scaling allows you to change the clock speed of the CPUs on the
|
|
||||||
fly. This is a nice method to save battery power, because the lower
|
|
||||||
the clock speed, the less power the CPU consumes.
|
|
||||||
|
|
||||||
|
|
||||||
Contents:
|
|
||||||
---------
|
|
||||||
1. What is a CPUFreq Governor?
|
|
||||||
|
|
||||||
2. Governors In the Linux Kernel
|
|
||||||
2.1 Performance
|
|
||||||
2.2 Powersave
|
|
||||||
2.3 Userspace
|
|
||||||
2.4 Ondemand
|
|
||||||
2.5 Conservative
|
|
||||||
2.6 Schedutil
|
|
||||||
|
|
||||||
3. The Governor Interface in the CPUfreq Core
|
|
||||||
|
|
||||||
4. References
|
|
||||||
|
|
||||||
|
|
||||||
1. What Is A CPUFreq Governor?
|
|
||||||
==============================
|
|
||||||
|
|
||||||
Most cpufreq drivers (except the intel_pstate and longrun) or even most
|
|
||||||
cpu frequency scaling algorithms only allow the CPU frequency to be set
|
|
||||||
to predefined fixed values. In order to offer dynamic frequency
|
|
||||||
scaling, the cpufreq core must be able to tell these drivers of a
|
|
||||||
"target frequency". So these specific drivers will be transformed to
|
|
||||||
offer a "->target/target_index/fast_switch()" call instead of the
|
|
||||||
"->setpolicy()" call. For set_policy drivers, all stays the same,
|
|
||||||
though.
|
|
||||||
|
|
||||||
How to decide what frequency within the CPUfreq policy should be used?
|
|
||||||
That's done using "cpufreq governors".
|
|
||||||
|
|
||||||
Basically, it's the following flow graph:
|
|
||||||
|
|
||||||
CPU can be set to switch independently | CPU can only be set
|
|
||||||
within specific "limits" | to specific frequencies
|
|
||||||
|
|
||||||
"CPUfreq policy"
|
|
||||||
consists of frequency limits (policy->{min,max})
|
|
||||||
and CPUfreq governor to be used
|
|
||||||
/ \
|
|
||||||
/ \
|
|
||||||
/ the cpufreq governor decides
|
|
||||||
/ (dynamically or statically)
|
|
||||||
/ what target_freq to set within
|
|
||||||
/ the limits of policy->{min,max}
|
|
||||||
/ \
|
|
||||||
/ \
|
|
||||||
Using the ->setpolicy call, Using the ->target/target_index/fast_switch call,
|
|
||||||
the limits and the the frequency closest
|
|
||||||
"policy" is set. to target_freq is set.
|
|
||||||
It is assured that it
|
|
||||||
is within policy->{min,max}
|
|
||||||
|
|
||||||
|
|
||||||
2. Governors In the Linux Kernel
|
|
||||||
================================
|
|
||||||
|
|
||||||
2.1 Performance
|
|
||||||
---------------
|
|
||||||
|
|
||||||
The CPUfreq governor "performance" sets the CPU statically to the
|
|
||||||
highest frequency within the borders of scaling_min_freq and
|
|
||||||
scaling_max_freq.
|
|
||||||
|
|
||||||
|
|
||||||
2.2 Powersave
|
|
||||||
-------------
|
|
||||||
|
|
||||||
The CPUfreq governor "powersave" sets the CPU statically to the
|
|
||||||
lowest frequency within the borders of scaling_min_freq and
|
|
||||||
scaling_max_freq.
|
|
||||||
|
|
||||||
|
|
||||||
2.3 Userspace
|
|
||||||
-------------
|
|
||||||
|
|
||||||
The CPUfreq governor "userspace" allows the user, or any userspace
|
|
||||||
program running with UID "root", to set the CPU to a specific frequency
|
|
||||||
by making a sysfs file "scaling_setspeed" available in the CPU-device
|
|
||||||
directory.
|
|
||||||
|
|
||||||
|
|
||||||
2.4 Ondemand
|
|
||||||
------------
|
|
||||||
|
|
||||||
The CPUfreq governor "ondemand" sets the CPU frequency depending on the
|
|
||||||
current system load. Load estimation is triggered by the scheduler
|
|
||||||
through the update_util_data->func hook; when triggered, cpufreq checks
|
|
||||||
the CPU-usage statistics over the last period and the governor sets the
|
|
||||||
CPU accordingly. The CPU must have the capability to switch the
|
|
||||||
frequency very quickly.
|
|
||||||
|
|
||||||
Sysfs files:
|
|
||||||
|
|
||||||
* sampling_rate:
|
|
||||||
|
|
||||||
Measured in uS (10^-6 seconds), this is how often you want the kernel
|
|
||||||
to look at the CPU usage and to make decisions on what to do about the
|
|
||||||
frequency. Typically this is set to values of around '10000' or more.
|
|
||||||
Its default value is (cmp. with user-guide.txt): transition_latency
|
|
||||||
* 1000. Be aware that transition latency is in ns and sampling_rate
|
|
||||||
is in us, so you get the same sysfs value by default. Sampling rate
|
|
||||||
should always get adjusted considering the transition latency to set
|
|
||||||
the sampling rate 750 times as high as the transition latency in the
|
|
||||||
bash (as said, 1000 is default), do:
|
|
||||||
|
|
||||||
$ echo $(($(cat cpuinfo_transition_latency) * 750 / 1000)) > ondemand/sampling_rate
|
|
||||||
|
|
||||||
* sampling_rate_min:
|
|
||||||
|
|
||||||
The sampling rate is limited by the HW transition latency:
|
|
||||||
transition_latency * 100
|
|
||||||
|
|
||||||
Or by kernel restrictions:
|
|
||||||
- If CONFIG_NO_HZ_COMMON is set, the limit is 10ms fixed.
|
|
||||||
- If CONFIG_NO_HZ_COMMON is not set or nohz=off boot parameter is
|
|
||||||
used, the limits depend on the CONFIG_HZ option:
|
|
||||||
HZ=1000: min=20000us (20ms)
|
|
||||||
HZ=250: min=80000us (80ms)
|
|
||||||
HZ=100: min=200000us (200ms)
|
|
||||||
|
|
||||||
The highest value of kernel and HW latency restrictions is shown and
|
|
||||||
used as the minimum sampling rate.
|
|
||||||
|
|
||||||
* up_threshold:
|
|
||||||
|
|
||||||
This defines what the average CPU usage between the samplings of
|
|
||||||
'sampling_rate' needs to be for the kernel to make a decision on
|
|
||||||
whether it should increase the frequency. For example when it is set
|
|
||||||
to its default value of '95' it means that between the checking
|
|
||||||
intervals the CPU needs to be on average more than 95% in use to then
|
|
||||||
decide that the CPU frequency needs to be increased.
|
|
||||||
|
|
||||||
* ignore_nice_load:
|
|
||||||
|
|
||||||
This parameter takes a value of '0' or '1'. When set to '0' (its
|
|
||||||
default), all processes are counted towards the 'cpu utilisation'
|
|
||||||
value. When set to '1', the processes that are run with a 'nice'
|
|
||||||
value will not count (and thus be ignored) in the overall usage
|
|
||||||
calculation. This is useful if you are running a CPU intensive
|
|
||||||
calculation on your laptop that you do not care how long it takes to
|
|
||||||
complete as you can 'nice' it and prevent it from taking part in the
|
|
||||||
deciding process of whether to increase your CPU frequency.
|
|
||||||
|
|
||||||
* sampling_down_factor:
|
|
||||||
|
|
||||||
This parameter controls the rate at which the kernel makes a decision
|
|
||||||
on when to decrease the frequency while running at top speed. When set
|
|
||||||
to 1 (the default) decisions to reevaluate load are made at the same
|
|
||||||
interval regardless of current clock speed. But when set to greater
|
|
||||||
than 1 (e.g. 100) it acts as a multiplier for the scheduling interval
|
|
||||||
for reevaluating load when the CPU is at its top speed due to high
|
|
||||||
load. This improves performance by reducing the overhead of load
|
|
||||||
evaluation and helping the CPU stay at its top speed when truly busy,
|
|
||||||
rather than shifting back and forth in speed. This tunable has no
|
|
||||||
effect on behavior at lower speeds/lower CPU loads.
|
|
||||||
|
|
||||||
* powersave_bias:
|
|
||||||
|
|
||||||
This parameter takes a value between 0 and 1000. It defines the
|
|
||||||
percentage (times 10) value of the target frequency that will be
|
|
||||||
shaved off of the target. For example, when set to 100 -- 10%, when
|
|
||||||
ondemand governor would have targeted 1000 MHz, it will target
|
|
||||||
1000 MHz - (10% of 1000 MHz) = 900 MHz instead. This is set to 0
|
|
||||||
(disabled) by default.
|
|
||||||
|
|
||||||
When AMD frequency sensitivity powersave bias driver --
|
|
||||||
drivers/cpufreq/amd_freq_sensitivity.c is loaded, this parameter
|
|
||||||
defines the workload frequency sensitivity threshold in which a lower
|
|
||||||
frequency is chosen instead of ondemand governor's original target.
|
|
||||||
The frequency sensitivity is a hardware reported (on AMD Family 16h
|
|
||||||
Processors and above) value between 0 and 100% that tells software how
|
|
||||||
the performance of the workload running on a CPU will change when
|
|
||||||
frequency changes. A workload with sensitivity of 0% (memory/IO-bound)
|
|
||||||
will not perform any better on higher core frequency, whereas a
|
|
||||||
workload with sensitivity of 100% (CPU-bound) will perform better
|
|
||||||
the higher the frequency. When the driver is loaded, this is set to 400 by
|
|
||||||
default -- for CPUs running workloads with sensitivity value below
|
|
||||||
40%, a lower frequency is chosen. Unloading the driver or writing 0
|
|
||||||
will disable this feature.
|
|
||||||
|
|
||||||
|
|
||||||
2.5 Conservative
|
|
||||||
----------------
|
|
||||||
|
|
||||||
The CPUfreq governor "conservative", much like the "ondemand"
|
|
||||||
governor, sets the CPU frequency depending on the current usage. It
|
|
||||||
differs in behaviour in that it gracefully increases and decreases the
|
|
||||||
CPU speed rather than jumping to max speed the moment there is any load
|
|
||||||
on the CPU. This behaviour is more suitable in a battery powered
|
|
||||||
environment. The governor is tweaked in the same manner as the
|
|
||||||
"ondemand" governor through sysfs with the addition of:
|
|
||||||
|
|
||||||
* freq_step:
|
|
||||||
|
|
||||||
This describes what percentage steps the cpu freq should be increased
|
|
||||||
and decreased smoothly by. By default the cpu frequency will increase
|
|
||||||
in 5% chunks of your maximum cpu frequency. You can change this value
|
|
||||||
to anywhere between 0 and 100 where '0' will effectively lock your CPU
|
|
||||||
at a speed regardless of its load whilst '100' will, in theory, make
|
|
||||||
it behave identically to the "ondemand" governor.
|
|
||||||
|
|
||||||
* down_threshold:
|
|
||||||
|
|
||||||
Same as the 'up_threshold' found for the "ondemand" governor but for
|
|
||||||
the opposite direction. For example when set to its default value of
|
|
||||||
'20' it means that the CPU usage needs to be below 20% between
|
|
||||||
samples to have the frequency decreased.
|
|
||||||
|
|
||||||
* sampling_down_factor:
|
|
||||||
|
|
||||||
Similar functionality as in "ondemand" governor. But in
|
|
||||||
"conservative", it controls the rate at which the kernel makes a
|
|
||||||
decision on when to decrease the frequency while running in any speed.
|
|
||||||
Load for frequency increase is still evaluated every sampling rate.
|
|
||||||
|
|
||||||
|
|
||||||
2.6 Schedutil
|
|
||||||
-------------
|
|
||||||
|
|
||||||
The "schedutil" governor aims at better integration with the Linux
|
|
||||||
kernel scheduler. Load estimation is achieved through the scheduler's
|
|
||||||
Per-Entity Load Tracking (PELT) mechanism, which also provides
|
|
||||||
information about the recent load [1]. This governor currently does
|
|
||||||
load based DVFS only for tasks managed by CFS. RT and DL scheduler tasks
|
|
||||||
are always run at the highest frequency. Unlike all the other
|
|
||||||
governors, the code is located under the kernel/sched/ directory.
|
|
||||||
|
|
||||||
Sysfs files:
|
|
||||||
|
|
||||||
* rate_limit_us:
|
|
||||||
|
|
||||||
This contains a value in microseconds. The governor waits for
|
|
||||||
rate_limit_us time before reevaluating the load again, after it has
|
|
||||||
evaluated the load once.
|
|
||||||
|
|
||||||
For an in-depth comparison with the other governors refer to [2].
|
|
||||||
|
|
||||||
|
|
||||||
3. The Governor Interface in the CPUfreq Core
|
|
||||||
=============================================
|
|
||||||
|
|
||||||
A new governor must register itself with the CPUfreq core using
|
|
||||||
"cpufreq_register_governor". The struct cpufreq_governor, which has to
|
|
||||||
be passed to that function, must contain the following values:
|
|
||||||
|
|
||||||
governor->name - A unique name for this governor.
|
|
||||||
governor->owner - THIS_MODULE for the governor module (if appropriate).
|
|
||||||
|
|
||||||
plus a set of hooks to the functions implementing the governor's logic.
|
|
||||||
|
|
||||||
The CPUfreq governor may call the CPU processor driver using one of
|
|
||||||
these two functions:
|
|
||||||
|
|
||||||
int cpufreq_driver_target(struct cpufreq_policy *policy,
|
|
||||||
unsigned int target_freq,
|
|
||||||
unsigned int relation);
|
|
||||||
|
|
||||||
int __cpufreq_driver_target(struct cpufreq_policy *policy,
|
|
||||||
unsigned int target_freq,
|
|
||||||
unsigned int relation);
|
|
||||||
|
|
||||||
target_freq must be within policy->min and policy->max, of course.
|
|
||||||
What's the difference between these two functions? When your governor is
|
|
||||||
in a direct code path of a call to governor callbacks, like
|
|
||||||
governor->start(), the policy->rwsem is still held in the cpufreq core,
|
|
||||||
and there's no need to lock it again (in fact, this would cause a
|
|
||||||
deadlock). So use __cpufreq_driver_target only in these cases. In all
|
|
||||||
other cases (for example, when there's a "daemonized" function that
|
|
||||||
wakes up every second), use cpufreq_driver_target to take policy->rwsem
|
|
||||||
before the command is passed to the cpufreq driver.
|
|
||||||
|
|
||||||
4. References
|
|
||||||
=============
|
|
||||||
|
|
||||||
[1] Per-entity load tracking: https://lwn.net/Articles/531853/
|
|
||||||
[2] Improvements in CPU frequency management: https://lwn.net/Articles/682391/
|
|
||||||
|
|
@@ -21,8 +21,6 @@ Documents in this directory:
|
|||||||
|
|
||||||
amd-powernow.txt - AMD powernow driver specific file.
|
amd-powernow.txt - AMD powernow driver specific file.
|
||||||
|
|
||||||
boost.txt - Frequency boosting support.
|
|
||||||
|
|
||||||
core.txt - General description of the CPUFreq core and
|
core.txt - General description of the CPUFreq core and
|
||||||
of CPUFreq notifiers.
|
of CPUFreq notifiers.
|
||||||
|
|
||||||
@@ -32,17 +30,12 @@ cpufreq-nforce2.txt - nVidia nForce2 platform specific file.
|
|||||||
|
|
||||||
cpufreq-stats.txt - General description of sysfs cpufreq stats.
|
cpufreq-stats.txt - General description of sysfs cpufreq stats.
|
||||||
|
|
||||||
governors.txt - What are cpufreq governors and how to
|
|
||||||
implement them?
|
|
||||||
|
|
||||||
index.txt - File index, Mailing list and Links (this document)
|
index.txt - File index, Mailing list and Links (this document)
|
||||||
|
|
||||||
intel-pstate.txt - Intel pstate cpufreq driver specific file.
|
intel-pstate.txt - Intel pstate cpufreq driver specific file.
|
||||||
|
|
||||||
pcc-cpufreq.txt - PCC cpufreq driver specific file.
|
pcc-cpufreq.txt - PCC cpufreq driver specific file.
|
||||||
|
|
||||||
user-guide.txt - User Guide to CPUFreq
|
|
||||||
|
|
||||||
|
|
||||||
Mailing List
|
Mailing List
|
||||||
------------
|
------------
|
||||||
|
@@ -1,281 +0,0 @@
|
|||||||
Intel P-State driver
|
|
||||||
--------------------
|
|
||||||
|
|
||||||
This driver provides an interface to control the P-State selection for the
|
|
||||||
SandyBridge+ Intel processors.
|
|
||||||
|
|
||||||
The following document explains P-States:
|
|
||||||
http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf
|
|
||||||
As stated in the document, P-State doesn’t exactly mean a frequency. However, for
|
|
||||||
the sake of the relationship with cpufreq, P-State and frequency are used
|
|
||||||
interchangeably.
|
|
||||||
|
|
||||||
Understanding the cpufreq core governors and policies is important before
|
|
||||||
discussing more details about the Intel P-State driver. Based on what callbacks
|
|
||||||
a cpufreq driver provides to the cpufreq core, it can support two types of
|
|
||||||
drivers:
|
|
||||||
- with target_index() callback: In this mode, the drivers using cpufreq core
|
|
||||||
simply provide the minimum and maximum frequency limits and an additional
|
|
||||||
interface target_index() to set the current frequency. The cpufreq subsystem
|
|
||||||
has a number of scaling governors ("performance", "powersave", "ondemand",
|
|
||||||
etc.). Depending on which governor is in use, cpufreq core will call for
|
|
||||||
transitions to a specific frequency using target_index() callback.
|
|
||||||
- setpolicy() callback: In this mode, drivers do not provide target_index()
|
|
||||||
callback, so cpufreq core can't request a transition to a specific frequency.
|
|
||||||
The driver provides minimum and maximum frequency limits and callbacks to set a
|
|
||||||
policy. The policy in cpufreq sysfs is referred to as the "scaling governor".
|
|
||||||
The cpufreq core can request the driver to operate in any of the two policies:
|
|
||||||
"performance" and "powersave". The driver decides which frequency to use based
|
|
||||||
on the above policy selection considering minimum and maximum frequency limits.
|
|
||||||
|
|
||||||
The Intel P-State driver falls under the latter category, which implements the
|
|
||||||
setpolicy() callback. This driver decides what P-State to use based on the
|
|
||||||
requested policy from the cpufreq core. If the processor is capable of
|
|
||||||
selecting its next P-State internally, then the driver will offload this
|
|
||||||
responsibility to the processor (aka HWP: Hardware P-States). If not, the
|
|
||||||
driver implements algorithms to select the next P-State.
|
|
||||||
|
|
||||||
Since these policies are implemented in the driver, they are not the same as the
|
|
||||||
cpufreq scaling governors implementation, even if they have the same name in
|
|
||||||
the cpufreq sysfs (scaling_governors). For example the "performance" policy is
|
|
||||||
similar to cpufreq’s "performance" governor, but "powersave" is completely
|
|
||||||
different than the cpufreq "powersave" governor. The strategy here is similar
|
|
||||||
to cpufreq "ondemand", where the requested P-State is related to the system load.
|
|
||||||
|
|
||||||
Sysfs Interface
|
|
||||||
|
|
||||||
In addition to the frequency-controlling interfaces provided by the cpufreq
|
|
||||||
core, the driver provides its own sysfs files to control the P-State selection.
|
|
||||||
These files have been added to /sys/devices/system/cpu/intel_pstate/.
|
|
||||||
Any changes made to these files are applicable to all CPUs (even in a
|
|
||||||
multi-package system, Refer to later section on placing "Per-CPU limits").
|
|
||||||
|
|
||||||
max_perf_pct: Limits the maximum P-State that will be requested by
|
|
||||||
the driver. It states it as a percentage of the available performance. The
|
|
||||||
available (P-State) performance may be reduced by the no_turbo
|
|
||||||
setting described below.
|
|
||||||
|
|
||||||
min_perf_pct: Limits the minimum P-State that will be requested by
|
|
||||||
the driver. It states it as a percentage of the max (non-turbo)
|
|
||||||
performance level.
|
|
||||||
|
|
||||||
no_turbo: Limits the driver to selecting P-State below the turbo
|
|
||||||
frequency range.
|
|
||||||
|
|
||||||
turbo_pct: Displays the percentage of the total performance that
|
|
||||||
is supported by hardware that is in the turbo range. This number
|
|
||||||
is independent of whether turbo has been disabled or not.
|
|
||||||
|
|
||||||
num_pstates: Displays the number of P-States that are supported
|
|
||||||
by hardware. This number is independent of whether turbo has
|
|
||||||
been disabled or not.
|
|
||||||
|
|
||||||
For example, if a system has these parameters:
|
|
||||||
Max 1 core turbo ratio: 0x21 (Max 1 core ratio is the maximum P-State)
|
|
||||||
Max non turbo ratio: 0x17
|
|
||||||
Minimum ratio : 0x08 (Here the ratio is called max efficiency ratio)
|
|
||||||
|
|
||||||
Sysfs will show :
|
|
||||||
max_perf_pct:100, which corresponds to 1 core ratio
|
|
||||||
min_perf_pct:24, max_efficiency_ratio / max 1 Core ratio
|
|
||||||
no_turbo:0, turbo is not disabled
|
|
||||||
num_pstates:26 = (max 1 Core ratio - Max Efficiency Ratio + 1)
|
|
||||||
turbo_pct:39 = (max 1 core ratio - max non turbo ratio) / num_pstates
|
|
||||||
|
|
||||||
Refer to "Intel® 64 and IA-32 Architectures Software Developer’s Manual
|
|
||||||
Volume 3: System Programming Guide" to understand ratios.
|
|
||||||
|
|
||||||
There is one more sysfs attribute in /sys/devices/system/cpu/intel_pstate/
|
|
||||||
that can be used for controlling the operation mode of the driver:
|
|
||||||
|
|
||||||
status: Three settings are possible:
|
|
||||||
"off" - The driver is not in use at this time.
|
|
||||||
"active" - The driver works as a P-state governor (default).
|
|
||||||
"passive" - The driver works as a regular cpufreq one and collaborates
|
|
||||||
with the generic cpufreq governors (it sets P-states as
|
|
||||||
requested by those governors).
|
|
||||||
The current setting is returned by reads from this attribute. Writing one
|
|
||||||
of the above strings to it changes the operation mode as indicated by that
|
|
||||||
string, if possible. If HW-managed P-states (HWP) are enabled, it is not
|
|
||||||
possible to change the driver's operation mode and attempts to write to
|
|
||||||
this attribute will fail.
|
|
||||||
|
|
||||||
cpufreq sysfs for Intel P-State
|
|
||||||
|
|
||||||
Since this driver registers with cpufreq, cpufreq sysfs is also presented.
|
|
||||||
There are some important differences, which need to be considered.
|
|
||||||
|
|
||||||
scaling_cur_freq: This displays the real frequency which was used during
|
|
||||||
the last sample period instead of what is requested. Some other cpufreq driver,
|
|
||||||
like acpi-cpufreq, displays what is requested (Some changes are on the
|
|
||||||
way to fix this for acpi-cpufreq driver). The same is true for frequencies
|
|
||||||
displayed at /proc/cpuinfo.
|
|
||||||
|
|
||||||
scaling_governor: This displays current active policy. Since each CPU has a
|
|
||||||
cpufreq sysfs, it is possible to set a scaling governor to each CPU. But this
|
|
||||||
is not possible with Intel P-States, as there is one common policy for all
|
|
||||||
CPUs. Here, the last requested policy will be applicable to all CPUs. It is
|
|
||||||
suggested that one use the cpupower utility to change policy to all CPUs at the
|
|
||||||
same time.
|
|
||||||
|
|
||||||
scaling_setspeed: This attribute can never be used with Intel P-State.
|
|
||||||
|
|
||||||
scaling_max_freq/scaling_min_freq: This interface can be used similarly to
|
|
||||||
the max_perf_pct/min_perf_pct of Intel P-State sysfs. However since frequencies
|
|
||||||
are converted to nearest possible P-State, this is prone to rounding errors.
|
|
||||||
This method is not preferred to limit performance.
|
|
||||||
|
|
||||||
affected_cpus: Not used
|
|
||||||
related_cpus: Not used
|
|
||||||
|
|
||||||
For contemporary Intel processors, the frequency is controlled by the
|
|
||||||
processor itself and the P-State exposed to software is related to
|
|
||||||
performance levels. The idea that frequency can be set to a single
|
|
||||||
frequency is fictional for Intel Core processors. Even if the scaling
|
|
||||||
driver selects a single P-State, the actual frequency the processor
|
|
||||||
will run at is selected by the processor itself.
|
|
||||||
|
|
||||||
Per-CPU limits
|
|
||||||
|
|
||||||
The kernel command line option "intel_pstate=per_cpu_perf_limits" forces
|
|
||||||
the intel_pstate driver to use per-CPU performance limits. When it is set,
|
|
||||||
the sysfs control interface described above is subject to limitations.
|
|
||||||
- The following controls are not available for both read and write
|
|
||||||
/sys/devices/system/cpu/intel_pstate/max_perf_pct
|
|
||||||
/sys/devices/system/cpu/intel_pstate/min_perf_pct
|
|
||||||
- The following controls can be used to set performance limits, as far as the
|
|
||||||
architecture of the processor permits:
|
|
||||||
/sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq
|
|
||||||
/sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq
|
|
||||||
/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
|
|
||||||
- User can still observe turbo percent and number of P-States from
|
|
||||||
/sys/devices/system/cpu/intel_pstate/turbo_pct
|
|
||||||
/sys/devices/system/cpu/intel_pstate/num_pstates
|
|
||||||
- User can read and write system wide turbo status
|
|
||||||
/sys/devices/system/cpu/no_turbo
|
|
||||||
|
|
||||||
Support of energy performance hints
|
|
||||||
It is possible to provide hints to the HWP algorithms in the processor
|
|
||||||
to be anywhere from more performance centric to more energy centric. When the driver
|
|
||||||
is using HWP, two additional cpufreq sysfs attributes are presented for
|
|
||||||
each logical CPU.
|
|
||||||
These attributes are:
|
|
||||||
- energy_performance_available_preferences
|
|
||||||
- energy_performance_preference
|
|
||||||
|
|
||||||
To get list of supported hints:
|
|
||||||
$ cat energy_performance_available_preferences
|
|
||||||
default performance balance_performance balance_power power
|
|
||||||
|
|
||||||
The current preference can be read or changed via cpufreq sysfs
|
|
||||||
attribute "energy_performance_preference". Reading from this attribute
|
|
||||||
will display current effective setting. User can write any of the valid
|
|
||||||
preference strings to this attribute. User can always restore to power-on
|
|
||||||
default by writing "default".
|
|
||||||
|
|
||||||
Since threads can migrate to different CPUs, it is possible that the
|
|
||||||
new CPU may have different energy performance preference than the previous
|
|
||||||
one. To avoid such issues, either threads can be pinned to specific CPUs
|
|
||||||
or set the same energy performance preference value to all CPUs.
|
|
||||||
|
|
||||||
Tuning Intel P-State driver
|
|
||||||
|
|
||||||
When the performance can be tuned using PID (Proportional Integral
|
|
||||||
Derivative) controller, debugfs files are provided for adjusting performance.
|
|
||||||
They are presented under:
|
|
||||||
/sys/kernel/debug/pstate_snb/
|
|
||||||
|
|
||||||
The PID tunable parameters are:
|
|
||||||
deadband
|
|
||||||
d_gain_pct
|
|
||||||
i_gain_pct
|
|
||||||
p_gain_pct
|
|
||||||
sample_rate_ms
|
|
||||||
setpoint
|
|
||||||
|
|
||||||
To adjust these parameters, some understanding of driver implementation is
|
|
||||||
necessary. There are some tweaks described here, but be very careful. Adjusting
|
|
||||||
them requires expert level understanding of power and performance relationship.
|
|
||||||
These limits are only useful when the "powersave" policy is active.
|
|
||||||
|
|
||||||
-To make the system more responsive to load changes, sample_rate_ms can
|
|
||||||
be adjusted (current default is 10ms).
|
|
||||||
-To make the system use higher performance, even if the load is lower, setpoint
|
|
||||||
can be adjusted to a lower number. This will also lead to faster ramp up time
|
|
||||||
to reach the maximum P-State.
|
|
||||||
If there are no derivative and integral coefficients, the next P-State will be
|
|
||||||
equal to:
|
|
||||||
current P-State - ((setpoint - current cpu load) * p_gain_pct)
|
|
||||||
|
|
||||||
For example, if the current PID parameters are (Which are defaults for the core
|
|
||||||
processors like SandyBridge):
|
|
||||||
deadband = 0
|
|
||||||
d_gain_pct = 0
|
|
||||||
i_gain_pct = 0
|
|
||||||
p_gain_pct = 20
|
|
||||||
sample_rate_ms = 10
|
|
||||||
setpoint = 97
|
|
||||||
|
|
||||||
If the current P-State = 0x08 and current load = 100, this will result in the
|
|
||||||
next P-State = 0x08 - ((97 - 100) * 0.2) = 8.6 (rounded to 9). Here the P-State
|
|
||||||
goes up by only 1. If during next sample interval the current load doesn't
|
|
||||||
change and is still 100, then P-State goes up by one again. This process will
|
|
||||||
continue as long as the load is more than the setpoint until the maximum P-State
|
|
||||||
is reached.
|
|
||||||
|
|
||||||
For the same load at setpoint = 60, this will result in the next P-State
|
|
||||||
= 0x08 - ((60 - 100) * 0.2) = 16
|
|
||||||
So by changing the setpoint from 97 to 60, there is an increase of the
|
|
||||||
next P-State from 9 to 16. So this will make processor execute at higher
|
|
||||||
P-State for the same CPU load. If the load continues to be more than the
|
|
||||||
setpoint during next sample intervals, then P-State will go up again till the
|
|
||||||
maximum P-State is reached. But the ramp up time to reach the maximum P-State
|
|
||||||
will be much faster when the setpoint is 60 compared to 97.
|
|
||||||
|
|
||||||
Debugging Intel P-State driver
|
|
||||||
|
|
||||||
Event tracing
|
|
||||||
To debug P-State transition, the Linux event tracing interface can be used.
|
|
||||||
There are two specific events, which can be enabled (Provided the kernel
|
|
||||||
configs related to event tracing are enabled).
|
|
||||||
|
|
||||||
# cd /sys/kernel/debug/tracing/
|
|
||||||
# echo 1 > events/power/pstate_sample/enable
|
|
||||||
# echo 1 > events/power/cpu_frequency/enable
|
|
||||||
# cat trace
|
|
||||||
gnome-terminal--4510 [001] ..s. 1177.680733: pstate_sample: core_busy=107
|
|
||||||
scaled=94 from=26 to=26 mperf=1143818 aperf=1230607 tsc=29838618
|
|
||||||
freq=2474476
|
|
||||||
cat-5235 [002] ..s. 1177.681723: cpu_frequency: state=2900000 cpu_id=2
|
|
||||||
|
|
||||||
|
|
||||||
Using ftrace
|
|
||||||
|
|
||||||
If function level tracing is required, the Linux ftrace interface can be used.
|
|
||||||
For example if we want to check how often a function to set a P-State is
|
|
||||||
called, we can set ftrace filter to intel_pstate_set_pstate.
|
|
||||||
|
|
||||||
# cd /sys/kernel/debug/tracing/
|
|
||||||
# cat available_filter_functions | grep -i pstate
|
|
||||||
intel_pstate_set_pstate
|
|
||||||
intel_pstate_cpu_init
|
|
||||||
...
|
|
||||||
|
|
||||||
# echo intel_pstate_set_pstate > set_ftrace_filter
|
|
||||||
# echo function > current_tracer
|
|
||||||
# cat trace | head -15
|
|
||||||
# tracer: function
|
|
||||||
#
|
|
||||||
# entries-in-buffer/entries-written: 80/80 #P:4
|
|
||||||
#
|
|
||||||
# _-----=> irqs-off
|
|
||||||
# / _----=> need-resched
|
|
||||||
# | / _---=> hardirq/softirq
|
|
||||||
# || / _--=> preempt-depth
|
|
||||||
# ||| / delay
|
|
||||||
# TASK-PID CPU# |||| TIMESTAMP FUNCTION
|
|
||||||
# | | | |||| | |
|
|
||||||
Xorg-3129 [000] ..s. 2537.644844: intel_pstate_set_pstate <-intel_pstate_timer_func
|
|
||||||
gnome-terminal--4510 [002] ..s. 2537.649844: intel_pstate_set_pstate <-intel_pstate_timer_func
|
|
||||||
gnome-shell-3409 [001] ..s. 2537.650850: intel_pstate_set_pstate <-intel_pstate_timer_func
|
|
||||||
<idle>-0 [000] ..s. 2537.654843: intel_pstate_set_pstate <-intel_pstate_timer_func
|
|
@@ -1,228 +0,0 @@
|
|||||||
CPU frequency and voltage scaling code in the Linux(TM) kernel
|
|
||||||
|
|
||||||
|
|
||||||
L i n u x C P U F r e q
|
|
||||||
|
|
||||||
U S E R G U I D E
|
|
||||||
|
|
||||||
|
|
||||||
Dominik Brodowski <linux@brodo.de>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Clock scaling allows you to change the clock speed of the CPUs on the
|
|
||||||
fly. This is a nice method to save battery power, because the lower
|
|
||||||
the clock speed, the less power the CPU consumes.
|
|
||||||
|
|
||||||
|
|
||||||
Contents:
|
|
||||||
---------
|
|
||||||
1. Supported Architectures and Processors
|
|
||||||
1.1 ARM and ARM64
|
|
||||||
1.2 x86
|
|
||||||
1.3 sparc64
|
|
||||||
1.4 ppc
|
|
||||||
1.5 SuperH
|
|
||||||
1.6 Blackfin
|
|
||||||
|
|
||||||
2. "Policy" / "Governor"?
|
|
||||||
2.1 Policy
|
|
||||||
2.2 Governor
|
|
||||||
|
|
||||||
3. How to change the CPU cpufreq policy and/or speed
|
|
||||||
3.1 Preferred interface: sysfs
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
1. Supported Architectures and Processors
|
|
||||||
=========================================
|
|
||||||
|
|
||||||
1.1 ARM and ARM64
|
|
||||||
-----------------
|
|
||||||
|
|
||||||
Almost all ARM and ARM64 platforms support CPU frequency scaling.
|
|
||||||
|
|
||||||
1.2 x86
|
|
||||||
-------
|
|
||||||
|
|
||||||
The following processors for the x86 architecture are supported by cpufreq:
|
|
||||||
|
|
||||||
AMD Elan - SC400, SC410
|
|
||||||
AMD mobile K6-2+
|
|
||||||
AMD mobile K6-3+
|
|
||||||
AMD mobile Duron
|
|
||||||
AMD mobile Athlon
|
|
||||||
AMD Opteron
|
|
||||||
AMD Athlon 64
|
|
||||||
Cyrix Media GXm
|
|
||||||
Intel mobile PIII and Intel mobile PIII-M on certain chipsets
|
|
||||||
Intel Pentium 4, Intel Xeon
|
|
||||||
Intel Pentium M (Centrino)
|
|
||||||
National Semiconductors Geode GX
|
|
||||||
Transmeta Crusoe
|
|
||||||
Transmeta Efficeon
|
|
||||||
VIA Cyrix 3 / C3
|
|
||||||
various processors on some ACPI 2.0-compatible systems [*]
|
|
||||||
And many more
|
|
||||||
|
|
||||||
[*] Only if "ACPI Processor Performance States" are available
|
|
||||||
to the ACPI<->BIOS interface.
|
|
||||||
|
|
||||||
|
|
||||||
1.3 sparc64
|
|
||||||
-----------
|
|
||||||
|
|
||||||
The following processors for the sparc64 architecture are supported by
|
|
||||||
cpufreq:
|
|
||||||
|
|
||||||
UltraSPARC-III
|
|
||||||
|
|
||||||
|
|
||||||
1.4 ppc
|
|
||||||
-------
|
|
||||||
|
|
||||||
Several "PowerBook" and "iBook2" notebooks are supported.
|
|
||||||
The following POWER processors are supported in powernv mode:
|
|
||||||
POWER8
|
|
||||||
POWER9
|
|
||||||
|
|
||||||
1.5 SuperH
|
|
||||||
----------
|
|
||||||
|
|
||||||
All SuperH processors supporting rate rounding through the clock
|
|
||||||
framework are supported by cpufreq.
|
|
||||||
|
|
||||||
1.6 Blackfin
|
|
||||||
------------
|
|
||||||
|
|
||||||
The following Blackfin processors are supported by cpufreq:
|
|
||||||
|
|
||||||
BF522, BF523, BF524, BF525, BF526, BF527, Rev 0.1 or higher
|
|
||||||
BF531, BF532, BF533, Rev 0.3 or higher
|
|
||||||
BF534, BF536, BF537, Rev 0.2 or higher
|
|
||||||
BF561, Rev 0.3 or higher
|
|
||||||
BF542, BF544, BF547, BF548, BF549, Rev 0.1 or higher
|
|
||||||
|
|
||||||
|
|
||||||
2. "Policy" / "Governor" ?
|
|
||||||
==========================
|
|
||||||
|
|
||||||
Some CPU frequency scaling-capable processor switch between various
|
|
||||||
frequencies and operating voltages "on the fly" without any kernel or
|
|
||||||
user involvement. This guarantees very fast switching to a frequency
|
|
||||||
which is high enough to serve the user's needs, but low enough to save
|
|
||||||
power.
|
|
||||||
|
|
||||||
|
|
||||||
2.1 Policy
|
|
||||||
----------
|
|
||||||
|
|
||||||
On these systems, all you can do is select the lower and upper
|
|
||||||
frequency limit as well as whether you want more aggressive
|
|
||||||
power-saving or more instantly available processing power.
|
|
||||||
|
|
||||||
|
|
||||||
2.2 Governor
|
|
||||||
------------
|
|
||||||
|
|
||||||
On all other cpufreq implementations, these boundaries still need to
|
|
||||||
be set. Then, a "governor" must be selected. Such a "governor" decides
|
|
||||||
what speed the processor shall run within the boundaries. One such
|
|
||||||
"governor" is the "userspace" governor. This one allows the user - or
|
|
||||||
a yet-to-implement userspace program - to decide what specific speed
|
|
||||||
the processor shall run at.
|
|
||||||
|
|
||||||
|
|
||||||
3. How to change the CPU cpufreq policy and/or speed
|
|
||||||
====================================================
|
|
||||||
|
|
||||||
3.1 Preferred Interface: sysfs
|
|
||||||
------------------------------
|
|
||||||
|
|
||||||
The preferred interface is located in the sysfs filesystem. If you
|
|
||||||
mounted it at /sys, the cpufreq interface is located in a subdirectory
|
|
||||||
"cpufreq" within the cpu-device directory
|
|
||||||
(e.g. /sys/devices/system/cpu/cpu0/cpufreq/ for the first CPU).
|
|
||||||
|
|
||||||
affected_cpus : List of Online CPUs that require software
|
|
||||||
coordination of frequency.
|
|
||||||
|
|
||||||
cpuinfo_cur_freq : Current frequency of the CPU as obtained from
|
|
||||||
the hardware, in KHz. This is the frequency
|
|
||||||
the CPU actually runs at.
|
|
||||||
|
|
||||||
cpuinfo_min_freq : this file shows the minimum operating
|
|
||||||
frequency the processor can run at(in kHz)
|
|
||||||
|
|
||||||
cpuinfo_max_freq : this file shows the maximum operating
|
|
||||||
frequency the processor can run at(in kHz)
|
|
||||||
|
|
||||||
cpuinfo_transition_latency The time it takes on this CPU to
|
|
||||||
switch between two frequencies in nano
|
|
||||||
seconds. If unknown or known to be
|
|
||||||
that high that the driver does not
|
|
||||||
work with the ondemand governor, -1
|
|
||||||
(CPUFREQ_ETERNAL) will be returned.
|
|
||||||
Using this information can be useful
|
|
||||||
to choose an appropriate polling
|
|
||||||
frequency for a kernel governor or
|
|
||||||
userspace daemon. Make sure to not
|
|
||||||
switch the frequency too often
|
|
||||||
resulting in performance loss.
|
|
||||||
|
|
||||||
related_cpus : List of Online + Offline CPUs that need software
|
|
||||||
coordination of frequency.
|
|
||||||
|
|
||||||
scaling_available_frequencies : List of available frequencies, in KHz.
|
|
||||||
|
|
||||||
scaling_available_governors : this file shows the CPUfreq governors
|
|
||||||
available in this kernel. You can see the
|
|
||||||
currently activated governor in
|
|
||||||
|
|
||||||
scaling_cur_freq : Current frequency of the CPU as determined by
|
|
||||||
the governor and cpufreq core, in KHz. This is
|
|
||||||
the frequency the kernel thinks the CPU runs
|
|
||||||
at.
|
|
||||||
|
|
||||||
scaling_driver : this file shows what cpufreq driver is
|
|
||||||
used to set the frequency on this CPU
|
|
||||||
|
|
||||||
scaling_governor, and by "echoing" the name of another
|
|
||||||
governor you can change it. Please note
|
|
||||||
that some governors won't load - they only
|
|
||||||
work on some specific architectures or
|
|
||||||
processors.
|
|
||||||
|
|
||||||
scaling_min_freq and
|
|
||||||
scaling_max_freq show the current "policy limits" (in
|
|
||||||
kHz). By echoing new values into these
|
|
||||||
files, you can change these limits.
|
|
||||||
NOTE: when setting a policy you need to
|
|
||||||
first set scaling_max_freq, then
|
|
||||||
scaling_min_freq.
|
|
||||||
|
|
||||||
scaling_setspeed This can be read to get the currently programmed
|
|
||||||
value by the governor. This can be written to
|
|
||||||
change the current frequency for a group of
|
|
||||||
CPUs, represented by a policy. This is supported
|
|
||||||
currently only by the userspace governor.
|
|
||||||
|
|
||||||
bios_limit : If the BIOS tells the OS to limit a CPU to
|
|
||||||
lower frequencies, the user can read out the
|
|
||||||
maximum available frequency from this file.
|
|
||||||
This typically can happen through (often not
|
|
||||||
intended) BIOS settings, restrictions
|
|
||||||
triggered through a service processor or other
|
|
||||||
BIOS/HW based implementations.
|
|
||||||
This does not cover thermal ACPI limitations
|
|
||||||
which can be detected through the generic
|
|
||||||
thermal driver.
|
|
||||||
|
|
||||||
If you have selected the "userspace" governor which allows you to
|
|
||||||
set the CPU operating frequency to a specific value, you can read out
|
|
||||||
the current frequency in
|
|
||||||
|
|
||||||
scaling_setspeed. By "echoing" a new frequency into this
|
|
||||||
you can change the speed of the CPU,
|
|
||||||
but only within the limits of
|
|
||||||
scaling_min_freq and scaling_max_freq.
|
|
@@ -100,7 +100,7 @@ not defined by include/asm-XXX/topology.h:
|
|||||||
|
|
||||||
For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
|
For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
|
||||||
default definitions for topology_book_id() and topology_book_cpumask().
|
default definitions for topology_book_id() and topology_book_cpumask().
|
||||||
For architectures that don't support drawes (CONFIG_SCHED_DRAWER) there are
|
For architectures that don't support drawers (CONFIG_SCHED_DRAWER) there are
|
||||||
no default definitions for topology_drawer_id() and topology_drawer_cpumask().
|
no default definitions for topology_drawer_id() and topology_drawer_cpumask().
|
||||||
|
|
||||||
Additionally, CPU topology information is provided under
|
Additionally, CPU topology information is provided under
|
||||||
|
@@ -155,9 +155,9 @@ Code Example For Use of Operational State Memory With SHASH
|
|||||||
char ctx[];
|
char ctx[];
|
||||||
};
|
};
|
||||||
|
|
||||||
static struct sdescinit_sdesc(struct crypto_shash *alg)
|
static struct sdesc init_sdesc(struct crypto_shash *alg)
|
||||||
{
|
{
|
||||||
struct sdescsdesc;
|
struct sdesc sdesc;
|
||||||
int size;
|
int size;
|
||||||
|
|
||||||
size = sizeof(struct shash_desc) + crypto_shash_descsize(alg);
|
size = sizeof(struct shash_desc) + crypto_shash_descsize(alg);
|
||||||
@@ -172,7 +172,7 @@ Code Example For Use of Operational State Memory With SHASH
|
|||||||
static int calc_hash(struct crypto_shashalg,
|
static int calc_hash(struct crypto_shashalg,
|
||||||
const unsigned chardata, unsigned int datalen,
|
const unsigned chardata, unsigned int datalen,
|
||||||
unsigned chardigest) {
|
unsigned chardigest) {
|
||||||
struct sdescsdesc;
|
struct sdesc sdesc;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
sdesc = init_sdesc(alg);
|
sdesc = init_sdesc(alg);
|
||||||
|
@@ -311,3 +311,54 @@ Functions are provided to register and unregister parsers:
|
|||||||
|
|
||||||
Parsers may not have the same name. The names are otherwise only used for
|
Parsers may not have the same name. The names are otherwise only used for
|
||||||
displaying in debugging messages.
|
displaying in debugging messages.
|
||||||
|
|
||||||
|
|
||||||
|
=========================
|
||||||
|
KEYRING LINK RESTRICTIONS
|
||||||
|
=========================
|
||||||
|
|
||||||
|
Keyrings created from userspace using add_key can be configured to check the
|
||||||
|
signature of the key being linked.
|
||||||
|
|
||||||
|
Several restriction methods are available:
|
||||||
|
|
||||||
|
(1) Restrict using the kernel builtin trusted keyring
|
||||||
|
|
||||||
|
- Option string used with KEYCTL_RESTRICT_KEYRING:
|
||||||
|
- "builtin_trusted"
|
||||||
|
|
||||||
|
The kernel builtin trusted keyring will be searched for the signing
|
||||||
|
key. The ca_keys kernel parameter also affects which keys are used for
|
||||||
|
signature verification.
|
||||||
|
|
||||||
|
(2) Restrict using the kernel builtin and secondary trusted keyrings
|
||||||
|
|
||||||
|
- Option string used with KEYCTL_RESTRICT_KEYRING:
|
||||||
|
- "builtin_and_secondary_trusted"
|
||||||
|
|
||||||
|
The kernel builtin and secondary trusted keyrings will be searched for the
|
||||||
|
signing key. The ca_keys kernel parameter also affects which keys are used
|
||||||
|
for signature verification.
|
||||||
|
|
||||||
|
(3) Restrict using a separate key or keyring
|
||||||
|
|
||||||
|
- Option string used with KEYCTL_RESTRICT_KEYRING:
|
||||||
|
- "key_or_keyring:<key or keyring serial number>[:chain]"
|
||||||
|
|
||||||
|
Whenever a key link is requested, the link will only succeed if the key
|
||||||
|
being linked is signed by one of the designated keys. This key may be
|
||||||
|
specified directly by providing a serial number for one asymmetric key, or
|
||||||
|
a group of keys may be searched for the signing key by providing the
|
||||||
|
serial number for a keyring.
|
||||||
|
|
||||||
|
When the "chain" option is provided at the end of the string, the keys
|
||||||
|
within the destination keyring will also be searched for signing keys.
|
||||||
|
This allows for verification of certificate chains by adding each
|
||||||
|
cert in order (starting closest to the root) to one keyring.
|
||||||
|
|
||||||
|
In all of these cases, if the signing key is found the signature of the key to
|
||||||
|
be linked will be verified using the signing key. The requested key is added
|
||||||
|
to the keyring only if the signature is successfully verified. -ENOKEY is
|
||||||
|
returned if the parent certificate could not be found, or -EKEYREJECTED is
|
||||||
|
returned if the signature check fails or the key is blacklisted. Other errors
|
||||||
|
may be returned if the signature check could not be performed.
|
||||||
|
@@ -100,8 +100,8 @@ Step-by-step instructions for using firescope with early OHCI initialization:
|
|||||||
CardBus and even some Express cards which are fully compliant to OHCI-1394
|
CardBus and even some Express cards which are fully compliant to OHCI-1394
|
||||||
specification are available. If it requires no driver for Windows operating
|
specification are available. If it requires no driver for Windows operating
|
||||||
systems, it most likely is. Only specialized shops have cards which are not
|
systems, it most likely is. Only specialized shops have cards which are not
|
||||||
compliant, they are based on TI PCILynx chips and require drivers for Win-
|
compliant, they are based on TI PCILynx chips and require drivers for Windows
|
||||||
dows operating systems.
|
operating systems.
|
||||||
|
|
||||||
The mentioned kernel log message contains the string "physUB" if the
|
The mentioned kernel log message contains the string "physUB" if the
|
||||||
controller implements a writable Physical Upper Bound register. This is
|
controller implements a writable Physical Upper Bound register. This is
|
||||||
|
@@ -290,7 +290,7 @@ message, which takes an arbitrary number of cblock ranges. Each cblock
|
|||||||
range's end value is "one past the end", meaning 5-10 expresses a range
|
range's end value is "one past the end", meaning 5-10 expresses a range
|
||||||
of values from 5 to 9. Each cblock must be expressed as a decimal
|
of values from 5 to 9. Each cblock must be expressed as a decimal
|
||||||
value, in the future a variant message that takes cblock ranges
|
value, in the future a variant message that takes cblock ranges
|
||||||
expressed in hexidecimal may be needed to better support efficient
|
expressed in hexadecimal may be needed to better support efficient
|
||||||
invalidation of larger caches. The cache must be in passthrough mode
|
invalidation of larger caches. The cache must be in passthrough mode
|
||||||
when invalidate_cblocks is used.
|
when invalidate_cblocks is used.
|
||||||
|
|
||||||
|
@@ -11,14 +11,31 @@ Parameters: <cipher> <key> <iv_offset> <device path> \
|
|||||||
<offset> [<#opt_params> <opt_params>]
|
<offset> [<#opt_params> <opt_params>]
|
||||||
|
|
||||||
<cipher>
|
<cipher>
|
||||||
Encryption cipher and an optional IV generation mode.
|
Encryption cipher, encryption mode and Initial Vector (IV) generator.
|
||||||
(In format cipher[:keycount]-chainmode-ivmode[:ivopts]).
|
|
||||||
Examples:
|
|
||||||
des
|
|
||||||
aes-cbc-essiv:sha256
|
|
||||||
twofish-ecb
|
|
||||||
|
|
||||||
/proc/crypto contains supported crypto modes
|
The cipher specifications format is:
|
||||||
|
cipher[:keycount]-chainmode-ivmode[:ivopts]
|
||||||
|
Examples:
|
||||||
|
aes-cbc-essiv:sha256
|
||||||
|
aes-xts-plain64
|
||||||
|
serpent-xts-plain64
|
||||||
|
|
||||||
|
Cipher format also supports direct specification with kernel crypt API
|
||||||
|
format (selected by capi: prefix). The IV specification is the same
|
||||||
|
as for the first format type.
|
||||||
|
This format is mainly used for specification of authenticated modes.
|
||||||
|
|
||||||
|
The crypto API cipher specifications format is:
|
||||||
|
capi:cipher_api_spec-ivmode[:ivopts]
|
||||||
|
Examples:
|
||||||
|
capi:cbc(aes)-essiv:sha256
|
||||||
|
capi:xts(aes)-plain64
|
||||||
|
Examples of authenticated modes:
|
||||||
|
capi:gcm(aes)-random
|
||||||
|
capi:authenc(hmac(sha256),xts(aes))-random
|
||||||
|
capi:rfc7539(chacha20,poly1305)-random
|
||||||
|
|
||||||
|
The /proc/crypto contains a list of currently loaded crypto modes.
|
||||||
|
|
||||||
<key>
|
<key>
|
||||||
Key used for encryption. It is encoded either as a hexadecimal number
|
Key used for encryption. It is encoded either as a hexadecimal number
|
||||||
@@ -93,6 +110,32 @@ submit_from_crypt_cpus
|
|||||||
thread because it benefits CFQ to have writes submitted using the
|
thread because it benefits CFQ to have writes submitted using the
|
||||||
same context.
|
same context.
|
||||||
|
|
||||||
|
integrity:<bytes>:<type>
|
||||||
|
The device requires additional <bytes> metadata per-sector stored
|
||||||
|
in per-bio integrity structure. This metadata must be provided
|
||||||
|
by underlying dm-integrity target.
|
||||||
|
|
||||||
|
The <type> can be "none" if metadata is used only for persistent IV.
|
||||||
|
|
||||||
|
For Authenticated Encryption with Additional Data (AEAD)
|
||||||
|
the <type> is "aead". An AEAD mode additionally calculates and verifies
|
||||||
|
integrity for the encrypted device. The additional space is then
|
||||||
|
used for storing authentication tag (and persistent IV if needed).
|
||||||
|
|
||||||
|
sector_size:<bytes>
|
||||||
|
Use <bytes> as the encryption unit instead of 512 bytes sectors.
|
||||||
|
This option can be in range 512 - 4096 bytes and must be a power of two.
|
||||||
|
Virtual device will announce this size as a minimal IO and logical sector.
|
||||||
|
|
||||||
|
iv_large_sectors
|
||||||
|
IV generators will use sector number counted in <sector_size> units
|
||||||
|
instead of default 512 bytes sectors.
|
||||||
|
|
||||||
|
For example, if <sector_size> is 4096 bytes, plain64 IV for the second
|
||||||
|
sector will be 8 (without flag) and 1 if iv_large_sectors is present.
|
||||||
|
The <iv_offset> must be multiple of <sector_size> (in 512 bytes units)
|
||||||
|
if this flag is specified.
|
||||||
|
|
||||||
Example scripts
|
Example scripts
|
||||||
===============
|
===============
|
||||||
LUKS (Linux Unified Key Setup) is now the preferred way to set up disk
|
LUKS (Linux Unified Key Setup) is now the preferred way to set up disk
|
||||||
|
199
Documentation/device-mapper/dm-integrity.txt
Normal file
199
Documentation/device-mapper/dm-integrity.txt
Normal file
@@ -0,0 +1,199 @@
|
|||||||
|
The dm-integrity target emulates a block device that has additional
|
||||||
|
per-sector tags that can be used for storing integrity information.
|
||||||
|
|
||||||
|
A general problem with storing integrity tags with every sector is that
|
||||||
|
writing the sector and the integrity tag must be atomic - i.e. in case of
|
||||||
|
crash, either both sector and integrity tag or none of them is written.
|
||||||
|
|
||||||
|
To guarantee write atomicity, the dm-integrity target uses journal, it
|
||||||
|
writes sector data and integrity tags into a journal, commits the journal
|
||||||
|
and then copies the data and integrity tags to their respective location.
|
||||||
|
|
||||||
|
The dm-integrity target can be used with the dm-crypt target - in this
|
||||||
|
situation the dm-crypt target creates the integrity data and passes them
|
||||||
|
to the dm-integrity target via bio_integrity_payload attached to the bio.
|
||||||
|
In this mode, the dm-crypt and dm-integrity targets provide authenticated
|
||||||
|
disk encryption - if the attacker modifies the encrypted device, an I/O
|
||||||
|
error is returned instead of random data.
|
||||||
|
|
||||||
|
The dm-integrity target can also be used as a standalone target, in this
|
||||||
|
mode it calculates and verifies the integrity tag internally. In this
|
||||||
|
mode, the dm-integrity target can be used to detect silent data
|
||||||
|
corruption on the disk or in the I/O path.
|
||||||
|
|
||||||
|
|
||||||
|
When loading the target for the first time, the kernel driver will format
|
||||||
|
the device. But it will only format the device if the superblock contains
|
||||||
|
zeroes. If the superblock is neither valid nor zeroed, the dm-integrity
|
||||||
|
target can't be loaded.
|
||||||
|
|
||||||
|
To use the target for the first time:
|
||||||
|
1. overwrite the superblock with zeroes
|
||||||
|
2. load the dm-integrity target with one-sector size, the kernel driver
|
||||||
|
will format the device
|
||||||
|
3. unload the dm-integrity target
|
||||||
|
4. read the "provided_data_sectors" value from the superblock
|
||||||
|
5. load the dm-integrity target with the target size
|
||||||
|
"provided_data_sectors"
|
||||||
|
6. if you want to use dm-integrity with dm-crypt, load the dm-crypt target
|
||||||
|
with the size "provided_data_sectors"
|
||||||
|
|
||||||
|
|
||||||
|
Target arguments:
|
||||||
|
|
||||||
|
1. the underlying block device
|
||||||
|
|
||||||
|
2. the number of reserved sectors at the beginning of the device - the
|
||||||
|
dm-integrity won't read or write these sectors
|
||||||
|
|
||||||
|
3. the size of the integrity tag (if "-" is used, the size is taken from
|
||||||
|
the internal-hash algorithm)
|
||||||
|
|
||||||
|
4. mode:
|
||||||
|
D - direct writes (without journal) - in this mode, journaling is
|
||||||
|
not used and data sectors and integrity tags are written
|
||||||
|
separately. In case of crash, it is possible that the data
|
||||||
|
and integrity tag don't match.
|
||||||
|
J - journaled writes - data and integrity tags are written to the
|
||||||
|
journal and atomicity is guaranteed. In case of crash,
|
||||||
|
either both data and tag or none of them are written. The
|
||||||
|
journaled mode degrades write throughput twice because the
|
||||||
|
data have to be written twice.
|
||||||
|
R - recovery mode - in this mode, journal is not replayed,
|
||||||
|
checksums are not checked and writes to the device are not
|
||||||
|
allowed. This mode is useful for data recovery if the
|
||||||
|
device cannot be activated in any of the other standard
|
||||||
|
modes.
|
||||||
|
|
||||||
|
5. the number of additional arguments
|
||||||
|
|
||||||
|
Additional arguments:
|
||||||
|
|
||||||
|
journal_sectors:number
|
||||||
|
The size of journal, this argument is used only if formatting the
|
||||||
|
device. If the device is already formatted, the value from the
|
||||||
|
superblock is used.
|
||||||
|
|
||||||
|
interleave_sectors:number
|
||||||
|
The number of interleaved sectors. This value is rounded down to
|
||||||
|
a power of two. If the device is already formatted, the value from
|
||||||
|
the superblock is used.
|
||||||
|
|
||||||
|
buffer_sectors:number
|
||||||
|
The number of sectors in one buffer. The value is rounded down to
|
||||||
|
a power of two.
|
||||||
|
|
||||||
|
The tag area is accessed using buffers, the buffer size is
|
||||||
|
configurable. The large buffer size means that the I/O size will
|
||||||
|
be larger, but there could be less I/Os issued.
|
||||||
|
|
||||||
|
journal_watermark:number
|
||||||
|
The journal watermark in percents. When the size of the journal
|
||||||
|
exceeds this watermark, the thread that flushes the journal will
|
||||||
|
be started.
|
||||||
|
|
||||||
|
commit_time:number
|
||||||
|
Commit time in milliseconds. When this time passes, the journal is
|
||||||
|
written. The journal is also written immediately if the FLUSH
|
||||||
|
request is received.
|
||||||
|
|
||||||
|
internal_hash:algorithm(:key) (the key is optional)
|
||||||
|
Use internal hash or crc.
|
||||||
|
When this argument is used, the dm-integrity target won't accept
|
||||||
|
integrity tags from the upper target, but it will automatically
|
||||||
|
generate and verify the integrity tags.
|
||||||
|
|
||||||
|
You can use a crc algorithm (such as crc32), then integrity target
|
||||||
|
will protect the data against accidental corruption.
|
||||||
|
You can also use a hmac algorithm (for example
|
||||||
|
"hmac(sha256):0123456789abcdef"), in this mode it will provide
|
||||||
|
cryptographic authentication of the data without encryption.
|
||||||
|
|
||||||
|
When this argument is not used, the integrity tags are accepted
|
||||||
|
from an upper layer target, such as dm-crypt. The upper layer
|
||||||
|
target should check the validity of the integrity tags.
|
||||||
|
|
||||||
|
journal_crypt:algorithm(:key) (the key is optional)
|
||||||
|
Encrypt the journal using given algorithm to make sure that the
|
||||||
|
attacker can't read the journal. You can use a block cipher here
|
||||||
|
(such as "cbc(aes)") or a stream cipher (for example "chacha20",
|
||||||
|
"salsa20", "ctr(aes)" or "ecb(arc4)").
|
||||||
|
|
||||||
|
The journal contains history of last writes to the block device,
|
||||||
|
an attacker reading the journal could see the last sector numbers
|
||||||
|
that were written. From the sector numbers, the attacker can infer
|
||||||
|
the size of files that were written. To protect against this
|
||||||
|
situation, you can encrypt the journal.
|
||||||
|
|
||||||
|
journal_mac:algorithm(:key) (the key is optional)
|
||||||
|
Protect sector numbers in the journal from accidental or malicious
|
||||||
|
modification. To protect against accidental modification, use a
|
||||||
|
crc algorithm, to protect against malicious modification, use a
|
||||||
|
hmac algorithm with a key.
|
||||||
|
|
||||||
|
This option is not needed when using internal-hash because in this
|
||||||
|
mode, the integrity of journal entries is checked when replaying
|
||||||
|
the journal. Thus, modified sector number would be detected at
|
||||||
|
this stage.
|
||||||
|
|
||||||
|
block_size:number
|
||||||
|
The size of a data block in bytes. The larger the block size the
|
||||||
|
less overhead there is for per-block integrity metadata.
|
||||||
|
Supported values are 512, 1024, 2048 and 4096 bytes. If not
|
||||||
|
specified the default block size is 512 bytes.
|
||||||
|
|
||||||
|
The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can
|
||||||
|
be changed when reloading the target (load an inactive table and swap the
|
||||||
|
tables with suspend and resume). The other arguments should not be changed
|
||||||
|
when reloading the target because the layout of disk data depends on them
|
||||||
|
and the reloaded target would be non-functional.
|
||||||
|
|
||||||
|
|
||||||
|
The layout of the formatted block device:
|
||||||
|
* reserved sectors (they are not used by this target, they can be used for
|
||||||
|
storing LUKS metadata or for other purpose), the size of the reserved
|
||||||
|
area is specified in the target arguments
|
||||||
|
* superblock (4kiB)
|
||||||
|
* magic string - identifies that the device was formatted
|
||||||
|
* version
|
||||||
|
* log2(interleave sectors)
|
||||||
|
* integrity tag size
|
||||||
|
* the number of journal sections
|
||||||
|
* provided data sectors - the number of sectors that this target
|
||||||
|
provides (i.e. the size of the device minus the size of all
|
||||||
|
metadata and padding). The user of this target should not send
|
||||||
|
bios that access data beyond the "provided data sectors" limit.
|
||||||
|
* flags - a flag is set if journal_mac is used
|
||||||
|
* journal
|
||||||
|
The journal is divided into sections, each section contains:
|
||||||
|
* metadata area (4kiB), it contains journal entries
|
||||||
|
every journal entry contains:
|
||||||
|
* logical sector (specifies where the data and tag should
|
||||||
|
be written)
|
||||||
|
* last 8 bytes of data
|
||||||
|
* integrity tag (the size is specified in the superblock)
|
||||||
|
every metadata sector ends with
|
||||||
|
* mac (8-bytes), all the macs in 8 metadata sectors form a
|
||||||
|
64-byte value. It is used to store hmac of sector
|
||||||
|
numbers in the journal section, to protect against a
|
||||||
|
possibility that the attacker tampers with sector
|
||||||
|
numbers in the journal.
|
||||||
|
* commit id
|
||||||
|
* data area (the size is variable; it depends on how many journal
|
||||||
|
entries fit into the metadata area)
|
||||||
|
every sector in the data area contains:
|
||||||
|
* data (504 bytes of data, the last 8 bytes are stored in
|
||||||
|
the journal entry)
|
||||||
|
* commit id
|
||||||
|
To test if the whole journal section was written correctly, every
|
||||||
|
512-byte sector of the journal ends with 8-byte commit id. If the
|
||||||
|
commit id matches on all sectors in a journal section, then it is
|
||||||
|
assumed that the section was written correctly. If the commit id
|
||||||
|
doesn't match, the section was written partially and it should not
|
||||||
|
be replayed.
|
||||||
|
* one or more runs of interleaved tags and data. Each run contains:
|
||||||
|
* tag area - it contains integrity tags. There is one tag for each
|
||||||
|
sector in the data area
|
||||||
|
* data area - it contains data sectors. The number of data sectors
|
||||||
|
in one run must be a power of two. log2 of this value is stored
|
||||||
|
in the superblock.
|
@@ -170,6 +170,13 @@ The target is named "raid" and it accepts the following parameters:
|
|||||||
Takeover/reshape is not possible with a raid4/5/6 journal device;
|
Takeover/reshape is not possible with a raid4/5/6 journal device;
|
||||||
it has to be deconfigured before requesting these.
|
it has to be deconfigured before requesting these.
|
||||||
|
|
||||||
|
[journal_mode <mode>]
|
||||||
|
This option sets the caching mode on journaled raid4/5/6 raid sets
|
||||||
|
(see 'journal_dev <dev>' above) to 'writethrough' or 'writeback'.
|
||||||
|
If 'writeback' is selected the journal device has to be resilient
|
||||||
|
and must not suffer from the 'write hole' problem itself (e.g. use
|
||||||
|
raid1 or raid10) to avoid a single point of failure.
|
||||||
|
|
||||||
<#raid_devs>: The number of devices composing the array.
|
<#raid_devs>: The number of devices composing the array.
|
||||||
Each device consists of two entries. The first is the device
|
Each device consists of two entries. The first is the device
|
||||||
containing the metadata (if any); the second is the one containing the
|
containing the metadata (if any); the second is the one containing the
|
||||||
@@ -254,7 +261,8 @@ recovery. Here is a fuller description of the individual fields:
|
|||||||
<data_offset> The current data offset to the start of the user data on
|
<data_offset> The current data offset to the start of the user data on
|
||||||
each component device of a raid set (see the respective
|
each component device of a raid set (see the respective
|
||||||
raid parameter to support out-of-place reshaping).
|
raid parameter to support out-of-place reshaping).
|
||||||
<journal_char> 'A' - active raid4/5/6 journal device.
|
<journal_char> 'A' - active write-through journal device.
|
||||||
|
'a' - active write-back journal device.
|
||||||
'D' - dead journal device.
|
'D' - dead journal device.
|
||||||
'-' - no journal device.
|
'-' - no journal device.
|
||||||
|
|
||||||
@@ -331,3 +339,7 @@ Version History
|
|||||||
'D' on the status line. If '- -' is passed into the constructor, emit
|
'D' on the status line. If '- -' is passed into the constructor, emit
|
||||||
'- -' on the table line and '-' as the status line health character.
|
'- -' on the table line and '-' as the status line health character.
|
||||||
1.10.0 Add support for raid4/5/6 journal device
|
1.10.0 Add support for raid4/5/6 journal device
|
||||||
|
1.10.1 Fix data corruption on reshape request
|
||||||
|
1.11.0 Fix table line argument order
|
||||||
|
(wrong raid10_copies/raid10_format sequence)
|
||||||
|
1.11.1 Add raid4/5/6 journal write-back support via journal_mode option
|
||||||
|
@@ -43,8 +43,11 @@ Board compatible values:
|
|||||||
- "wetek,hub" (Meson gxbb)
|
- "wetek,hub" (Meson gxbb)
|
||||||
- "wetek,play2" (Meson gxbb)
|
- "wetek,play2" (Meson gxbb)
|
||||||
- "amlogic,p212" (Meson gxl s905x)
|
- "amlogic,p212" (Meson gxl s905x)
|
||||||
|
- "khadas,vim" (Meson gxl s905x)
|
||||||
|
|
||||||
- "amlogic,p230" (Meson gxl s905d)
|
- "amlogic,p230" (Meson gxl s905d)
|
||||||
- "amlogic,p231" (Meson gxl s905d)
|
- "amlogic,p231" (Meson gxl s905d)
|
||||||
|
- "hwacom,amazetv" (Meson gxl s905x)
|
||||||
- "amlogic,q200" (Meson gxm s912)
|
- "amlogic,q200" (Meson gxm s912)
|
||||||
- "amlogic,q201" (Meson gxm s912)
|
- "amlogic,q201" (Meson gxm s912)
|
||||||
- "nexbox,a95x" (Meson gxbb or Meson gxl s905x)
|
- "nexbox,a95x" (Meson gxbb or Meson gxl s905x)
|
||||||
|
@@ -217,7 +217,8 @@ memory, bridge implementations, processor and other functionality not controlled
|
|||||||
elsewhere.
|
elsewhere.
|
||||||
|
|
||||||
required properties:
|
required properties:
|
||||||
- compatible: Should be "atmel,<chip>-sfr", "syscon".
|
- compatible: Should be "atmel,<chip>-sfr", "syscon" or
|
||||||
|
"atmel,<chip>-sfrbu", "syscon"
|
||||||
<chip> can be "sama5d3", "sama5d4" or "sama5d2".
|
<chip> can be "sama5d3", "sama5d4" or "sama5d2".
|
||||||
- reg: Should contain registers location and length
|
- reg: Should contain registers location and length
|
||||||
|
|
||||||
|
@@ -0,0 +1,8 @@
|
|||||||
|
Cavium ThunderX2 CN99XX platform tree bindings
|
||||||
|
----------------------------------------------
|
||||||
|
|
||||||
|
Boards with Cavium ThunderX2 CN99XX SoC shall have the root property:
|
||||||
|
compatible = "cavium,thunderx2-cn9900", "brcm,vulcan-soc";
|
||||||
|
|
||||||
|
This SoC uses the "cavium,thunder2" core which will be compatible
|
||||||
|
with "brcm,vulcan".
|
@@ -170,6 +170,7 @@ nodes to be present and contain the properties described below.
|
|||||||
"brcm,brahma-b15"
|
"brcm,brahma-b15"
|
||||||
"brcm,vulcan"
|
"brcm,vulcan"
|
||||||
"cavium,thunder"
|
"cavium,thunder"
|
||||||
|
"cavium,thunder2"
|
||||||
"faraday,fa526"
|
"faraday,fa526"
|
||||||
"intel,sa110"
|
"intel,sa110"
|
||||||
"intel,sa1100"
|
"intel,sa1100"
|
||||||
|
@@ -0,0 +1,31 @@
|
|||||||
|
OP-TEE Device Tree Bindings
|
||||||
|
|
||||||
|
OP-TEE is a piece of software using hardware features to provide a Trusted
|
||||||
|
Execution Environment. The security can be provided with ARM TrustZone, but
|
||||||
|
also by virtualization or a separate chip.
|
||||||
|
|
||||||
|
We're using "linaro" as the first part of the compatible property for
|
||||||
|
the reference implementation maintained by Linaro.
|
||||||
|
|
||||||
|
* OP-TEE based on ARM TrustZone required properties:
|
||||||
|
|
||||||
|
- compatible : should contain "linaro,optee-tz"
|
||||||
|
|
||||||
|
- method : The method of calling the OP-TEE Trusted OS. Permitted
|
||||||
|
values are:
|
||||||
|
|
||||||
|
"smc" : SMC #0, with the register assignments specified
|
||||||
|
in drivers/tee/optee/optee_smc.h
|
||||||
|
|
||||||
|
"hvc" : HVC #0, with the register assignments specified
|
||||||
|
in drivers/tee/optee/optee_smc.h
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Example:
|
||||||
|
firmware {
|
||||||
|
optee {
|
||||||
|
compatible = "linaro,optee-tz";
|
||||||
|
method = "smc";
|
||||||
|
};
|
||||||
|
};
|
@@ -179,6 +179,18 @@ LS1046A ARMv8 based RDB Board
|
|||||||
Required root node properties:
|
Required root node properties:
|
||||||
- compatible = "fsl,ls1046a-rdb", "fsl,ls1046a";
|
- compatible = "fsl,ls1046a-rdb", "fsl,ls1046a";
|
||||||
|
|
||||||
|
LS1088A SoC
|
||||||
|
Required root node properties:
|
||||||
|
- compatible = "fsl,ls1088a";
|
||||||
|
|
||||||
|
LS1088A ARMv8 based QDS Board
|
||||||
|
Required root node properties:
|
||||||
|
- compatible = "fsl,ls1088a-qds", "fsl,ls1088a";
|
||||||
|
|
||||||
|
LS1088A ARMv8 based RDB Board
|
||||||
|
Required root node properties:
|
||||||
|
- compatible = "fsl,ls1088a-rdb", "fsl,ls1088a";
|
||||||
|
|
||||||
LS2080A SoC
|
LS2080A SoC
|
||||||
Required root node properties:
|
Required root node properties:
|
||||||
- compatible = "fsl,ls2080a";
|
- compatible = "fsl,ls2080a";
|
||||||
@@ -195,3 +207,14 @@ LS2080A ARMv8 based RDB Board
|
|||||||
Required root node properties:
|
Required root node properties:
|
||||||
- compatible = "fsl,ls2080a-rdb", "fsl,ls2080a";
|
- compatible = "fsl,ls2080a-rdb", "fsl,ls2080a";
|
||||||
|
|
||||||
|
LS2088A SoC
|
||||||
|
Required root node properties:
|
||||||
|
- compatible = "fsl,ls2088a";
|
||||||
|
|
||||||
|
LS2088A ARMv8 based QDS Board
|
||||||
|
Required root node properties:
|
||||||
|
- compatible = "fsl,ls2088a-qds", "fsl,ls2088a";
|
||||||
|
|
||||||
|
LS2088A ARMv8 based RDB Board
|
||||||
|
Required root node properties:
|
||||||
|
- compatible = "fsl,ls2088a-rdb", "fsl,ls2088a";
|
||||||
|
86
Documentation/devicetree/bindings/arm/gemini.txt
Normal file
86
Documentation/devicetree/bindings/arm/gemini.txt
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
Cortina systems Gemini platforms
|
||||||
|
|
||||||
|
The Gemini SoC is the project name for an ARMv4 FA525-based SoC originally
|
||||||
|
produced by Storlink Semiconductor around 2005. The company was
|
||||||
|
later renamed Storm Semiconductor. The chip product name is Storlink SL3516.
|
||||||
|
It was derived from earlier products from Storm named SL3316 (Centroid) and
|
||||||
|
SL3512 (Bulverde).
|
||||||
|
|
||||||
|
Storm Semiconductor was acquired by Cortina Systems in 2008 and the SoC was
|
||||||
|
produced and used for NAS and similar usecases. In 2014 Cortina Systems was
|
||||||
|
in turn acquired by Inphi, who seem to have discontinued this product family.
|
||||||
|
|
||||||
|
Many of the IP blocks used in the SoC come from Faraday Technology.
|
||||||
|
|
||||||
|
Required properties (in root node):
|
||||||
|
compatible = "cortina,gemini";
|
||||||
|
|
||||||
|
Required nodes:
|
||||||
|
|
||||||
|
- soc: the SoC should be represented by a simple bus encompassing all the
|
||||||
|
onchip devices, this is referred to as the soc bus node.
|
||||||
|
|
||||||
|
- syscon: the soc bus node must have a system controller node pointing to the
|
||||||
|
global control registers, with the compatible string
|
||||||
|
"cortina,gemini-syscon", "syscon";
|
||||||
|
|
||||||
|
- timer: the soc bus node must have a timer node pointing to the SoC timer
|
||||||
|
block, with the compatible string "cortina,gemini-timer"
|
||||||
|
See: clocksource/cortina,gemini-timer.txt
|
||||||
|
|
||||||
|
- interrupt-controller: the soc bus node must have an interrupt controller
|
||||||
|
node pointing to the SoC interrupt controller block, with the compatible
|
||||||
|
string "cortina,gemini-interrupt-controller"
|
||||||
|
See interrupt-controller/cortina,gemini-interrupt-controller.txt
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
/ {
|
||||||
|
model = "Foo Gemini Machine";
|
||||||
|
compatible = "cortina,gemini";
|
||||||
|
#address-cells = <1>;
|
||||||
|
#size-cells = <1>;
|
||||||
|
|
||||||
|
memory {
|
||||||
|
device_type = "memory";
|
||||||
|
reg = <0x00000000 0x8000000>;
|
||||||
|
};
|
||||||
|
|
||||||
|
soc {
|
||||||
|
#address-cells = <1>;
|
||||||
|
#size-cells = <1>;
|
||||||
|
ranges;
|
||||||
|
compatible = "simple-bus";
|
||||||
|
interrupt-parent = <&intcon>;
|
||||||
|
|
||||||
|
syscon: syscon@40000000 {
|
||||||
|
compatible = "cortina,gemini-syscon", "syscon";
|
||||||
|
reg = <0x40000000 0x1000>;
|
||||||
|
};
|
||||||
|
|
||||||
|
uart0: serial@42000000 {
|
||||||
|
compatible = "ns16550a";
|
||||||
|
reg = <0x42000000 0x100>;
|
||||||
|
clock-frequency = <48000000>;
|
||||||
|
interrupts = <18 IRQ_TYPE_LEVEL_HIGH>;
|
||||||
|
reg-shift = <2>;
|
||||||
|
};
|
||||||
|
|
||||||
|
timer@43000000 {
|
||||||
|
compatible = "cortina,gemini-timer";
|
||||||
|
reg = <0x43000000 0x1000>;
|
||||||
|
interrupt-parent = <&intcon>;
|
||||||
|
interrupts = <14 IRQ_TYPE_EDGE_FALLING>, /* Timer 1 */
|
||||||
|
<15 IRQ_TYPE_EDGE_FALLING>, /* Timer 2 */
|
||||||
|
<16 IRQ_TYPE_EDGE_FALLING>; /* Timer 3 */
|
||||||
|
syscon = <&syscon>;
|
||||||
|
};
|
||||||
|
|
||||||
|
intcon: interrupt-controller@48000000 {
|
||||||
|
compatible = "cortina,gemini-interrupt-controller";
|
||||||
|
reg = <0x48000000 0x1000>;
|
||||||
|
interrupt-controller;
|
||||||
|
#interrupt-cells = <2>;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
@@ -4,6 +4,14 @@ Hi3660 SoC
|
|||||||
Required root node properties:
|
Required root node properties:
|
||||||
- compatible = "hisilicon,hi3660";
|
- compatible = "hisilicon,hi3660";
|
||||||
|
|
||||||
|
Hi3798cv200 SoC
|
||||||
|
Required root node properties:
|
||||||
|
- compatible = "hisilicon,hi3798cv200";
|
||||||
|
|
||||||
|
Hi3798cv200 Poplar Board
|
||||||
|
Required root node properties:
|
||||||
|
- compatible = "hisilicon,hi3798cv200-poplar", "hisilicon,hi3798cv200";
|
||||||
|
|
||||||
Hi4511 Board
|
Hi4511 Board
|
||||||
Required root node properties:
|
Required root node properties:
|
||||||
- compatible = "hisilicon,hi3620-hi4511";
|
- compatible = "hisilicon,hi3620-hi4511";
|
||||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user