Merge branch 'tbsdtv_linux_media/master' into tbsdtv_linux_media/latest

Conflicts: drivers/media/dvb-core/dvb_ca_en50221.c drivers/media/usb/cx231xx/Kconfig drivers/media/usb/cx231xx/cx231xx-cards.c drivers/media/usb/cx231xx/cx231xx-dvb.c drivers/media/usb/cx231xx/cx231xx.h
2025-07-23 12:43:29 +02:00 · 2017-07-12 03:25:36 +03:00
parent 31df469c9e 2748e76ddb
commit 597f4b6034
12594 changed files with 1352893 additions and 290008 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -33,6 +33,7 @@
 *.lzo
 *.patch
 *.gcno
 *.ll
 modules.builtin
 Module.symvers
 *.dwo
--- a/.mailmap
+++ b/.mailmap
@@ -99,6 +99,8 @@ Linas Vepstas <linas@austin.ibm.com>
 Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@web.de>
 Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@ascom.ch>
 Mark Brown <broonie@sirena.org.uk>
 Martin Kepplinger <martink@posteo.de> <martin.kepplinger@theobroma-systems.com>
 Martin Kepplinger <martink@posteo.de> <martin.kepplinger@ginzinger.com>
 Matthieu CASTET <castet.matthieu@free.fr>
 Mauro Carvalho Chehab <mchehab@kernel.org> <mchehab@brturbo.com.br>
 Mauro Carvalho Chehab <mchehab@kernel.org> <maurochehab@gmail.com>
@@ -109,6 +111,7 @@ Mauro Carvalho Chehab <mchehab@kernel.org> <mchehab@osg.samsung.com>
 Mauro Carvalho Chehab <mchehab@kernel.org> <mchehab@s-opensource.com>
 Matt Ranostay <mranostay@gmail.com> Matthew Ranostay <mranostay@embeddedalley.com>
 Matt Ranostay <mranostay@gmail.com> <matt.ranostay@intel.com>
 Matt Ranostay <matt.ranostay@konsulko.com> <matt@ranostay.consulting>
 Mayuresh Janorkar <mayur@ti.com>
 Michael Buesch <m@bues.ch>
 Michel Dänzer <michel@tungstengraphics.com>
@@ -143,6 +146,8 @@ Santosh Shilimkar <ssantosh@kernel.org>
 Santosh Shilimkar <santosh.shilimkar@oracle.org>
 Sascha Hauer <s.hauer@pengutronix.de>
 S.Çağlar Onur <caglar@pardus.org.tr>
 Sebastian Reichel <sre@kernel.org> <sre@debian.org>
 Sebastian Reichel <sre@kernel.org> <sebastian.reichel@collabora.co.uk>
 Shiraz Hashim <shiraz.linux.kernel@gmail.com> <shiraz.hashim@st.com>
 Shuah Khan <shuah@kernel.org> <shuahkhan@gmail.com>
 Shuah Khan <shuah@kernel.org> <shuah.khan@hp.com>
@@ -171,6 +176,7 @@ Vlad Dogaru <ddvlad@gmail.com> <vlad.dogaru@intel.com>
 Vladimir Davydov <vdavydov.dev@gmail.com> <vdavydov@virtuozzo.com>
 Vladimir Davydov <vdavydov.dev@gmail.com> <vdavydov@parallels.com>
 Takashi YOSHII <takashi.yoshii.zj@renesas.com>
 Yakir Yang <kuankuan.y@gmail.com> <ykk@rock-chips.com>
 Yusuke Goda <goda.yusuke@renesas.com>
 Gustavo Padovan <gustavo@las.ic.unicamp.br>
 Gustavo Padovan <padovan@profusion.mobi>
--- a/CREDITS
+++ b/CREDITS
@@ -1034,6 +1034,10 @@ S: 2037 Walnut #6
 S: Boulder, Colorado 80302
 S: USA
 N: Hans-Christian Noren Egtvedt
 E: egtvedt@samfundet.no
 D: AVR32 architecture maintainer.
 N: Heiko Eißfeldt
 E: heiko@colossus.escape.de heiko@unifix.de
 D: verify_area stuff, generic SCSI fixes
@@ -3398,6 +3402,10 @@ S: Suite 101
 S: Markham, Ontario L3R 2Z6
 S: Canada
 N: Haavard Skinnemoen
 M: Haavard Skinnemoen <hskinnemoen@gmail.com>
 D: AVR32 architecture port to Linux and maintainer.
 N: Rick Sladkey
 E: jrs@world.std.com
 D: utility hacker: Emacs, NFS server, mount, kmem-ps, UPS debugger, strace, GDB
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -412,6 +412,8 @@ sysctl/
 	- directory with info on the /proc/sys/* files.
 target/
 	- directory with info on generating TCM v4 fabric .ko modules
 tee.txt
 	- info on the TEE subsystem and drivers
 this_cpu_ops.txt
 	- List rationale behind and the way to use this_cpu operations.
 thermal/
--- a/Documentation/ABI/obsolete/sysfs-firmware-acpi
+++ b/Documentation/ABI/obsolete/sysfs-firmware-acpi
@@ -0,0 +1,8 @@
 What:		/sys/firmware/acpi/hotplug/force_remove
 Date:		Mar 2017
 Contact:	Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 Description:
 		Since the force_remove is inherently broken and dangerous to
 		use for some hotpluggable resources like memory (because ignoring
 		the offline failure might lead to memory corruption and crashes)
 		enabling this knob is not safe and thus unsupported.
--- a/Documentation/ABI/stable/sysfs-bus-usb
+++ b/Documentation/ABI/stable/sysfs-bus-usb
@@ -9,7 +9,7 @@ Description:
 		hubs this facility is always enabled and their device
 		directories will not contain this file.
-		For more information, see Documentation/usb/persist.txt.
+		For more information, see Documentation/driver-api/usb/persist.rst.
 What:		/sys/bus/usb/devices/.../power/autosuspend
 Date:		March 2007
--- a/Documentation/ABI/stable/vdso
+++ b/Documentation/ABI/stable/vdso
@@ -16,7 +16,8 @@ The vDSO uses symbol versioning; whenever you request a symbol from the
 vDSO, specify the version you are expecting.
 Programs that dynamically link to glibc will use the vDSO automatically.
-Otherwise, you can use the reference parser in Documentation/vDSO/parse_vdso.c.
+Otherwise, you can use the reference parser in
 tools/testing/selftests/vDSO/parse_vdso.c.
 Unless otherwise noted, the set of symbols with any given version and the
 ABI of those symbols is considered stable.  It may vary across architectures,
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -213,14 +213,8 @@ What:		/sys/block/<disk>/queue/discard_zeroes_data
 Date:		May 2011
 Contact:	Martin K. Petersen <martin.petersen@oracle.com>
 Description:
-		Devices that support discard functionality may return
+		Will always return 0.  Don't rely on any specific behavior
-		stale or random data when a previously discarded block
+		for discards, and don't read this file.
 		is read back. This can cause problems if the filesystem
 		expects discarded blocks to be explicitly cleared. If a
 		device reports that it deterministically returns zeroes
 		when a discarded area is read the discard_zeroes_data
 		parameter will be set to one. Otherwise it will be 0 and
 		the result of reading a discarded area is undefined.
 What:		/sys/block/<disk>/queue/write_same_max_bytes
 Date:		January 2012
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -55,6 +55,7 @@ Description:
 		then it is to be found in the base device directory.
 What:		/sys/bus/iio/devices/iio:deviceX/sampling_frequency_available
 What:		/sys/bus/iio/devices/iio:deviceX/in_proximity_sampling_frequency_available
 What:		/sys/.../iio:deviceX/buffer/sampling_frequency_available
 What:		/sys/bus/iio/devices/triggerX/sampling_frequency_available
 KernelVersion:	2.6.35
@@ -1593,7 +1594,7 @@ Description:
 		can be processed to siemens per meter.
 What:		/sys/bus/iio/devices/iio:deviceX/in_countY_raw
-KernelVersion:	4.9
+KernelVersion:	4.10
 Contact:	linux-iio@vger.kernel.org
 Description:
 		Raw counter device counts from channel Y. For quadrature
@@ -1601,10 +1602,24 @@ Description:
 		the counts of a single quadrature signal phase from channel Y.
 What:		/sys/bus/iio/devices/iio:deviceX/in_indexY_raw
-KernelVersion:	4.9
+KernelVersion:	4.10
 Contact:	linux-iio@vger.kernel.org
 Description:
 		Raw counter device index value from channel Y. This attribute
 		provides an absolute positional reference (e.g. a pulse once per
 		revolution) which may be used to home positional systems as
 		required.
 What:		/sys/bus/iio/devices/iio:deviceX/in_count_count_direction_available
 KernelVersion:	4.12
 Contact:	linux-iio@vger.kernel.org
 Description:
 		A list of possible counting directions which are:
 		- "up"	: counter device is increasing.
 		- "down": counter device is decreasing.
 What:		/sys/bus/iio/devices/iio:deviceX/in_countY_count_direction
 KernelVersion:	4.12
 Contact:	linux-iio@vger.kernel.org
 Description:
 		Raw counter device counters direction for channel Y.
--- a/Documentation/ABI/testing/sysfs-bus-iio-adc-max9611
+++ b/Documentation/ABI/testing/sysfs-bus-iio-adc-max9611
@@ -0,0 +1,17 @@
 What:		/sys/bus/iio/devices/iio:deviceX/in_power_shunt_resistor
 Date:		March 2017
 KernelVersion:	4.12
 Contact:	linux-iio@vger.kernel.org
 Description: 	The value of the shunt resistor used to compute power drain on
                common input voltage pin (RS+). In Ohms.
 What:		/sys/bus/iio/devices/iio:deviceX/in_current_shunt_resistor
 Date:		March 2017
 KernelVersion:	4.12
 Contact:	linux-iio@vger.kernel.org
 Description: 	The value of the shunt resistor used to compute current flowing
                between RS+ and RS- voltage sense inputs. In Ohms.
 These attributes describe a single physical component, exposed as two distinct
 attributes as it is used to calculate two different values: power load and
 current flowing between RS+ and RS- inputs.
--- a/Documentation/ABI/testing/sysfs-bus-iio-counter-104-quad-8
+++ b/Documentation/ABI/testing/sysfs-bus-iio-counter-104-quad-8
@@ -1,24 +1,16 @@
 What:		/sys/bus/iio/devices/iio:deviceX/in_count_count_direction_available
 What:		/sys/bus/iio/devices/iio:deviceX/in_count_count_mode_available
 What:		/sys/bus/iio/devices/iio:deviceX/in_count_noise_error_available
 What:		/sys/bus/iio/devices/iio:deviceX/in_count_quadrature_mode_available
 What:		/sys/bus/iio/devices/iio:deviceX/in_index_index_polarity_available
 What:		/sys/bus/iio/devices/iio:deviceX/in_index_synchronous_mode_available
-KernelVersion:	4.9
+KernelVersion:	4.10
 Contact:	linux-iio@vger.kernel.org
 Description:
 		Discrete set of available values for the respective counter
 		configuration are listed in this file.
 What:		/sys/bus/iio/devices/iio:deviceX/in_countY_count_direction
 KernelVersion:	4.9
 Contact:	linux-iio@vger.kernel.org
 Description:
 		Read-only attribute that indicates whether the counter for
 		channel Y is counting up or down.
 What:		/sys/bus/iio/devices/iio:deviceX/in_countY_count_mode
-KernelVersion:	4.9
+KernelVersion:	4.10
 Contact:	linux-iio@vger.kernel.org
 Description:
 		Count mode for channel Y. Four count modes are available:
@@ -52,7 +44,7 @@ Description:
 			continuously throughout.
 What:		/sys/bus/iio/devices/iio:deviceX/in_countY_noise_error
-KernelVersion:	4.9
+KernelVersion:	4.10
 Contact:	linux-iio@vger.kernel.org
 Description:
 		Read-only attribute that indicates whether excessive noise is
@@ -60,14 +52,14 @@ Description:
 		irrelevant in non-quadrature clock mode.
 What:		/sys/bus/iio/devices/iio:deviceX/in_countY_preset
-KernelVersion:	4.9
+KernelVersion:	4.10
 Contact:	linux-iio@vger.kernel.org
 Description:
 		If the counter device supports preset registers, the preset
 		count for channel Y is provided by this attribute.
 What:		/sys/bus/iio/devices/iio:deviceX/in_countY_quadrature_mode
-KernelVersion:	4.9
+KernelVersion:	4.10
 Contact:	linux-iio@vger.kernel.org
 Description:
 		Configure channel Y counter for non-quadrature or quadrature
@@ -88,7 +80,7 @@ Description:
 			decoded for UP/DN clock.
 What:		/sys/bus/iio/devices/iio:deviceX/in_countY_set_to_preset_on_index
-KernelVersion:	4.9
+KernelVersion:	4.10
 Contact:	linux-iio@vger.kernel.org
 Description:
 		Whether to set channel Y counter with channel Y preset value
@@ -96,14 +88,14 @@ Description:
 		Valid attribute values are boolean.
 What:		/sys/bus/iio/devices/iio:deviceX/in_indexY_index_polarity
-KernelVersion:	4.9
+KernelVersion:	4.10
 Contact:	linux-iio@vger.kernel.org
 Description:
 		Active level of channel Y index input; irrelevant in
 		non-synchronous load mode.
 What:		/sys/bus/iio/devices/iio:deviceX/in_indexY_synchronous_mode
-KernelVersion:	4.9
+KernelVersion:	4.10
 Contact:	linux-iio@vger.kernel.org
 Description:
 		Configure channel Y counter for non-synchronous or synchronous
--- a/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32
+++ b/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32
@@ -3,11 +3,15 @@ KernelVersion:	4.11
 Contact:	benjamin.gaignard@st.com
 Description:
 		Reading returns the list of possible master modes which are:
-		- "reset"     :	The UG bit from the TIMx_EGR register is used as trigger output (TRGO).
+		- "reset"     :	The UG bit from the TIMx_EGR register is
-		- "enable"    : The Counter Enable signal CNT_EN is used as trigger output.
+				used as trigger output (TRGO).
 		- "enable"    : The Counter Enable signal CNT_EN is used
 				as trigger output.
 		- "update"    : The update event is selected as trigger output.
-				For instance a master timer can then be used as a prescaler for a slave timer.
+				For instance a master timer can then be used
-		- "compare_pulse" : The trigger output send a positive pulse when the CC1IF flag is to be set.
+				as a prescaler for a slave timer.
 		- "compare_pulse" : The trigger output send a positive pulse
 				    when the CC1IF flag is to be set.
 		- "OC1REF"    : OC1REF signal is used as trigger output.
 		- "OC2REF"    : OC2REF signal is used as trigger output.
 		- "OC3REF"    : OC3REF signal is used as trigger output.
@@ -27,3 +31,62 @@ Description:
 		Reading returns the current sampling frequency.
 		Writing a value different from 0 sets and starts sampling.
 		Writing 0 stops sampling.
 What:		/sys/bus/iio/devices/iio:deviceX/in_count0_preset
 KernelVersion:	4.12
 Contact:	benjamin.gaignard@st.com
 Description:
 		Reading returns the current preset value.
 		Writing sets the preset value.
 		When counting up the counter starts from 0 and fires an
 		event when it reaches the preset value.
 		When counting down the counter starts from the preset value
 		and fires an event when it reaches 0.
 What:		/sys/bus/iio/devices/iio:deviceX/in_count_quadrature_mode_available
 KernelVersion:	4.12
 Contact:	benjamin.gaignard@st.com
 Description:
 		Reading returns the list of possible quadrature modes.
 What:		/sys/bus/iio/devices/iio:deviceX/in_count0_quadrature_mode
 KernelVersion:	4.12
 Contact:	benjamin.gaignard@st.com
 Description:
 		Configure the device counter quadrature modes:
 		channel_A:
 			Encoder A input serves as the count input and B as
 			the UP/DOWN direction control input.
 		channel_B:
 			Encoder B input serves as the count input and A as
 			the UP/DOWN direction control input.
 		quadrature:
 			Encoder A and B inputs are mixed to get direction
 			and count with a scale of 0.25.
 What:		/sys/bus/iio/devices/iio:deviceX/in_count_enable_mode_available
 KernelVersion:	4.12
 Contact:	benjamin.gaignard@st.com
 Description:
 		Reading returns the list of possible enable modes.
 What:		/sys/bus/iio/devices/iio:deviceX/in_count0_enable_mode
 KernelVersion:	4.12
 Contact:	benjamin.gaignard@st.com
 Description:
 		Configure the device counter enable modes, in all cases
 		counting direction is set by in_count0_count_direction
 		attribute and the counter is clocked by the internal clock.
 		always:
 			Counter is always ON.
 		gated:
 			Counting is enabled when connected trigger signal
 			level is high else counting is disabled.
 		triggered:
 			Counting is enabled on rising edge of the connected
 			trigger, and remains enabled for the duration of this
 			selected mode.
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -299,5 +299,27 @@ What:		/sys/bus/pci/devices/.../revision
 Date:		November 2016
 Contact:	Emil Velikov <emil.l.velikov@gmail.com>
 Description:
-		This file contains the revision field of the the PCI device.
+		This file contains the revision field of the PCI device.
 		The value comes from device config space. The file is read only.
 What:		/sys/bus/pci/devices/.../sriov_drivers_autoprobe
 Date:		April 2017
 Contact:	Bodong Wang <bodong@mellanox.com>
 Description:
 		This file is associated with the PF of a device that
 		supports SR-IOV.  It determines whether newly-enabled VFs
 		are immediately bound to a driver.  It initially contains
 		1, which means the kernel automatically binds VFs to a
 		compatible driver immediately after they are enabled.  If
 		an application writes 0 to the file before enabling VFs,
 		the kernel will not bind VFs to a driver.
 		A typical use case is to write 0 to this file, then enable
 		VFs, then assign the newly-created VFs to virtual machines.
 		Note that changing this file does not affect already-
 		enabled VFs.  In this scenario, the user must first disable
 		the VFs, write 0 to sriov_drivers_autoprobe, then re-enable
 		the VFs.
 		This is similar to /sys/bus/pci/drivers_autoprobe, but
 		affects only the VFs associated with a specific PF.
--- a/Documentation/ABI/testing/sysfs-class-net-qmi
+++ b/Documentation/ABI/testing/sysfs-class-net-qmi
@@ -21,3 +21,30 @@ Description:
 		is responsible for coordination of driver and firmware
 		link framing mode, changing this setting to 'Y' if the
 		firmware is configured for 'raw-ip' mode.
 What:		/sys/class/net/<iface>/qmi/add_mux
 Date:		March 2017
 KernelVersion:	4.11
 Contact:	Bjørn Mork <bjorn@mork.no>
 Description:
 		Unsigned integer.
 		Write a number ranging from 1 to 127 to add a qmap mux
 		based network device, supported by recent Qualcomm based
 		modems.
 		The network device will be called qmimux.
 		Userspace is in charge of managing the qmux network device
 		activation and data stream setup on the modem side by
 		using the proper QMI protocol requests.
 What:		/sys/class/net/<iface>/qmi/del_mux
 Date:		March 2017
 KernelVersion:	4.11
 Contact:	Bjørn Mork <bjorn@mork.no>
 Description:
 		Unsigned integer.
 		Write a number ranging from 1 to 127 to delete a previously
 		created qmap mux based network device.
--- a/Documentation/ABI/testing/sysfs-class-switchtec
+++ b/Documentation/ABI/testing/sysfs-class-switchtec
@@ -0,0 +1,96 @@
 switchtec - Microsemi Switchtec PCI Switch Management Endpoint
 For details on this subsystem look at Documentation/switchtec.txt.
 What: 		/sys/class/switchtec
 Date:		05-Jan-2017
 KernelVersion:	v4.11
 Contact:	Logan Gunthorpe <logang@deltatee.com>
 Description:	The switchtec class subsystem folder.
 		Each registered switchtec driver is represented by a switchtecX
 		subfolder (X being an integer >= 0).
 What:		/sys/class/switchtec/switchtec[0-9]+/component_id
 Date:		05-Jan-2017
 KernelVersion:	v4.11
 Contact:	Logan Gunthorpe <logang@deltatee.com>
 Description:	Component identifier as stored in the hardware (eg. PM8543)
 		(read only)
 Values: 	arbitrary string.
 What:		/sys/class/switchtec/switchtec[0-9]+/component_revision
 Date:		05-Jan-2017
 KernelVersion:	v4.11
 Contact:	Logan Gunthorpe <logang@deltatee.com>
 Description:	Component revision stored in the hardware (read only)
 Values: 	integer.
 What:		/sys/class/switchtec/switchtec[0-9]+/component_vendor
 Date:		05-Jan-2017
 KernelVersion:	v4.11
 Contact:	Logan Gunthorpe <logang@deltatee.com>
 Description:	Component vendor as stored in the hardware (eg. MICROSEM)
 		(read only)
 Values: 	arbitrary string.
 What:		/sys/class/switchtec/switchtec[0-9]+/device_version
 Date:		05-Jan-2017
 KernelVersion:	v4.11
 Contact:	Logan Gunthorpe <logang@deltatee.com>
 Description:	Device version as stored in the hardware (read only)
 Values: 	integer.
 What:		/sys/class/switchtec/switchtec[0-9]+/fw_version
 Date:		05-Jan-2017
 KernelVersion:	v4.11
 Contact:	Logan Gunthorpe <logang@deltatee.com>
 Description:	Currently running firmware version (read only)
 Values: 	integer (in hexadecimal).
 What:		/sys/class/switchtec/switchtec[0-9]+/partition
 Date:		05-Jan-2017
 KernelVersion:	v4.11
 Contact:	Logan Gunthorpe <logang@deltatee.com>
 Description:	Partition number for this device in the switch (read only)
 Values: 	integer.
 What:		/sys/class/switchtec/switchtec[0-9]+/partition_count
 Date:		05-Jan-2017
 KernelVersion:	v4.11
 Contact:	Logan Gunthorpe <logang@deltatee.com>
 Description:	Total number of partitions in the switch (read only)
 Values: 	integer.
 What:		/sys/class/switchtec/switchtec[0-9]+/product_id
 Date:		05-Jan-2017
 KernelVersion:	v4.11
 Contact:	Logan Gunthorpe <logang@deltatee.com>
 Description:	Product identifier as stored in the hardware (eg. PSX 48XG3)
 		(read only)
 Values: 	arbitrary string.
 What:		/sys/class/switchtec/switchtec[0-9]+/product_revision
 Date:		05-Jan-2017
 KernelVersion:	v4.11
 Contact:	Logan Gunthorpe <logang@deltatee.com>
 Description:	Product revision stored in the hardware (eg. RevB)
 		(read only)
 Values: 	arbitrary string.
 What:		/sys/class/switchtec/switchtec[0-9]+/product_vendor
 Date:		05-Jan-2017
 KernelVersion:	v4.11
 Contact:	Logan Gunthorpe <logang@deltatee.com>
 Description:	Product vendor as stored in the hardware (eg. MICROSEM)
 		(read only)
 Values: 	arbitrary string.
--- a/Documentation/ABI/testing/sysfs-class-typec
+++ b/Documentation/ABI/testing/sysfs-class-typec
@@ -0,0 +1,276 @@
 USB Type-C port devices (eg. /sys/class/typec/port0/)
 What:		/sys/class/typec/<port>/data_role
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		The supported USB data roles. This attribute can be used for
 		requesting data role swapping on the port. Swapping is supported
 		as synchronous operation, so write(2) to the attribute will not
 		return until the operation has finished. The attribute is
 		notified about role changes so that poll(2) on the attribute
 		wakes up. Change on the role will also generate uevent
 		KOBJ_CHANGE on the port. The current role is shown in brackets,
 		for example "[host] device" when DRP port is in host mode.
 		Valid values: host, device
 What:		/sys/class/typec/<port>/power_role
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		The supported power roles. This attribute can be used to request
 		power role swap on the port when the port supports USB Power
 		Delivery. Swapping is supported as synchronous operation, so
 		write(2) to the attribute will not return until the operation
 		has finished. The attribute is notified about role changes so
 		that poll(2) on the attribute wakes up. Change on the role will
 		also generate uevent KOBJ_CHANGE. The current role is shown in
 		brackets, for example "[source] sink" when in source mode.
 		Valid values: source, sink
 What:		/sys/class/typec/<port>/vconn_source
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		Shows if the port is the VCONN Source. This attribute can be used to
 		request VCONN swap to change the VCONN Source during connection
 		when both the port and the partner support USB Power Delivery.
 		Swapping is supported as synchronous operation, so write(2) to
 		the attribute will not return until the operation has finished.
 		The attribute is notified about VCONN source changes so that
 		poll(2) on the attribute wakes up. Change on VCONN source also
 		generates uevent KOBJ_CHANGE.
 		Valid values:
 		- "no" when the port is not the VCONN Source
 		- "yes" when the port is the VCONN Source
 What:		/sys/class/typec/<port>/power_operation_mode
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		Shows the current power operational mode the port is in. The
 		power operation mode means current level for VBUS. In case USB
 		Power Delivery communication is used for negotiating the levels,
 		power operation mode should show "usb_power_delivery".
 		Valid values:
 		- default
 		- 1.5A
 		- 3.0A
 		- usb_power_delivery
 What:		/sys/class/typec/<port>/preferred_role
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		The user space can notify the driver about the preferred role.
 		It should be handled as enabling of Try.SRC or Try.SNK, as
 		defined in USB Type-C specification, in the port drivers. By
 		default the preferred role should come from the platform.
 		Valid values: source, sink, none (to remove preference)
 What:		/sys/class/typec/<port>/supported_accessory_modes
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		Space separated list of accessory modes, defined in the USB
 		Type-C specification, the port supports.
 What:		/sys/class/typec/<port>/usb_power_delivery_revision
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		Revision number of the supported USB Power Delivery
 		specification, or 0 when USB Power Delivery is not supported.
 What:		/sys/class/typec/<port>/usb_typec_revision
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		Revision number of the supported USB Type-C specification.
 USB Type-C partner devices (eg. /sys/class/typec/port0-partner/)
 What:		/sys/class/typec/<port>-partner/accessory_mode
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		Shows the Accessory Mode name when the partner is an Accessory.
 		The Accessory Modes are defined in USB Type-C Specification.
 What:		/sys/class/typec/<port>-partner/supports_usb_power_delivery
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		Shows if the partner supports USB Power Delivery communication:
 		Valid values: yes, no
 What:		/sys/class/typec/<port>-partner/identity/
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		This directory appears only if the port device driver is capable
 		of showing the result of Discover Identity USB power delivery
 		command. That will not always be possible even when USB power
 		delivery is supported, for example when USB power delivery
 		communication for the port is mostly handled in firmware. If the
 		directory exists, it will have an attribute file for every VDO
 		in Discover Identity command result.
 What:		/sys/class/typec/<port>-partner/identity/id_header
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		ID Header VDO part of Discover Identity command result. The
 		value will show 0 until Discover Identity command result becomes
 		available. The value can be polled.
 What:		/sys/class/typec/<port>-partner/identity/cert_stat
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		Cert Stat VDO part of Discover Identity command result. The
 		value will show 0 until Discover Identity command result becomes
 		available. The value can be polled.
 What:		/sys/class/typec/<port>-partner/identity/product
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		Product VDO part of Discover Identity command result. The value
 		will show 0 until Discover Identity command result becomes
 		available. The value can be polled.
 USB Type-C cable devices (eg. /sys/class/typec/port0-cable/)
 Note: Electronically Marked Cables will have a device also for one cable plug
 (eg. /sys/class/typec/port0-plug0). If the cable is active and has also SOP
 Double Prime controller (USB Power Delivery specification ch. 2.4) it will have
 second device also for the other plug. Both plugs may have alternate modes as
 described in USB Type-C and USB Power Delivery specifications.
 What:		/sys/class/typec/<port>-cable/type
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		Shows if the cable is active.
 		Valid values: active, passive
 What:		/sys/class/typec/<port>-cable/plug_type
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		Shows type of the plug on the cable:
 		- type-a - Standard A
 		- type-b - Standard B
 		- type-c
 		- captive
 What:		/sys/class/typec/<port>-cable/identity/
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		This directory appears only if the port device driver is capable
 		of showing the result of Discover Identity USB power delivery
 		command. That will not always be possible even when USB power
 		delivery is supported. If the directory exists, it will have an
 		attribute for every VDO returned by Discover Identity command.
 What:		/sys/class/typec/<port>-cable/identity/id_header
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		ID Header VDO part of Discover Identity command result. The
 		value will show 0 until Discover Identity command result becomes
 		available. The value can be polled.
 What:		/sys/class/typec/<port>-cable/identity/cert_stat
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		Cert Stat VDO part of Discover Identity command result. The
 		value will show 0 until Discover Identity command result becomes
 		available. The value can be polled.
 What:		/sys/class/typec/<port>-cable/identity/product
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		Product VDO part of Discover Identity command result. The value
 		will show 0 until Discover Identity command result becomes
 		available. The value can be polled.
 Alternate Mode devices.
 The alternate modes will have Standard or Vendor ID (SVID) assigned by USB-IF.
 The ports, partners and cable plugs can have alternate modes. A supported SVID
 will consist of a set of modes. Every SVID a port/partner/plug supports will
 have a device created for it, and every supported mode for a supported SVID will
 have its own directory under that device. Below <dev> refers to the device for
 the alternate mode.
 What:		/sys/class/typec/<port|partner|cable>/<dev>/svid
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		The SVID (Standard or Vendor ID) assigned by USB-IF for this
 		alternate mode.
 What:		/sys/class/typec/<port|partner|cable>/<dev>/mode<index>/
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		Every supported mode will have its own directory. The name of
 		a mode will be "mode<index>" (for example mode1), where <index>
 		is the actual index to the mode VDO returned by Discover Modes
 		USB power delivery command.
 What:		/sys/class/typec/<port|partner|cable>/<dev>/mode<index>/description
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		Shows description of the mode. The description is optional for
 		the drivers, just like with the Billboard Devices.
 What:		/sys/class/typec/<port|partner|cable>/<dev>/mode<index>/vdo
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		Shows the VDO in hexadecimal returned by Discover Modes command
 		for this mode.
 What:		/sys/class/typec/<port|partner|cable>/<dev>/mode<index>/active
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		Shows if the mode is active or not. The attribute can be used
 		for entering/exiting the mode with partners and cable plugs, and
 		with the port alternate modes it can be used for disabling
 		support for specific alternate modes. Entering/exiting modes is
 		supported as synchronous operation so write(2) to the attribute
 		does not return until the enter/exit mode operation has
 		finished. The attribute is notified when the mode is
 		entered/exited so poll(2) on the attribute wakes up.
 		Entering/exiting a mode will also generate uevent KOBJ_CHANGE.
 		Valid values: yes, no
 What:		/sys/class/typec/<port>/<dev>/mode<index>/supported_roles
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		Space separated list of the supported roles.
 		This attribute is available for the devices describing the
 		alternate modes a port supports, and it will not be exposed with
 		the devices presenting the alternate modes the partners or cable
 		plugs support.
 		Valid values: source, sink
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -366,3 +366,10 @@ Contact:	Linux ARM Kernel Mailing list <linux-arm-kernel@lists.infradead.org>
 Description:	AArch64 CPU registers
 		'identification' directory exposes the CPU ID registers for
 		 identifying model and revision of the CPU.
 What:		/sys/devices/system/cpu/cpu#/cpu_capacity
 Date:		December 2016
 Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
 Description:	information about CPUs heterogeneity.
 		cpu_capacity: capacity of cpu#.
--- a/Documentation/ABI/testing/sysfs-firmware-acpi
+++ b/Documentation/ABI/testing/sysfs-firmware-acpi
@@ -44,16 +44,6 @@ Description:
 		or 0 (unset).  Attempts to write any other values to it will
 		cause -EINVAL to be returned.
 What:		/sys/firmware/acpi/hotplug/force_remove
 Date:		May 2013
 Contact:	Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 Description:
 		The number in this file (0 or 1) determines whether (1) or not
 		(0) the ACPI subsystem will allow devices to be hot-removed even
 		if they cannot be put offline gracefully (from the kernel's
 		viewpoint).  That number can be changed by writing a boolean
 		value to this file.
 What:		/sys/firmware/acpi/interrupts/
 Date:		February 2008
 Contact:	Len Brown <lenb@kernel.org>
--- a/Documentation/ABI/testing/sysfs-kernel-livepatch
+++ b/Documentation/ABI/testing/sysfs-kernel-livepatch
@@ -25,6 +25,14 @@ Description:
 		code is currently applied.  Writing 0 will disable the patch
 		while writing 1 will re-enable the patch.
 What:		/sys/kernel/livepatch/<patch>/transition
 Date:		Feb 2017
 KernelVersion:	4.12.0
 Contact:	live-patching@vger.kernel.org
 Description:
 		An attribute which indicates whether the patch is currently in
 		transition.
 What:		/sys/kernel/livepatch/<patch>/<object>
 Date:		Nov 2014
 KernelVersion:	3.19.0
--- a/Documentation/ABI/testing/sysfs-platform-chipidea-usb2
+++ b/Documentation/ABI/testing/sysfs-platform-chipidea-usb2
@@ -0,0 +1,9 @@
 What:		/sys/bus/platform/devices/ci_hdrc.0/role
 Date:		Mar 2017
 Contact:	Peter Chen <peter.chen@nxp.com>
 Description:
 		Reading this file returns the string "gadget" or "host",
 		indicating the current controller role.
 		Writing "gadget" or "host" to it switches the role.
 		Only controllers in dual-role configuration support writing.
--- a/Documentation/ABI/testing/sysfs-platform-renesas_usb3
+++ b/Documentation/ABI/testing/sysfs-platform-renesas_usb3
@@ -0,0 +1,15 @@
 What:		/sys/devices/platform/<renesas_usb3's name>/role
 Date:		March 2017
 KernelVersion:	4.13
 Contact:	Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
 Description:
 		This file can be read and written.
 		The file can show/change the drd mode of usb.
 		Write the following string to change the mode:
 		 "host" - switching mode from peripheral to host.
 		 "peripheral" - switching mode from host to peripheral.
 		Read the file, then it shows the following strings:
 		 "host" - The mode is host now.
 		 "peripheral" - The mode is peripheral now.
--- a/Documentation/DocBook/Makefile
+++ b/Documentation/DocBook/Makefile
@@ -8,12 +8,11 @@
 DOCBOOKS := z8530book.xml  \
 	    kernel-hacking.xml kernel-locking.xml \
-	    writing_usb_driver.xml networking.xml \
+	    networking.xml \
-	    kernel-api.xml filesystems.xml lsm.xml kgdb.xml \
+	    filesystems.xml lsm.xml kgdb.xml \
-	    gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \
+	    libata.xml mtdnand.xml librs.xml rapidio.xml \
-	    genericirq.xml s390-drivers.xml scsi.xml \
+	    s390-drivers.xml scsi.xml \
-	    sh.xml w1.xml \
+	    sh.xml w1.xml
 	    writing_musb_glue_layer.xml
 ifeq ($(DOCBOOKS),)
@@ -62,11 +61,14 @@ MAN := $(patsubst %.xml, %.9, $(BOOKS))
 mandocs: $(MAN)
 	find $(obj)/man -name '*.9' | xargs gzip -nf
 # Default location for installed man pages
 export INSTALL_MAN_PATH = $(objtree)/usr
 installmandocs: mandocs
-	mkdir -p /usr/local/man/man9/
+	mkdir -p $(INSTALL_MAN_PATH)/man/man9/
 	find $(obj)/man -name '*.9.gz' -printf '%h %f\n' | \
 		sort -k 2 -k 1 | uniq -f 1 | sed -e 's: :/:' | \
-		xargs install -m 644 -t /usr/local/man/man9/
+		xargs install -m 644 -t $(INSTALL_MAN_PATH)/man/man9/
 # no-op for the DocBook toolchain
 epubdocs:
@@ -238,7 +240,9 @@ dochelp:
 	@echo  '  psdocs          - Postscript'
 	@echo  '  xmldocs         - XML DocBook'
 	@echo  '  mandocs         - man pages'
-	@echo  '  installmandocs  - install man pages generated by mandocs'
+	@echo  '  installmandocs  - install man pages generated by mandocs to INSTALL_MAN_PATH'; \
 	 echo  '                    (default: $(INSTALL_MAN_PATH))'; \
 	 echo  ''
 	@echo  '  cleandocs       - clean all generated DocBook files'
 	@echo
 	@echo  '  make DOCBOOKS="s1.xml s2.xml" [target] Generate only docs s1.xml s2.xml'
--- a/Documentation/DocBook/gadget.tmpl
+++ b/Documentation/DocBook/gadget.tmpl
@@ -1,793 +0,0 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
 	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
 <book id="USB-Gadget-API">
  <bookinfo>
    <title>USB Gadget API for Linux</title>
    <date>20 August 2004</date>
    <edition>20 August 2004</edition>
    <legalnotice>
       <para>
 	 This documentation is free software; you can redistribute
 	 it and/or modify it under the terms of the GNU General Public
 	 License as published by the Free Software Foundation; either
 	 version 2 of the License, or (at your option) any later
 	 version.
       </para>
       <para>
 	 This program is distributed in the hope that it will be
 	 useful, but WITHOUT ANY WARRANTY; without even the implied
 	 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 	 See the GNU General Public License for more details.
       </para>
       <para>
 	 You should have received a copy of the GNU General Public
 	 License along with this program; if not, write to the Free
 	 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 	 MA 02111-1307 USA
       </para>
       <para>
 	 For more details see the file COPYING in the source
 	 distribution of Linux.
       </para>
    </legalnotice>
    <copyright>
      <year>2003-2004</year>
      <holder>David Brownell</holder>
    </copyright>
    <author>
      <firstname>David</firstname> 
      <surname>Brownell</surname>
      <affiliation>
        <address><email>dbrownell@users.sourceforge.net</email></address>
      </affiliation>
    </author>
  </bookinfo>
 <toc></toc>
 <chapter id="intro"><title>Introduction</title>
 <para>This document presents a Linux-USB "Gadget"
 kernel mode
 API, for use within peripherals and other USB devices
 that embed Linux.
 It provides an overview of the API structure,
 and shows how that fits into a system development project.
 This is the first such API released on Linux to address
 a number of important problems, including: </para>
 <itemizedlist>
    <listitem><para>Supports USB 2.0, for high speed devices which
 	can stream data at several dozen megabytes per second.
 	</para></listitem>
    <listitem><para>Handles devices with dozens of endpoints just as
 	well as ones with just two fixed-function ones.  Gadget drivers
 	can be written so they're easy to port to new hardware.
 	</para></listitem>
    <listitem><para>Flexible enough to expose more complex USB device
 	capabilities such as multiple configurations, multiple interfaces,
 	composite devices,
 	and alternate interface settings.
 	</para></listitem>
    <listitem><para>USB "On-The-Go" (OTG) support, in conjunction
 	with updates to the Linux-USB host side.
 	</para></listitem>
    <listitem><para>Sharing data structures and API models with the
 	Linux-USB host side API.  This helps the OTG support, and
 	looks forward to more-symmetric frameworks (where the same
 	I/O model is used by both host and device side drivers).
 	</para></listitem>
    <listitem><para>Minimalist, so it's easier to support new device
 	controller hardware.  I/O processing doesn't imply large
 	demands for memory or CPU resources.
 	</para></listitem>
 </itemizedlist>
 <para>Most Linux developers will not be able to use this API, since they
 have USB "host" hardware in a PC, workstation, or server.
 Linux users with embedded systems are more likely to
 have USB peripheral hardware.
 To distinguish drivers running inside such hardware from the
 more familiar Linux "USB device drivers",
 which are host side proxies for the real USB devices,
 a different term is used:
 the drivers inside the peripherals are "USB gadget drivers".
 In USB protocol interactions, the device driver is the master
 (or "client driver")
 and the gadget driver is the slave (or "function driver").
 </para>
 <para>The gadget API resembles the host side Linux-USB API in that both
 use queues of request objects to package I/O buffers, and those requests
 may be submitted or canceled.
 They share common definitions for the standard USB
 <emphasis>Chapter 9</emphasis> messages, structures, and constants.
 Also, both APIs bind and unbind drivers to devices.
 The APIs differ in detail, since the host side's current
 URB framework exposes a number of implementation details
 and assumptions that are inappropriate for a gadget API.
 While the model for control transfers and configuration
 management is necessarily different (one side is a hardware-neutral master,
 the other is a hardware-aware slave), the endpoint I/O API used here
 should also be usable for an overhead-reduced host side API.
 </para>
 </chapter>
 <chapter id="structure"><title>Structure of Gadget Drivers</title>
 <para>A system running inside a USB peripheral
 normally has at least three layers inside the kernel to handle
 USB protocol processing, and may have additional layers in
 user space code.
 The "gadget" API is used by the middle layer to interact
 with the lowest level (which directly handles hardware).
 </para>
 <para>In Linux, from the bottom up, these layers are:
 </para>
 <variablelist>
    <varlistentry>
        <term><emphasis>USB Controller Driver</emphasis></term>
 	<listitem>
 	<para>This is the lowest software level.
 	It is the only layer that talks to hardware,
 	through registers, fifos, dma, irqs, and the like.
 	The <filename>&lt;linux/usb/gadget.h&gt;</filename> API abstracts
 	the peripheral controller endpoint hardware.
 	That hardware is exposed through endpoint objects, which accept
 	streams of IN/OUT buffers, and through callbacks that interact
 	with gadget drivers.
 	Since normal USB devices only have one upstream
 	port, they only have one of these drivers.
 	The controller driver can support any number of different
 	gadget drivers, but only one of them can be used at a time.
 	</para>
 	<para>Examples of such controller hardware include
 	the PCI-based NetChip 2280 USB 2.0 high speed controller,
 	the SA-11x0 or PXA-25x UDC (found within many PDAs),
 	and a variety of other products.
 	</para>
 	</listitem></varlistentry>
    <varlistentry>
 	<term><emphasis>Gadget Driver</emphasis></term>
 	<listitem>
 	<para>The lower boundary of this driver implements hardware-neutral
 	USB functions, using calls to the controller driver.
 	Because such hardware varies widely in capabilities and restrictions,
 	and is used in embedded environments where space is at a premium,
 	the gadget driver is often configured at compile time
 	to work with endpoints supported by one particular controller.
 	Gadget drivers may be portable to several different controllers,
 	using conditional compilation.
 	(Recent kernels substantially simplify the work involved in
 	supporting new hardware, by <emphasis>autoconfiguring</emphasis>
 	endpoints automatically for many bulk-oriented drivers.)
 	Gadget driver responsibilities include:
 	</para>
 	<itemizedlist>
 	    <listitem><para>handling setup requests (ep0 protocol responses)
 		possibly including class-specific functionality
 		</para></listitem>
 	    <listitem><para>returning configuration and string descriptors
 		</para></listitem>
 	    <listitem><para>(re)setting configurations and interface
 		altsettings, including enabling and configuring endpoints
 		</para></listitem>
 	    <listitem><para>handling life cycle events, such as managing
 		bindings to hardware,
 		USB suspend/resume, remote wakeup,
 		and disconnection from the USB host.
 		</para></listitem>
 	    <listitem><para>managing IN and OUT transfers on all currently
 		enabled endpoints
 		</para></listitem>
 	</itemizedlist>
 	<para>
 	Such drivers may be modules of proprietary code, although
 	that approach is discouraged in the Linux community.
 	</para>
 	</listitem></varlistentry>
    <varlistentry>
 	<term><emphasis>Upper Level</emphasis></term>
 	<listitem>
 	<para>Most gadget drivers have an upper boundary that connects
 	to some Linux driver or framework in Linux.
 	Through that boundary flows the data which the gadget driver
 	produces and/or consumes through protocol transfers over USB.
 	Examples include:
 	</para>
 	<itemizedlist>
 	    <listitem><para>user mode code, using generic (gadgetfs)
 	        or application specific files in
 		<filename>/dev</filename>
 		</para></listitem>
 	    <listitem><para>networking subsystem (for network gadgets,
 		like the CDC Ethernet Model gadget driver)
 		</para></listitem>
 	    <listitem><para>data capture drivers, perhaps video4Linux or
 		 a scanner driver; or test and measurement hardware.
 		 </para></listitem>
 	    <listitem><para>input subsystem (for HID gadgets)
 		</para></listitem>
 	    <listitem><para>sound subsystem (for audio gadgets)
 		</para></listitem>
 	    <listitem><para>file system (for PTP gadgets)
 		</para></listitem>
 	    <listitem><para>block i/o subsystem (for usb-storage gadgets)
 		</para></listitem>
 	    <listitem><para>... and more </para></listitem>
 	</itemizedlist>
 	</listitem></varlistentry>
    <varlistentry>
 	<term><emphasis>Additional Layers</emphasis></term>
 	<listitem>
 	<para>Other layers may exist.
 	These could include kernel layers, such as network protocol stacks,
 	as well as user mode applications building on standard POSIX
 	system call APIs such as
 	<emphasis>open()</emphasis>, <emphasis>close()</emphasis>,
 	<emphasis>read()</emphasis> and <emphasis>write()</emphasis>.
 	On newer systems, POSIX Async I/O calls may be an option.
 	Such user mode code will not necessarily be subject to
 	the GNU General Public License (GPL).
 	</para>
 	</listitem></varlistentry>
 </variablelist>
 <para>OTG-capable systems will also need to include a standard Linux-USB
 host side stack,
 with <emphasis>usbcore</emphasis>,
 one or more <emphasis>Host Controller Drivers</emphasis> (HCDs),
 <emphasis>USB Device Drivers</emphasis> to support
 the OTG "Targeted Peripheral List",
 and so forth.
 There will also be an <emphasis>OTG Controller Driver</emphasis>,
 which is visible to gadget and device driver developers only indirectly.
 That helps the host and device side USB controllers implement the
 two new OTG protocols (HNP and SRP).
 Roles switch (host to peripheral, or vice versa) using HNP
 during USB suspend processing, and SRP can be viewed as a
 more battery-friendly kind of device wakeup protocol.
 </para>
 <para>Over time, reusable utilities are evolving to help make some
 gadget driver tasks simpler.
 For example, building configuration descriptors from vectors of
 descriptors for the configuration's interfaces and endpoints is
 now automated, and many drivers now use autoconfiguration to
 choose hardware endpoints and initialize their descriptors.
 A potential example of particular interest
 is code implementing standard USB-IF protocols for
 HID, networking, storage, or audio classes.
 Some developers are interested in KDB or KGDB hooks, to let
 target hardware be remotely debugged.
 Most such USB protocol code doesn't need to be hardware-specific,
 any more than network protocols like X11, HTTP, or NFS are.
 Such gadget-side interface drivers should eventually be combined,
 to implement composite devices.
 </para>
 </chapter>
 <chapter id="api"><title>Kernel Mode Gadget API</title>
 <para>Gadget drivers declare themselves through a
 <emphasis>struct usb_gadget_driver</emphasis>, which is responsible for
 most parts of enumeration for a <emphasis>struct usb_gadget</emphasis>.
 The response to a set_configuration usually involves
 enabling one or more of the <emphasis>struct usb_ep</emphasis> objects
 exposed by the gadget, and submitting one or more
 <emphasis>struct usb_request</emphasis> buffers to transfer data.
 Understand those four data types, and their operations, and
 you will understand how this API works.
 </para> 
 <note><title>Incomplete Data Type Descriptions</title>
 <para>This documentation was prepared using the standard Linux
 kernel <filename>docproc</filename> tool, which turns text
 and in-code comments into SGML DocBook and then into usable
 formats such as HTML or PDF.
 Other than the "Chapter 9" data types, most of the significant
 data types and functions are described here.
 </para>
 <para>However, docproc does not understand all the C constructs
 that are used, so some relevant information is likely omitted from
 what you are reading.  
 One example of such information is endpoint autoconfiguration.
 You'll have to read the header file, and use example source
 code (such as that for "Gadget Zero"), to fully understand the API.
 </para>
 <para>The part of the API implementing some basic
 driver capabilities is specific to the version of the
 Linux kernel that's in use.
 The 2.6 kernel includes a <emphasis>driver model</emphasis>
 framework that has no analogue on earlier kernels;
 so those parts of the gadget API are not fully portable.
 (They are implemented on 2.4 kernels, but in a different way.)
 The driver model state is another part of this API that is
 ignored by the kerneldoc tools.
 </para>
 </note>
 <para>The core API does not expose
 every possible hardware feature, only the most widely available ones.
 There are significant hardware features, such as device-to-device DMA
 (without temporary storage in a memory buffer)
 that would be added using hardware-specific APIs.
 </para>
 <para>This API allows drivers to use conditional compilation to handle
 endpoint capabilities of different hardware, but doesn't require that.
 Hardware tends to have arbitrary restrictions, relating to
 transfer types, addressing, packet sizes, buffering, and availability.
 As a rule, such differences only matter for "endpoint zero" logic
 that handles device configuration and management.
 The API supports limited run-time
 detection of capabilities, through naming conventions for endpoints.
 Many drivers will be able to at least partially autoconfigure
 themselves.
 In particular, driver init sections will often have endpoint
 autoconfiguration logic that scans the hardware's list of endpoints
 to find ones matching the driver requirements
 (relying on those conventions), to eliminate some of the most
 common reasons for conditional compilation.
 </para>
 <para>Like the Linux-USB host side API, this API exposes
 the "chunky" nature of USB messages:  I/O requests are in terms
 of one or more "packets", and packet boundaries are visible to drivers.
 Compared to RS-232 serial protocols, USB resembles
 synchronous protocols like HDLC
 (N bytes per frame, multipoint addressing, host as the primary
 station and devices as secondary stations)
 more than asynchronous ones
 (tty style:  8 data bits per frame, no parity, one stop bit).
 So for example the controller drivers won't buffer
 two single byte writes into a single two-byte USB IN packet,
 although gadget drivers may do so when they implement
 protocols where packet boundaries (and "short packets")
 are not significant.
 </para>
 <sect1 id="lifecycle"><title>Driver Life Cycle</title>
 <para>Gadget drivers make endpoint I/O requests to hardware without
 needing to know many details of the hardware, but driver
 setup/configuration code needs to handle some differences.
 Use the API like this:
 </para>
 <orderedlist numeration='arabic'>
 <listitem><para>Register a driver for the particular device side
 usb controller hardware,
 such as the net2280 on PCI (USB 2.0),
 sa11x0 or pxa25x as found in Linux PDAs,
 and so on.
 At this point the device is logically in the USB ch9 initial state
 ("attached"), drawing no power and not usable
 (since it does not yet support enumeration).
 No host should see the device, since it has not
 activated the data line pullup used by the host to
 detect a device, even if VBUS power is available.
 </para></listitem>
 <listitem><para>Register a gadget driver that implements some higher level
 device function.  That will then bind() to a usb_gadget, which
 activates the data line pullup sometime after detecting VBUS.
 </para></listitem>
 <listitem><para>The hardware driver can now start enumerating.
 The steps it handles are to accept USB power and set_address requests.
 Other steps are handled by the gadget driver.
 If the gadget driver module is unloaded before the host starts to
 enumerate, steps before step 7 are skipped.
 </para></listitem>
 <listitem><para>The gadget driver's setup() call returns usb descriptors,
 based both on what the bus interface hardware provides and on the
 functionality being implemented.
 That can involve alternate settings or configurations,
 unless the hardware prevents such operation.
 For OTG devices, each configuration descriptor includes
 an OTG descriptor.
 </para></listitem>
 <listitem><para>The gadget driver handles the last step of enumeration,
 when the USB host issues a set_configuration call.
 It enables all endpoints used in that configuration,
 with all interfaces in their default settings.
 That involves using a list of the hardware's endpoints, enabling each
 endpoint according to its descriptor.
 It may also involve using <function>usb_gadget_vbus_draw</function>
 to let more power be drawn from VBUS, as allowed by that configuration.
 For OTG devices, setting a configuration may also involve reporting
 HNP capabilities through a user interface.
 </para></listitem>
 <listitem><para>Do real work and perform data transfers, possibly involving
 changes to interface settings or switching to new configurations, until the
 device is disconnect()ed from the host.
 Queue any number of transfer requests to each endpoint.
 It may be suspended and resumed several times before being disconnected.
 On disconnect, the drivers go back to step 3 (above).
 </para></listitem>
 <listitem><para>When the gadget driver module is being unloaded,
 the driver unbind() callback is issued.  That lets the controller
 driver be unloaded.
 </para></listitem>
 </orderedlist>
 <para>Drivers will normally be arranged so that just loading the
 gadget driver module (or statically linking it into a Linux kernel)
 allows the peripheral device to be enumerated, but some drivers
 will defer enumeration until some higher level component (like
 a user mode daemon) enables it.
 Note that at this lowest level there are no policies about how
 ep0 configuration logic is implemented,
 except that it should obey USB specifications.
 Such issues are in the domain of gadget drivers,
 including knowing about implementation constraints
 imposed by some USB controllers
 or understanding that composite devices might happen to
 be built by integrating reusable components.
 </para>
 <para>Note that the lifecycle above can be slightly different
 for OTG devices.
 Other than providing an additional OTG descriptor in each
 configuration, only the HNP-related differences are particularly
 visible to driver code.
 They involve reporting requirements during the SET_CONFIGURATION
 request, and the option to invoke HNP during some suspend callbacks.
 Also, SRP changes the semantics of
 <function>usb_gadget_wakeup</function>
 slightly.
 </para>
 </sect1>
 <sect1 id="ch9"><title>USB 2.0 Chapter 9 Types and Constants</title>
 <para>Gadget drivers
 rely on common USB structures and constants
 defined in the
 <filename>&lt;linux/usb/ch9.h&gt;</filename>
 header file, which is standard in Linux 2.6 kernels.
 These are the same types and constants used by host
 side drivers (and usbcore).
 </para>
 !Iinclude/linux/usb/ch9.h
 </sect1>
 <sect1 id="core"><title>Core Objects and Methods</title>
 <para>These are declared in
 <filename>&lt;linux/usb/gadget.h&gt;</filename>,
 and are used by gadget drivers to interact with
 USB peripheral controller drivers.
 </para>
 	<!-- yeech, this is ugly in nsgmls PDF output.
 	     the PDF bookmark and refentry output nesting is wrong,
 	     and the member/argument documentation indents ugly.
 	     plus something (docproc?) adds whitespace before the
 	     descriptive paragraph text, so it can't line up right
 	     unless the explanations are trivial.
 	  -->
 !Iinclude/linux/usb/gadget.h
 </sect1>
 <sect1 id="utils"><title>Optional Utilities</title>
 <para>The core API is sufficient for writing a USB Gadget Driver,
 but some optional utilities are provided to simplify common tasks.
 These utilities include endpoint autoconfiguration.
 </para>
 !Edrivers/usb/gadget/usbstring.c
 !Edrivers/usb/gadget/config.c
 <!-- !Edrivers/usb/gadget/epautoconf.c -->
 </sect1>
 <sect1 id="composite"><title>Composite Device Framework</title>
 <para>The core API is sufficient for writing drivers for composite
 USB devices (with more than one function in a given configuration),
 and also multi-configuration devices (also more than one function,
 but not necessarily sharing a given configuration).
 There is however an optional framework which makes it easier to
 reuse and combine functions.
 </para>
 <para>Devices using this framework provide a <emphasis>struct
 usb_composite_driver</emphasis>, which in turn provides one or
 more <emphasis>struct usb_configuration</emphasis> instances.
 Each such configuration includes at least one
 <emphasis>struct usb_function</emphasis>, which packages a user
 visible role such as "network link" or "mass storage device".
 Management functions may also exist, such as "Device Firmware
 Upgrade".
 </para>
 !Iinclude/linux/usb/composite.h
 !Edrivers/usb/gadget/composite.c
 </sect1>
 <sect1 id="functions"><title>Composite Device Functions</title>
 <para>At this writing, a few of the current gadget drivers have
 been converted to this framework.
 Near-term plans include converting all of them, except for "gadgetfs".
 </para>
 !Edrivers/usb/gadget/function/f_acm.c
 !Edrivers/usb/gadget/function/f_ecm.c
 !Edrivers/usb/gadget/function/f_subset.c
 !Edrivers/usb/gadget/function/f_obex.c
 !Edrivers/usb/gadget/function/f_serial.c
 </sect1>
 </chapter>
 <chapter id="controllers"><title>Peripheral Controller Drivers</title>
 <para>The first hardware supporting this API was the NetChip 2280
 controller, which supports USB 2.0 high speed and is based on PCI.
 This is the <filename>net2280</filename> driver module.
 The driver supports Linux kernel versions 2.4 and 2.6;
 contact NetChip Technologies for development boards and product
 information.
 </para> 
 <para>Other hardware working in the "gadget" framework includes:
 Intel's PXA 25x and IXP42x series processors
 (<filename>pxa2xx_udc</filename>),
 Toshiba TC86c001 "Goku-S" (<filename>goku_udc</filename>),
 Renesas SH7705/7727 (<filename>sh_udc</filename>),
 MediaQ 11xx (<filename>mq11xx_udc</filename>),
 Hynix HMS30C7202 (<filename>h7202_udc</filename>),
 National 9303/4 (<filename>n9604_udc</filename>),
 Texas Instruments OMAP (<filename>omap_udc</filename>),
 Sharp LH7A40x (<filename>lh7a40x_udc</filename>),
 and more.
 Most of those are full speed controllers.
 </para>
 <para>At this writing, there are people at work on drivers in
 this framework for several other USB device controllers,
 with plans to make many of them be widely available.
 </para>
 <!-- !Edrivers/usb/gadget/net2280.c -->
 <para>A partial USB simulator,
 the <filename>dummy_hcd</filename> driver, is available.
 It can act like a net2280, a pxa25x, or an sa11x0 in terms
 of available endpoints and device speeds; and it simulates
 control, bulk, and to some extent interrupt transfers.
 That lets you develop some parts of a gadget driver on a normal PC,
 without any special hardware, and perhaps with the assistance
 of tools such as GDB running with User Mode Linux.
 At least one person has expressed interest in adapting that
 approach, hooking it up to a simulator for a microcontroller.
 Such simulators can help debug subsystems where the runtime hardware
 is unfriendly to software development, or is not yet available.
 </para>
 <para>Support for other controllers is expected to be developed
 and contributed
 over time, as this driver framework evolves.
 </para>
 </chapter>
 <chapter id="gadget"><title>Gadget Drivers</title>
 <para>In addition to <emphasis>Gadget Zero</emphasis>
 (used primarily for testing and development with drivers
 for usb controller hardware), other gadget drivers exist.
 </para>
 <para>There's an <emphasis>ethernet</emphasis> gadget
 driver, which implements one of the most useful
 <emphasis>Communications Device Class</emphasis> (CDC) models.  
 One of the standards for cable modem interoperability even
 specifies the use of this ethernet model as one of two
 mandatory options.
 Gadgets using this code look to a USB host as if they're
 an Ethernet adapter.
 It provides access to a network where the gadget's CPU is one host,
 which could easily be bridging, routing, or firewalling
 access to other networks.
 Since some hardware can't fully implement the CDC Ethernet
 requirements, this driver also implements a "good parts only"
 subset of CDC Ethernet.
 (That subset doesn't advertise itself as CDC Ethernet,
 to avoid creating problems.)
 </para>
 <para>Support for Microsoft's <emphasis>RNDIS</emphasis>
 protocol has been contributed by Pengutronix and Auerswald GmbH.
 This is like CDC Ethernet, but it runs on slightly more USB hardware
 (but less than the CDC subset).
 However, its main claim to fame is being able to connect directly to
 recent versions of Windows, using drivers that Microsoft bundles
 and supports, making it much simpler to network with Windows.
 </para>
 <para>There is also support for user mode gadget drivers,
 using <emphasis>gadgetfs</emphasis>.
 This provides a <emphasis>User Mode API</emphasis> that presents
 each endpoint as a single file descriptor.  I/O is done using
 normal <emphasis>read()</emphasis> and <emphasis>write()</emphasis> calls.
 Familiar tools like GDB and pthreads can be used to
 develop and debug user mode drivers, so that once a robust
 controller driver is available many applications for it
 won't require new kernel mode software.
 Linux 2.6 <emphasis>Async I/O (AIO)</emphasis>
 support is available, so that user mode software
 can stream data with only slightly more overhead
 than a kernel driver.
 </para>
 <para>There's a USB Mass Storage class driver, which provides
 a different solution for interoperability with systems such
 as MS-Windows and MacOS.
 That <emphasis>Mass Storage</emphasis> driver uses a
 file or block device as backing store for a drive,
 like the <filename>loop</filename> driver.
 The USB host uses the BBB, CB, or CBI versions of the mass
 storage class specification, using transparent SCSI commands
 to access the data from the backing store.
 </para>
 <para>There's a "serial line" driver, useful for TTY style
 operation over USB.
 The latest version of that driver supports CDC ACM style
 operation, like a USB modem, and so on most hardware it can
 interoperate easily with MS-Windows.
 One interesting use of that driver is in boot firmware (like a BIOS),
 which can sometimes use that model with very small systems without
 real serial lines.
 </para>
 <para>Support for other kinds of gadget is expected to
 be developed and contributed
 over time, as this driver framework evolves.
 </para>
 </chapter>
 <chapter id="otg"><title>USB On-The-Go (OTG)</title>
 <para>USB OTG support on Linux 2.6 was initially developed
 by Texas Instruments for
 <ulink url="http://www.omap.com">OMAP</ulink> 16xx and 17xx
 series processors.
 Other OTG systems should work in similar ways, but the
 hardware level details could be very different.
 </para> 
 <para>Systems need specialized hardware support to implement OTG,
 notably including a special <emphasis>Mini-AB</emphasis> jack
 and associated transceiver to support <emphasis>Dual-Role</emphasis>
 operation:
 they can act either as a host, using the standard
 Linux-USB host side driver stack,
 or as a peripheral, using this "gadget" framework.
 To do that, the system software relies on small additions
 to those programming interfaces,
 and on a new internal component (here called an "OTG Controller")
 affecting which driver stack connects to the OTG port.
 In each role, the system can re-use the existing pool of
 hardware-neutral drivers, layered on top of the controller
 driver interfaces (<emphasis>usb_bus</emphasis> or
 <emphasis>usb_gadget</emphasis>).
 Such drivers need at most minor changes, and most of the calls
 added to support OTG can also benefit non-OTG products.
 </para>
 <itemizedlist>
    <listitem><para>Gadget drivers test the <emphasis>is_otg</emphasis>
 	flag, and use it to determine whether or not to include
 	an OTG descriptor in each of their configurations.
 	</para></listitem>
    <listitem><para>Gadget drivers may need changes to support the
 	two new OTG protocols, exposed in new gadget attributes
 	such as <emphasis>b_hnp_enable</emphasis> flag.
 	HNP support should be reported through a user interface
 	(two LEDs could suffice), and is triggered in some cases
 	when the host suspends the peripheral.
 	SRP support can be user-initiated just like remote wakeup,
 	probably by pressing the same button.
 	</para></listitem>
    <listitem><para>On the host side, USB device drivers need
 	to be taught to trigger HNP at appropriate moments, using
 	<function>usb_suspend_device()</function>.
 	That also conserves battery power, which is useful even
 	for non-OTG configurations.
 	</para></listitem>
    <listitem><para>Also on the host side, a driver must support the
 	OTG "Targeted Peripheral List".  That's just a whitelist,
 	used to reject peripherals not supported with a given
 	Linux OTG host.
 	<emphasis>This whitelist is product-specific;
 	each product must modify <filename>otg_whitelist.h</filename>
 	to match its interoperability specification.
 	</emphasis>
 	</para>
 	<para>Non-OTG Linux hosts, like PCs and workstations,
 	normally have some solution for adding drivers, so that
 	peripherals that aren't recognized can eventually be supported.
 	That approach is unreasonable for consumer products that may
 	never have their firmware upgraded, and where it's usually
 	unrealistic to expect traditional PC/workstation/server kinds
 	of support model to work.
 	For example, it's often impractical to change device firmware
 	once the product has been distributed, so driver bugs can't
 	normally be fixed if they're found after shipment.
 	</para></listitem>
 </itemizedlist>
 <para>
 Additional changes are needed below those hardware-neutral
 <emphasis>usb_bus</emphasis> and <emphasis>usb_gadget</emphasis>
 driver interfaces; those aren't discussed here in any detail.
 Those affect the hardware-specific code for each USB Host or Peripheral
 controller, and how the HCD initializes (since OTG can be active only
 on a single port).
 They also involve what may be called an <emphasis>OTG Controller
 Driver</emphasis>, managing the OTG transceiver and the OTG state
 machine logic as well as much of the root hub behavior for the
 OTG port.
 The OTG controller driver needs to activate and deactivate USB
 controllers depending on the relevant device role.
 Some related changes were needed inside usbcore, so that it
 can identify OTG-capable devices and respond appropriately
 to HNP or SRP protocols.
 </para> 
 </chapter>
 </book>
 <!--
 	vim:syntax=sgml:sw=4
 -->
--- a/Documentation/DocBook/genericirq.tmpl
+++ b/Documentation/DocBook/genericirq.tmpl
@@ -1,520 +0,0 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
 	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
 <book id="Generic-IRQ-Guide">
 <bookinfo>
  <title>Linux generic IRQ handling</title>
  <authorgroup>
   <author>
    <firstname>Thomas</firstname>
    <surname>Gleixner</surname>
    <affiliation>
     <address>
      <email>tglx@linutronix.de</email>
     </address>
    </affiliation>
   </author>
   <author>
    <firstname>Ingo</firstname>
    <surname>Molnar</surname>
    <affiliation>
     <address>
      <email>mingo@elte.hu</email>
     </address>
    </affiliation>
   </author>
  </authorgroup>
  <copyright>
   <year>2005-2010</year>
   <holder>Thomas Gleixner</holder>
  </copyright>
  <copyright>
   <year>2005-2006</year>
   <holder>Ingo Molnar</holder>
  </copyright>
  <legalnotice>
   <para>
     This documentation is free software; you can redistribute
     it and/or modify it under the terms of the GNU General Public
     License version 2 as published by the Free Software Foundation.
   </para>
   <para>
     This program is distributed in the hope that it will be
     useful, but WITHOUT ANY WARRANTY; without even the implied
     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
     See the GNU General Public License for more details.
   </para>
   <para>
     You should have received a copy of the GNU General Public
     License along with this program; if not, write to the Free
     Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
     MA 02111-1307 USA
   </para>
   <para>
     For more details see the file COPYING in the source
     distribution of Linux.
   </para>
  </legalnotice>
 </bookinfo>
 <toc></toc>
  <chapter id="intro">
    <title>Introduction</title>
    <para>
 	The generic interrupt handling layer is designed to provide a
 	complete abstraction of interrupt handling for device drivers.
 	It is able to handle all the different types of interrupt controller
 	hardware. Device drivers use generic API functions to request, enable,
 	disable and free interrupts. The drivers do not have to know anything
 	about interrupt hardware details, so they can be used on different
 	platforms without code changes.
    </para>
    <para>
  	This documentation is provided to developers who want to implement
 	an interrupt subsystem for their architecture, with the help
 	of the generic IRQ handling layer.
    </para>
  </chapter>
  <chapter id="rationale">
    <title>Rationale</title>
 	<para>
 	The original implementation of interrupt handling in Linux uses
 	the __do_IRQ() super-handler, which is able to deal with every
 	type of interrupt logic.
 	</para>
 	<para>
 	Originally, Russell King identified different types of handlers to
 	build a quite universal set for the ARM interrupt handler
 	implementation in Linux 2.5/2.6. He distinguished between:
 	<itemizedlist>
 	  <listitem><para>Level type</para></listitem>
 	  <listitem><para>Edge type</para></listitem>
 	  <listitem><para>Simple type</para></listitem>
 	</itemizedlist>
 	During the implementation we identified another type:
 	<itemizedlist>
 	  <listitem><para>Fast EOI type</para></listitem>
 	</itemizedlist>
 	In the SMP world of the __do_IRQ() super-handler another type
 	was identified:
 	<itemizedlist>
 	  <listitem><para>Per CPU type</para></listitem>
 	</itemizedlist>
 	</para>
 	<para>
 	This split implementation of high-level IRQ handlers allows us to
 	optimize the flow of the interrupt handling for each specific
 	interrupt type. This reduces complexity in that particular code path
 	and allows the optimized handling of a given type.
 	</para>
 	<para>
 	The original general IRQ implementation used hw_interrupt_type
 	structures and their ->ack(), ->end() [etc.] callbacks to
 	differentiate the flow control in the super-handler. This leads to
 	a mix of flow logic and low-level hardware logic, and it also leads
 	to unnecessary code duplication: for example in i386, there is an
 	ioapic_level_irq and an ioapic_edge_irq IRQ-type which share many
 	of the low-level details but have different flow handling.
 	</para>
 	<para>
 	A more natural abstraction is the clean separation of the
 	'irq flow' and the 'chip details'.
 	</para>
 	<para>
 	Analysing a couple of architectures' IRQ subsystem implementations
 	reveals that most of them can use a generic set of 'irq flow'
 	methods and only need to add the chip-level specific code.
 	The separation is also valuable for (sub)architectures
 	which need specific quirks in the IRQ flow itself but not in the
 	chip details - and thus provides a more transparent IRQ subsystem
 	design.
 	</para>
 	<para>
 	Each interrupt descriptor is assigned its own high-level flow
 	handler, which is normally one of the generic
 	implementations. (This high-level flow handler implementation also
 	makes it simple to provide demultiplexing handlers which can be
 	found in embedded platforms on various architectures.)
 	</para>
 	<para>
 	The separation makes the generic interrupt handling layer more
 	flexible and extensible. For example, a (sub)architecture can
 	use a generic IRQ-flow implementation for 'level type' interrupts
 	and add a (sub)architecture specific 'edge type' implementation.
 	</para>
 	<para>
 	To make the transition to the new model easier and prevent the
 	breakage of existing implementations, the __do_IRQ() super-handler
 	is still available. This leads to a kind of duality for the time
 	being. Over time the new model should be used in more and more
 	architectures, as it enables smaller and cleaner IRQ subsystems.
 	The __do_IRQ() super-handler has been deprecated for three years now and is about to be removed.
 	</para>
  </chapter>
  <chapter id="bugs">
    <title>Known Bugs And Assumptions</title>
    <para>
 	None (knock on wood).
    </para>
  </chapter>
  <chapter id="Abstraction">
    <title>Abstraction layers</title>
    <para>
 	There are three main levels of abstraction in the interrupt code:
 	<orderedlist>
 	  <listitem><para>High-level driver API</para></listitem>
 	  <listitem><para>High-level IRQ flow handlers</para></listitem>
 	  <listitem><para>Chip-level hardware encapsulation</para></listitem>
 	</orderedlist>
    </para>
    <sect1 id="Interrupt_control_flow">
 	<title>Interrupt control flow</title>
 	<para>
 	Each interrupt is described by an interrupt descriptor structure
 	irq_desc. The interrupt is referenced by an 'unsigned int' numeric
 	value which selects the corresponding interrupt description structure
 	in the descriptor structures array.
 	The descriptor structure contains status information and pointers
 	to the interrupt flow method and the interrupt chip structure
 	which are assigned to this interrupt.
 	</para>
 	<para>
 	Whenever an interrupt triggers, the low-level architecture code calls
 	into the generic interrupt code by calling desc->handle_irq().
 	This high-level IRQ handling function only uses desc->irq_data.chip
 	primitives referenced by the assigned chip descriptor structure.
 	</para>
    </sect1>
    <sect1 id="Highlevel_Driver_API">
 	<title>High-level Driver API</title>
 	<para>
 	  The high-level Driver API consists of the following functions:
 	  <itemizedlist>
 	  <listitem><para>request_irq()</para></listitem>
 	  <listitem><para>free_irq()</para></listitem>
 	  <listitem><para>disable_irq()</para></listitem>
 	  <listitem><para>enable_irq()</para></listitem>
 	  <listitem><para>disable_irq_nosync() (SMP only)</para></listitem>
 	  <listitem><para>synchronize_irq() (SMP only)</para></listitem>
 	  <listitem><para>irq_set_irq_type()</para></listitem>
 	  <listitem><para>irq_set_irq_wake()</para></listitem>
 	  <listitem><para>irq_set_handler_data()</para></listitem>
 	  <listitem><para>irq_set_chip()</para></listitem>
 	  <listitem><para>irq_set_chip_data()</para></listitem>
          </itemizedlist>
 	  See the autogenerated function documentation for details.
 	</para>
    </sect1>
    <sect1 id="Highlevel_IRQ_flow_handlers">
 	<title>High-level IRQ flow handlers</title>
 	<para>
 	  The generic layer provides a set of pre-defined irq-flow methods:
 	  <itemizedlist>
 	  <listitem><para>handle_level_irq</para></listitem>
 	  <listitem><para>handle_edge_irq</para></listitem>
 	  <listitem><para>handle_fasteoi_irq</para></listitem>
 	  <listitem><para>handle_simple_irq</para></listitem>
 	  <listitem><para>handle_percpu_irq</para></listitem>
 	  <listitem><para>handle_edge_eoi_irq</para></listitem>
 	  <listitem><para>handle_bad_irq</para></listitem>
 	  </itemizedlist>
 	  The interrupt flow handlers (either pre-defined or architecture
 	  specific) are assigned to specific interrupts by the architecture
 	  either during bootup or during device initialization.
 	</para>
 	<sect2 id="Default_flow_implementations">
 	<title>Default flow implementations</title>
 	    <sect3 id="Helper_functions">
 	 	<title>Helper functions</title>
 		<para>
 		The helper functions call the chip primitives and
 		are used by the default flow implementations.
 		The following helper functions are implemented (simplified excerpt):
 		<programlisting>
 default_enable(struct irq_data *data)
 {
 	desc->irq_data.chip->irq_unmask(data);
 }
 default_disable(struct irq_data *data)
 {
 	if (!delay_disable(data))
 		desc->irq_data.chip->irq_mask(data);
 }
 default_ack(struct irq_data *data)
 {
 	chip->irq_ack(data);
 }
 default_mask_ack(struct irq_data *data)
 {
 	if (chip->irq_mask_ack) {
 		chip->irq_mask_ack(data);
 	} else {
 		chip->irq_mask(data);
 		chip->irq_ack(data);
 	}
 }
 noop(struct irq_data *data)
 {
 }
 		</programlisting>
 	        </para>
 	    </sect3>
 	</sect2>
 	<sect2 id="Default_flow_handler_implementations">
 	<title>Default flow handler implementations</title>
 	    <sect3 id="Default_Level_IRQ_flow_handler">
 	 	<title>Default Level IRQ flow handler</title>
 		<para>
 		handle_level_irq provides a generic implementation
 		for level-triggered interrupts.
 		</para>
 		<para>
 		The following control flow is implemented (simplified excerpt):
 		<programlisting>
 desc->irq_data.chip->irq_mask_ack();
 handle_irq_event(desc->action);
 desc->irq_data.chip->irq_unmask();
 		</programlisting>
 		</para>
 	    </sect3>
 	    <sect3 id="Default_FASTEOI_IRQ_flow_handler">
 		<title>Default Fast EOI IRQ flow handler</title>
 		<para>
 		handle_fasteoi_irq provides a generic implementation
 		for interrupts, which only need an EOI at the end of
 		the handler.
 		</para>
 		<para>
 		The following control flow is implemented (simplified excerpt):
 		<programlisting>
 handle_irq_event(desc->action);
 desc->irq_data.chip->irq_eoi();
 		</programlisting>
 		</para>
 	    </sect3>
 	    <sect3 id="Default_Edge_IRQ_flow_handler">
 	 	<title>Default Edge IRQ flow handler</title>
 		<para>
 		handle_edge_irq provides a generic implementation
 		for edge-triggered interrupts.
 		</para>
 		<para>
 		The following control flow is implemented (simplified excerpt):
 		<programlisting>
 if (desc->status &amp; running) {
 	desc->irq_data.chip->irq_mask_ack();
 	desc->status |= pending | masked;
 	return;
 }
 desc->irq_data.chip->irq_ack();
 desc->status |= running;
 do {
 	if (desc->status &amp; masked)
 		desc->irq_data.chip->irq_unmask();
 	desc->status &amp;= ~pending;
 	handle_irq_event(desc->action);
 } while (desc->status &amp; pending);
 desc->status &amp;= ~running;
 		</programlisting>
 		</para>
   	    </sect3>
 	    <sect3 id="Default_simple_IRQ_flow_handler">
 	 	<title>Default simple IRQ flow handler</title>
 		<para>
 		handle_simple_irq provides a generic implementation
 		for simple interrupts.
 		</para>
 		<para>
 		Note: The simple flow handler does not call any
 		handler/chip primitives.
 		</para>
 		<para>
 		The following control flow is implemented (simplified excerpt):
 		<programlisting>
 handle_irq_event(desc->action);
 		</programlisting>
 		</para>
   	    </sect3>
 	    <sect3 id="Default_per_CPU_flow_handler">
 	 	<title>Default per CPU flow handler</title>
 		<para>
 		handle_percpu_irq provides a generic implementation
 		for per CPU interrupts.
 		</para>
 		<para>
 		Per CPU interrupts are only available on SMP and
 		the handler provides a simplified version without
 		locking.
 		</para>
 		<para>
 		The following control flow is implemented (simplified excerpt):
 		<programlisting>
 if (desc->irq_data.chip->irq_ack)
 	desc->irq_data.chip->irq_ack();
 handle_irq_event(desc->action);
 if (desc->irq_data.chip->irq_eoi)
        desc->irq_data.chip->irq_eoi();
 		</programlisting>
 		</para>
   	    </sect3>
 	    <sect3 id="EOI_Edge_IRQ_flow_handler">
 	 	<title>EOI Edge IRQ flow handler</title>
 		<para>
 		handle_edge_eoi_irq provides an abomination of the edge
 		handler which is solely used to tame a badly wrecked
 		irq controller on powerpc/cell.
 		</para>
   	    </sect3>
 	    <sect3 id="BAD_IRQ_flow_handler">
 	 	<title>Bad IRQ flow handler</title>
 		<para>
 		handle_bad_irq is used for spurious interrupts which
 		have no real handler assigned.
 		</para>
   	    </sect3>
 	</sect2>
 	<sect2 id="Quirks_and_optimizations">
 	<title>Quirks and optimizations</title>
 	<para>
 	The generic functions are intended for 'clean' architectures and chips,
 	which have no platform-specific IRQ handling quirks. If an architecture
 	needs to implement quirks on the 'flow' level then it can do so by
 	overriding the high-level irq-flow handler.
 	</para>
 	</sect2>
 	<sect2 id="Delayed_interrupt_disable">
 	<title>Delayed interrupt disable</title>
 	<para>
 	This per interrupt selectable feature, which was introduced by Russell
 	King in the ARM interrupt implementation, does not mask an interrupt
 	at the hardware level when disable_irq() is called. The interrupt is
 	kept enabled and is masked in the flow handler when an interrupt event
 	happens. This prevents losing edge interrupts on hardware which does
 	not store an edge interrupt event while the interrupt is disabled at
 	the hardware level. When an interrupt arrives while the IRQ_DISABLED
 	flag is set, then the interrupt is masked at the hardware level and
 	the IRQ_PENDING bit is set. When the interrupt is re-enabled by
 	enable_irq() the pending bit is checked and if it is set, the
 	interrupt is resent either via hardware or by a software resend
 	mechanism. (It's necessary to enable CONFIG_HARDIRQS_SW_RESEND when
 	you want to use the delayed interrupt disable feature and your
 	hardware is not capable of retriggering an interrupt.)
 	The delayed interrupt disable is not configurable.
 	</para>
 	</sect2>
    </sect1>
    <sect1 id="Chiplevel_hardware_encapsulation">
 	<title>Chip-level hardware encapsulation</title>
 	<para>
 	The chip-level hardware descriptor structure irq_chip
 	contains all the direct chip relevant functions, which
 	can be utilized by the irq flow implementations.
 	  <itemizedlist>
 	  <listitem><para>irq_ack()</para></listitem>
 	  <listitem><para>irq_mask_ack() - Optional, recommended for performance</para></listitem>
 	  <listitem><para>irq_mask()</para></listitem>
 	  <listitem><para>irq_unmask()</para></listitem>
 	  <listitem><para>irq_eoi() - Optional, required for EOI flow handlers</para></listitem>
 	  <listitem><para>irq_retrigger() - Optional</para></listitem>
 	  <listitem><para>irq_set_type() - Optional</para></listitem>
 	  <listitem><para>irq_set_wake() - Optional</para></listitem>
 	  </itemizedlist>
 	These primitives are strictly intended to mean what they say: ack means
 	ACK, masking means masking of an IRQ line, etc. It is up to the flow
 	handler(s) to use these basic units of low-level functionality.
 	</para>
    </sect1>
  </chapter>
  <chapter id="doirq">
     <title>__do_IRQ entry point</title>
     <para>
 	The original implementation __do_IRQ() was an alternative entry
 	point for all types of interrupts. It no longer exists.
     </para>
     <para>
 	This handler turned out not to be suitable for all
 	interrupt hardware and was therefore reimplemented with split
 	functionality for edge/level/simple/percpu interrupts. This is not
 	only a functional optimization. It also shortens code paths for
 	interrupts.
      </para>
  </chapter>
  <chapter id="locking">
     <title>Locking on SMP</title>
     <para>
 	The locking of chip registers is up to the architecture that
 	defines the chip primitives. The per-irq structure is
 	protected via desc->lock, by the generic layer.
     </para>
  </chapter>
  <chapter id="genericchip">
     <title>Generic interrupt chip</title>
     <para>
       To avoid copies of identical implementations of IRQ chips the
       core provides a configurable generic interrupt chip
       implementation. Developers should check carefully whether the
       generic chip fits their needs before implementing the same
       functionality slightly differently themselves.
     </para>
 !Ekernel/irq/generic-chip.c
  </chapter>
  <chapter id="structs">
     <title>Structures</title>
     <para>
     This chapter contains the autogenerated documentation of the structures which are
     used in the generic IRQ layer.
     </para>
 !Iinclude/linux/irq.h
 !Iinclude/linux/interrupt.h
  </chapter>
  <chapter id="pubfunctions">
     <title>Public Functions Provided</title>
     <para>
     This chapter contains the autogenerated documentation of the kernel API functions
      which are exported.
     </para>
 !Ekernel/irq/manage.c
 !Ekernel/irq/chip.c
  </chapter>
  <chapter id="intfunctions">
     <title>Internal Functions Provided</title>
     <para>
     This chapter contains the autogenerated documentation of the internal functions.
     </para>
 !Ikernel/irq/irqdesc.c
 !Ikernel/irq/handle.c
 !Ikernel/irq/chip.c
  </chapter>
  <chapter id="credits">
     <title>Credits</title>
 	<para>
 		The following people have contributed to this document:
 		<orderedlist>
 			<listitem><para>Thomas Gleixner <email>tglx@linutronix.de</email></para></listitem>
 			<listitem><para>Ingo Molnar <email>mingo@elte.hu</email></para></listitem>
 		</orderedlist>
 	</para>
  </chapter>
 </book>
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -1,331 +0,0 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
 	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
 <book id="LinuxKernelAPI">
 <bookinfo>
  <title>The Linux Kernel API</title>
  <legalnotice>
   <para>
     This documentation is free software; you can redistribute
     it and/or modify it under the terms of the GNU General Public
     License as published by the Free Software Foundation; either
     version 2 of the License, or (at your option) any later
     version.
   </para>
   <para>
     This program is distributed in the hope that it will be
     useful, but WITHOUT ANY WARRANTY; without even the implied
     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
     See the GNU General Public License for more details.
   </para>
   <para>
     You should have received a copy of the GNU General Public
     License along with this program; if not, write to the Free
     Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
     MA 02111-1307 USA
   </para>
   <para>
     For more details see the file COPYING in the source
     distribution of Linux.
   </para>
  </legalnotice>
 </bookinfo>
 <toc></toc>
  <chapter id="adt">
     <title>Data Types</title>
     <sect1><title>Doubly Linked Lists</title>
 !Iinclude/linux/list.h
     </sect1>
  </chapter>
  <chapter id="libc">
     <title>Basic C Library Functions</title>
     <para>
       When writing drivers, you cannot in general use routines which are
       from the C Library.  Some of the functions have been found generally
       useful and they are listed below.  The behaviour of these functions
       may vary slightly from those defined by ANSI, and these deviations
       are noted in the text.
     </para>
     <sect1><title>String Conversions</title>
 !Elib/vsprintf.c
 !Finclude/linux/kernel.h kstrtol
 !Finclude/linux/kernel.h kstrtoul
 !Elib/kstrtox.c
     </sect1>
     <sect1><title>String Manipulation</title>
 <!-- All functions are now exported
 X!Ilib/string.c
 -->
 !Elib/string.c
     </sect1>
     <sect1><title>Bit Operations</title>
 !Iarch/x86/include/asm/bitops.h
     </sect1>
  </chapter>
  <chapter id="kernel-lib">
     <title>Basic Kernel Library Functions</title>
     <para>
       The Linux kernel provides more basic utility functions.
     </para>
     <sect1><title>Bitmap Operations</title>
 !Elib/bitmap.c
 !Ilib/bitmap.c
     </sect1>
     <sect1><title>Command-line Parsing</title>
 !Elib/cmdline.c
     </sect1>
     <sect1 id="crc"><title>CRC Functions</title>
 !Elib/crc7.c
 !Elib/crc16.c
 !Elib/crc-itu-t.c
 !Elib/crc32.c
 !Elib/crc-ccitt.c
     </sect1>
     <sect1 id="idr"><title>idr/ida Functions</title>
 !Pinclude/linux/idr.h idr sync
 !Plib/idr.c IDA description
 !Elib/idr.c
     </sect1>
  </chapter>
  <chapter id="mm">
     <title>Memory Management in Linux</title>
     <sect1><title>The Slab Cache</title>
 !Iinclude/linux/slab.h
 !Emm/slab.c
 !Emm/util.c
     </sect1>
     <sect1><title>User Space Memory Access</title>
 !Iarch/x86/include/asm/uaccess_32.h
 !Earch/x86/lib/usercopy_32.c
     </sect1>
     <sect1><title>More Memory Management Functions</title>
 !Emm/readahead.c
 !Emm/filemap.c
 !Emm/memory.c
 !Emm/vmalloc.c
 !Imm/page_alloc.c
 !Emm/mempool.c
 !Emm/dmapool.c
 !Emm/page-writeback.c
 !Emm/truncate.c
     </sect1>
  </chapter>
  <chapter id="ipc">
     <title>Kernel IPC facilities</title>
     <sect1><title>IPC utilities</title>
 !Iipc/util.c
     </sect1>
  </chapter>
  <chapter id="kfifo">
     <title>FIFO Buffer</title>
     <sect1><title>kfifo interface</title>
 !Iinclude/linux/kfifo.h
     </sect1>
  </chapter>
  <chapter id="relayfs">
     <title>relay interface support</title>
     <para>
 	Relay interface support
 	is designed to provide an efficient mechanism for tools and
 	facilities to relay large amounts of data from kernel space to
 	user space.
     </para>
     <sect1><title>relay interface</title>
 !Ekernel/relay.c
 !Ikernel/relay.c
     </sect1>
  </chapter>
  <chapter id="modload">
     <title>Module Support</title>
     <sect1><title>Module Loading</title>
 !Ekernel/kmod.c
     </sect1>
     <sect1><title>Inter Module support</title>
        <para>
           Refer to the file kernel/module.c for more information.
        </para>
 <!-- FIXME: Removed for now since no structured comments in source
 X!Ekernel/module.c
 -->
     </sect1>
  </chapter>
  <chapter id="hardware">
     <title>Hardware Interfaces</title>
     <sect1><title>Interrupt Handling</title>
 !Ekernel/irq/manage.c
     </sect1>
     <sect1><title>DMA Channels</title>
 !Ekernel/dma.c
     </sect1>
     <sect1><title>Resources Management</title>
 !Ikernel/resource.c
 !Ekernel/resource.c
     </sect1>
     <sect1><title>MTRR Handling</title>
 !Earch/x86/kernel/cpu/mtrr/main.c
     </sect1>
     <sect1><title>PCI Support Library</title>
 !Edrivers/pci/pci.c
 !Edrivers/pci/pci-driver.c
 !Edrivers/pci/remove.c
 !Edrivers/pci/search.c
 !Edrivers/pci/msi.c
 !Edrivers/pci/bus.c
 !Edrivers/pci/access.c
 !Edrivers/pci/irq.c
 !Edrivers/pci/htirq.c
 <!-- FIXME: Removed for now since no structured comments in source
 X!Edrivers/pci/hotplug.c
 -->
 !Edrivers/pci/probe.c
 !Edrivers/pci/slot.c
 !Edrivers/pci/rom.c
 !Edrivers/pci/iov.c
 !Idrivers/pci/pci-sysfs.c
     </sect1>
     <sect1><title>PCI Hotplug Support Library</title>
 !Edrivers/pci/hotplug/pci_hotplug_core.c
     </sect1>
  </chapter>
  <chapter id="firmware">
     <title>Firmware Interfaces</title>
     <sect1><title>DMI Interfaces</title>
 !Edrivers/firmware/dmi_scan.c
     </sect1>
     <sect1><title>EDD Interfaces</title>
 !Idrivers/firmware/edd.c
     </sect1>
  </chapter>
  <chapter id="security">
     <title>Security Framework</title>
 !Isecurity/security.c
 !Esecurity/inode.c
  </chapter>
  <chapter id="audit">
     <title>Audit Interfaces</title>
 !Ekernel/audit.c
 !Ikernel/auditsc.c
 !Ikernel/auditfilter.c
  </chapter>
  <chapter id="accounting">
     <title>Accounting Framework</title>
 !Ikernel/acct.c
  </chapter>
  <chapter id="blkdev">
     <title>Block Devices</title>
 !Eblock/blk-core.c
 !Iblock/blk-core.c
 !Eblock/blk-map.c
 !Iblock/blk-sysfs.c
 !Eblock/blk-settings.c
 !Eblock/blk-exec.c
 !Eblock/blk-flush.c
 !Eblock/blk-lib.c
 !Eblock/blk-tag.c
 !Iblock/blk-tag.c
 !Eblock/blk-integrity.c
 !Ikernel/trace/blktrace.c
 !Iblock/genhd.c
 !Eblock/genhd.c
  </chapter>
  <chapter id="chrdev">
 	<title>Char devices</title>
 !Efs/char_dev.c
  </chapter>
  <chapter id="miscdev">
     <title>Miscellaneous Devices</title>
 !Edrivers/char/misc.c
  </chapter>
  <chapter id="clk">
     <title>Clock Framework</title>
     <para>
 	The clock framework defines programming interfaces to support
 	software management of the system clock tree.
 	This framework is widely used with System-On-Chip (SOC) platforms
 	to support power management and various devices which may need
 	custom clock rates.
 	Note that these "clocks" don't relate to timekeeping or real
 	time clocks (RTCs), each of which has a separate framework.
 	These <structname>struct clk</structname> instances may be used
 	to manage for example a 96 MHz signal that is used to shift bits
 	into and out of peripherals or busses, or otherwise trigger
 	synchronous state machine transitions in system hardware.
     </para>
     <para>
 	Power management is supported by explicit software clock gating:
 	unused clocks are disabled, so the system doesn't waste power
 	changing the state of transistors that aren't in active use.
 	On some systems this may be backed by hardware clock gating,
 	where clocks are gated without being disabled in software.
 	Sections of chips that are powered but not clocked may be able
 	to retain their last state.
 	This low power state is often called a <emphasis>retention
 	mode</emphasis>.
 	This mode still incurs leakage currents, especially with finer
 	circuit geometries, but for CMOS circuits power is mostly used
 	by clocked state changes.
     </para>
     <para>
 	Power-aware drivers only enable their clocks when the device
 	they manage is in active use.  Also, system sleep states often
 	differ according to which clock domains are active:  while a
 	"standby" state may allow wakeup from several active domains, a
 	"mem" (suspend-to-RAM) state may require a more wholesale shutdown
 	of clocks derived from higher speed PLLs and oscillators, limiting
 	the number of possible wakeup event sources.  A driver's suspend
 	method may need to be aware of system-specific clock constraints
 	on the target sleep state.
     </para>
     <para>
        Some platforms support programmable clock generators.  These
 	can be used by external chips of various kinds, such as other
 	CPUs, multimedia codecs, and devices with strict requirements
 	for interface clocking.
     </para>
 !Iinclude/linux/clk.h
  </chapter>
 </book>
--- a/Documentation/DocBook/rapidio.tmpl
+++ b/Documentation/DocBook/rapidio.tmpl
@@ -128,9 +128,6 @@
     </sect1>
     <sect1 id="Device_model_support"><title>Device model support</title>
 !Idrivers/rapidio/rio-driver.c
     </sect1>
     <sect1 id="Sysfs_support"><title>Sysfs support</title>
 !Idrivers/rapidio/rio-sysfs.c
     </sect1>
     <sect1 id="PPC32_support"><title>PPC32 support</title>
 !Iarch/powerpc/sysdev/fsl_rio.c
--- a/Documentation/DocBook/writing_musb_glue_layer.tmpl
+++ b/Documentation/DocBook/writing_musb_glue_layer.tmpl
@@ -1,873 +0,0 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
 	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
 <book id="Writing-MUSB-Glue-Layer">
 <bookinfo>
  <title>Writing an MUSB Glue Layer</title>
  <authorgroup>
   <author>
    <firstname>Apelete</firstname>
    <surname>Seketeli</surname>
    <affiliation>
     <address>
      <email>apelete at seketeli.net</email>
     </address>
    </affiliation>
   </author>
  </authorgroup>
  <copyright>
   <year>2014</year>
   <holder>Apelete Seketeli</holder>
  </copyright>
  <legalnotice>
   <para>
     This documentation is free software; you can redistribute it
     and/or modify it under the terms of the GNU General Public
     License as published by the Free Software Foundation; either
     version 2 of the License, or (at your option) any later version.
   </para>
   <para>
     This documentation is distributed in the hope that it will be
     useful, but WITHOUT ANY WARRANTY; without even the implied
     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
     See the GNU General Public License for more details.
   </para>
   <para>
     You should have received a copy of the GNU General Public License
     along with this documentation; if not, write to the Free Software
     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
     02111-1307 USA
   </para>
   <para>
     For more details see the file COPYING in the Linux kernel source
     tree.
   </para>
  </legalnotice>
 </bookinfo>
 <toc></toc>
  <chapter id="introduction">
    <title>Introduction</title>
    <para>
      The Linux MUSB subsystem is part of the larger Linux USB
      subsystem. It provides support for embedded USB Device Controllers
      (UDC) that do not use Universal Host Controller Interface (UHCI)
      or Open Host Controller Interface (OHCI).
    </para>
    <para>
      Instead, these embedded UDC rely on the USB On-the-Go (OTG)
      specification which they implement at least partially. The silicon
      reference design used in most cases is the Multipoint USB
      Highspeed Dual-Role Controller (MUSB HDRC) found in the Mentor
      Graphics Inventra™ design.
    </para>
    <para>
      As a self-taught exercise I have written an MUSB glue layer for
      the Ingenic JZ4740 SoC, modelled after the many MUSB glue layers
      in the kernel source tree. This layer can be found at
      drivers/usb/musb/jz4740.c. In this documentation I will walk
      through the basics of the jz4740.c glue layer, explaining the
      different pieces and what needs to be done in order to write your
      own device glue layer.
    </para>
  </chapter>
  <chapter id="linux-musb-basics">
    <title>Linux MUSB Basics</title>
    <para>
      To get started on the topic, please read USB On-the-Go Basics (see
      Resources) which provides an introduction to USB OTG operation at
      the hardware level. A couple of wiki pages by Texas Instruments
      and Analog Devices also provide an overview of the Linux kernel
      MUSB configuration, albeit focused on some specific devices
      provided by these companies. Finally, getting acquainted with the
      USB specification at USB home page may come in handy, with
      a practical instance provided through the Writing USB Device Drivers
      documentation (again, see Resources).
    </para>
    <para>
      The Linux USB stack is a layered architecture in which the MUSB
      controller hardware sits at the lowest level. The MUSB controller
      driver abstracts the MUSB controller hardware to the Linux USB stack.
    </para>
    <programlisting>
      ------------------------
      |                      | &lt;------- drivers/usb/gadget
      | Linux USB Core Stack | &lt;------- drivers/usb/host
      |                      | &lt;------- drivers/usb/core
      ------------------------
                 ⬍
     --------------------------
     |                        | &lt;------ drivers/usb/musb/musb_gadget.c
     | MUSB Controller driver | &lt;------ drivers/usb/musb/musb_host.c
     |                        | &lt;------ drivers/usb/musb/musb_core.c
     --------------------------
                 ⬍
  ---------------------------------
  | MUSB Platform Specific Driver |
  |                               | &lt;-- drivers/usb/musb/jz4740.c
  |       aka &quot;Glue Layer&quot;        |
  ---------------------------------
                 ⬍
  ---------------------------------
  |   MUSB Controller Hardware    |
  ---------------------------------
    </programlisting>
    <para>
      As outlined above, the glue layer is actually the platform
      specific code sitting in between the controller driver and the
      controller hardware.
    </para>
    <para>
      Just like a Linux USB driver needs to register itself with the
      Linux USB subsystem, the MUSB glue layer needs first to register
      itself with the MUSB controller driver. This will allow the
      controller driver to know about which device the glue layer
      supports and which functions to call when a supported device is
      detected or released; remember we are talking about an embedded
      controller chip here, so no insertion or removal at run-time.
    </para>
    <para>
      All of this information is passed to the MUSB controller driver
      through a platform_driver structure defined in the glue layer as:
    </para>
    <programlisting linenumbering="numbered">
 static struct platform_driver jz4740_driver = {
 	.probe		= jz4740_probe,
 	.remove		= jz4740_remove,
 	.driver		= {
 		.name	= "musb-jz4740",
 	},
 };
    </programlisting>
    <para>
      The probe and remove function pointers are called when a matching
      device is detected and, respectively, released. The name string
      describes the device supported by this glue layer. In the current
      case it matches a platform_device structure declared in
      arch/mips/jz4740/platform.c. Note that we are not using device
      tree bindings here.
    </para>
    <para>
      In order to register itself to the controller driver, the glue
      layer goes through a few steps, basically allocating the
      controller hardware resources and initialising a couple of
      circuits. To do so, it needs to keep track of the information used
      throughout these steps. This is done by defining a private
      jz4740_glue structure:
    </para>
    <programlisting linenumbering="numbered">
 struct jz4740_glue {
 	struct device           *dev;
 	struct platform_device  *musb;
 	struct clk		*clk;
 };
    </programlisting>
    <para>
      The dev and musb members are both device structure variables. The
      first one holds generic information about the device, since it's
      the basic device structure, and the latter holds information more
      closely related to the subsystem the device is registered to. The
      clk variable keeps information related to the device clock
      operation.
    </para>
    <para>
      Let's go through the steps of the probe function that leads the
      glue layer to register itself to the controller driver.
    </para>
    <para>
      N.B.: For the sake of readability each function will be split in
      logical parts, each part being shown as if it was independent from
      the others.
    </para>
    <programlisting linenumbering="numbered">
 static int jz4740_probe(struct platform_device *pdev)
 {
 	struct platform_device		*musb;
 	struct jz4740_glue		*glue;
 	struct clk                      *clk;
 	int				ret;
 	glue = devm_kzalloc(&amp;pdev->dev, sizeof(*glue), GFP_KERNEL);
 	if (!glue)
 		return -ENOMEM;
 	musb = platform_device_alloc("musb-hdrc", PLATFORM_DEVID_AUTO);
 	if (!musb) {
 		dev_err(&amp;pdev->dev, "failed to allocate musb device\n");
 		return -ENOMEM;
 	}
 	clk = devm_clk_get(&amp;pdev->dev, "udc");
 	if (IS_ERR(clk)) {
 		dev_err(&amp;pdev->dev, "failed to get clock\n");
 		ret = PTR_ERR(clk);
 		goto err_platform_device_put;
 	}
 	ret = clk_prepare_enable(clk);
 	if (ret) {
 		dev_err(&amp;pdev->dev, "failed to enable clock\n");
 		goto err_platform_device_put;
 	}
 	musb->dev.parent		= &amp;pdev->dev;
 	glue->dev			= &amp;pdev->dev;
 	glue->musb			= musb;
 	glue->clk			= clk;
 	return 0;
 err_platform_device_put:
 	platform_device_put(musb);
 	return ret;
 }
    </programlisting>
    <para>
      The first few lines of the probe function allocate and assign the
      glue, musb and clk variables. The GFP_KERNEL flag (line 8) allows
      the allocation process to sleep and wait for memory, thus being
      usable in a blocking situation. The PLATFORM_DEVID_AUTO flag (line
      12) allows automatic allocation and management of device IDs in
      order to avoid device namespace collisions with explicit IDs. With
      devm_clk_get() (line 18) the glue layer allocates the clock -- the
      <literal>devm_</literal> prefix indicates that clk_get() is
      managed: it automatically frees the allocated clock resource data
      when the device is released -- and enables it.
    </para>
    <para>
      Then come the registration steps:
    </para>
    <programlisting linenumbering="numbered">
 static int jz4740_probe(struct platform_device *pdev)
 {
 	struct musb_hdrc_platform_data	*pdata = &amp;jz4740_musb_platform_data;
 	pdata->platform_ops		= &amp;jz4740_musb_ops;
 	platform_set_drvdata(pdev, glue);
 	ret = platform_device_add_resources(musb, pdev->resource,
 					    pdev->num_resources);
 	if (ret) {
 		dev_err(&amp;pdev->dev, "failed to add resources\n");
 		goto err_clk_disable;
 	}
 	ret = platform_device_add_data(musb, pdata, sizeof(*pdata));
 	if (ret) {
 		dev_err(&amp;pdev->dev, "failed to add platform_data\n");
 		goto err_clk_disable;
 	}
 	return 0;
 err_clk_disable:
 	clk_disable_unprepare(clk);
 err_platform_device_put:
 	platform_device_put(musb);
 	return ret;
 }
    </programlisting>
    <para>
      The first step is to pass the device data privately held by the
      glue layer on to the controller driver through
      platform_set_drvdata() (line 7). Next is passing on the device
      resources information, also privately held at that point, through
      platform_device_add_resources() (line 9).
    </para>
    <para>
      Finally comes passing on the platform specific data to the
      controller driver (line 16). Platform data will be discussed in
      <link linkend="device-platform-data">Chapter 4</link>, but here
      we are looking at the platform_ops function pointer (line 5) in
      musb_hdrc_platform_data structure (line 3).  This function
      pointer allows the MUSB controller driver to know which function
      to call for device operation:
    </para>
    <programlisting linenumbering="numbered">
 static const struct musb_platform_ops jz4740_musb_ops = {
 	.init		= jz4740_musb_init,
 	.exit		= jz4740_musb_exit,
 };
    </programlisting>
    <para>
      Here we have the minimal case where only init and exit functions
      are called by the controller driver when needed. Fact is the
      JZ4740 MUSB controller is a basic controller, lacking some
      features found in other controllers, otherwise we may also have
      pointers to a few other functions like a power management function
      or a function to switch between OTG and non-OTG modes, for
      instance.
    </para>
    <para>
      At that point of the registration process, the controller driver
      actually calls the init function:
    </para>
    <programlisting linenumbering="numbered">
 static int jz4740_musb_init(struct musb *musb)
 {
 	musb->xceiv = usb_get_phy(USB_PHY_TYPE_USB2);
 	if (!musb->xceiv) {
 		pr_err("HS UDC: no transceiver configured\n");
 		return -ENODEV;
 	}
 	/* Silicon does not implement ConfigData register.
 	 * Set dyn_fifo to avoid reading EP config from hardware.
 	 */
 	musb->dyn_fifo = true;
 	musb->isr = jz4740_musb_interrupt;
 	return 0;
 }
    </programlisting>
    <para>
      The goal of jz4740_musb_init() is to get hold of the transceiver
      driver data of the MUSB controller hardware and pass it on to the
      MUSB controller driver, as usual. The transceiver is the circuitry
      inside the controller hardware responsible for sending/receiving
      the USB data. Since it is an implementation of the physical layer
      of the OSI model, the transceiver is also referred to as PHY.
    </para>
    <para>
      Getting hold of the MUSB PHY driver data is done with
      usb_get_phy() which returns a pointer to the structure
      containing the driver instance data. The next couple of
      instructions (line 12 and 14) are used as a quirk and to set up
      IRQ handling respectively. Quirks and IRQ handling will be
      discussed later in <link linkend="device-quirks">Chapter
      5</link> and <link linkend="handling-irqs">Chapter 3</link>.
    </para>
    <programlisting linenumbering="numbered">
 static int jz4740_musb_exit(struct musb *musb)
 {
 	usb_put_phy(musb->xceiv);
 	return 0;
 }
    </programlisting>
    <para>
      Acting as the counterpart of init, the exit function releases the
      MUSB PHY driver when the controller hardware itself is about to be
      released.
    </para>
    <para>
      Again, note that init and exit are fairly simple in this case due
      to the basic set of features of the JZ4740 controller hardware.
      When writing an musb glue layer for a more complex controller
      hardware, you might need to take care of more processing in those
      two functions.
    </para>
    <para>
      Returning from the init function, the MUSB controller driver jumps
      back into the probe function:
    </para>
    <programlisting linenumbering="numbered">
 static int jz4740_probe(struct platform_device *pdev)
 {
 	ret = platform_device_add(musb);
 	if (ret) {
 		dev_err(&amp;pdev->dev, "failed to register musb device\n");
 		goto err_clk_disable;
 	}
 	return 0;
 err_clk_disable:
 	clk_disable_unprepare(clk);
 err_platform_device_put:
 	platform_device_put(musb);
 	return ret;
 }
    </programlisting>
    <para>
      This is the last part of the device registration process where the
      glue layer adds the controller hardware device to Linux kernel
      device hierarchy: at this stage, all known information about the
      device is passed on to the Linux USB core stack.
    </para>
    <programlisting linenumbering="numbered">
 static int jz4740_remove(struct platform_device *pdev)
 {
 	struct jz4740_glue	*glue = platform_get_drvdata(pdev);
 	platform_device_unregister(glue->musb);
 	clk_disable_unprepare(glue->clk);
 	return 0;
 }
    </programlisting>
    <para>
      Acting as the counterpart of probe, the remove function unregisters
      the MUSB controller hardware (line 5) and disables the clock (line
      6), allowing it to be gated.
    </para>
  </chapter>
  <chapter id="handling-irqs">
    <title>Handling IRQs</title>
    <para>
      In addition to the MUSB controller hardware basic setup and
      registration, the glue layer is also responsible for handling the
      IRQs:
    </para>
    <programlisting linenumbering="numbered">
 static irqreturn_t jz4740_musb_interrupt(int irq, void *__hci)
 {
 	unsigned long   flags;
 	irqreturn_t     retval = IRQ_NONE;
 	struct musb     *musb = __hci;
 	spin_lock_irqsave(&amp;musb->lock, flags);
 	musb->int_usb = musb_readb(musb->mregs, MUSB_INTRUSB);
 	musb->int_tx = musb_readw(musb->mregs, MUSB_INTRTX);
 	musb->int_rx = musb_readw(musb->mregs, MUSB_INTRRX);
 	/*
 	 * The controller is gadget only, the state of the host mode IRQ bits is
 	 * undefined. Mask them to make sure that the musb driver core will
 	 * never see them set
 	 */
 	musb->int_usb &amp;= MUSB_INTR_SUSPEND | MUSB_INTR_RESUME |
 	    MUSB_INTR_RESET | MUSB_INTR_SOF;
 	if (musb->int_usb || musb->int_tx || musb->int_rx)
 		retval = musb_interrupt(musb);
 	spin_unlock_irqrestore(&amp;musb->lock, flags);
 	return retval;
 }
    </programlisting>
    <para>
      Here the glue layer mostly has to read the relevant hardware
      registers and pass their values on to the controller driver which
      will handle the actual event that triggered the IRQ.
    </para>
    <para>
      The interrupt handler critical section is protected by the
      spin_lock_irqsave() and counterpart spin_unlock_irqrestore()
      functions (line 7 and 24 respectively), which prevent the
      interrupt handler code from being run by two different threads at
      the same time.
    </para>
    <para>
      Then the relevant interrupt registers are read (line 9 to 11):
    </para>
    <itemizedlist>
      <listitem>
        <para>
          MUSB_INTRUSB: indicates which USB interrupts are currently
          active,
        </para>
      </listitem>
      <listitem>
        <para>
          MUSB_INTRTX: indicates which of the interrupts for TX
          endpoints are currently active,
        </para>
      </listitem>
      <listitem>
        <para>
          MUSB_INTRRX: indicates which of the interrupts for RX
          endpoints are currently active.
        </para>
      </listitem>
    </itemizedlist>
    <para>
      Note that musb_readb() is used to read 8-bit registers at most,
      while musb_readw() allows us to read at most 16-bit registers.
      There are other functions that can be used depending on the size
      of your device registers. See musb_io.h for more information.
    </para>
    <para>
      Instruction on line 18 is another quirk specific to the JZ4740
      USB device controller, which will be discussed later in <link
      linkend="device-quirks">Chapter 5</link>.
    </para>
    <para>
      The glue layer still needs to register the IRQ handler though.
      Remember the instruction on line 14 of the init function:
    </para>
    <programlisting linenumbering="numbered">
 static int jz4740_musb_init(struct musb *musb)
 {
 	musb->isr = jz4740_musb_interrupt;
 	return 0;
 }
    </programlisting>
    <para>
      This instruction sets a pointer to the glue layer IRQ handler
      function, in order for the controller driver to call the handler
      back when an IRQ comes from the controller hardware. The interrupt
      handler is now implemented and registered.
    </para>
  </chapter>
  <chapter id="device-platform-data">
    <title>Device Platform Data</title>
    <para>
      In order to write an MUSB glue layer, you need to have some data
      describing the hardware capabilities of your controller hardware,
      which is called the platform data.
    </para>
    <para>
      Platform data is specific to your hardware, though it may cover a
      broad range of devices, and is generally found somewhere in the
      arch/ directory, depending on your device architecture.
    </para>
    <para>
      For instance, platform data for the JZ4740 SoC is found in
      arch/mips/jz4740/platform.c. In the platform.c file each device of
      the JZ4740 SoC is described through a set of structures.
    </para>
    <para>
      Here is the part of arch/mips/jz4740/platform.c that covers the
      USB Device Controller (UDC):
    </para>
    <programlisting linenumbering="numbered">
 /* USB Device Controller */
 struct platform_device jz4740_udc_xceiv_device = {
 	.name = "usb_phy_gen_xceiv",
 	.id   = 0,
 };
 static struct resource jz4740_udc_resources[] = {
 	[0] = {
 		.start = JZ4740_UDC_BASE_ADDR,
 		.end   = JZ4740_UDC_BASE_ADDR + 0x10000 - 1,
 		.flags = IORESOURCE_MEM,
 	},
 	[1] = {
 		.start = JZ4740_IRQ_UDC,
 		.end   = JZ4740_IRQ_UDC,
 		.flags = IORESOURCE_IRQ,
 		.name  = "mc",
 	},
 };
 struct platform_device jz4740_udc_device = {
 	.name = "musb-jz4740",
 	.id   = -1,
 	.dev  = {
 		.dma_mask          = &amp;jz4740_udc_device.dev.coherent_dma_mask,
 		.coherent_dma_mask = DMA_BIT_MASK(32),
 	},
 	.num_resources = ARRAY_SIZE(jz4740_udc_resources),
 	.resource      = jz4740_udc_resources,
 };
    </programlisting>
    <para>
      The jz4740_udc_xceiv_device platform device structure (line 2)
      describes the UDC transceiver with a name and id number.
    </para>
    <para>
      At the time of this writing, note that
      &quot;usb_phy_gen_xceiv&quot; is the specific name to be used for
      all transceivers that are either built-in with reference USB IP or
      autonomous and don't require any PHY programming. You will need
      to set CONFIG_NOP_USB_XCEIV=y in the kernel configuration to make
      use of the corresponding transceiver driver. The id field could be
      set to -1 (equivalent to PLATFORM_DEVID_NONE), -2 (equivalent to
      PLATFORM_DEVID_AUTO) or start with 0 for the first device of this
      kind if we want a specific id number.
    </para>
    <para>
      The jz4740_udc_resources resource structure (line 7) defines the
      UDC registers base addresses.
    </para>
    <para>
      The first array (line 9 to 11) defines the UDC registers base
      memory addresses: start points to the first register memory
      address, end points to the last register memory address and the
      flags member defines the type of resource we are dealing with. So
      IORESOURCE_MEM is used to define the registers memory addresses.
      The second array (line 14 to 17) defines the UDC IRQ registers
      addresses. Since there is only one IRQ register available for the
      JZ4740 UDC, start and end point at the same address. The
      IORESOURCE_IRQ flag tells that we are dealing with IRQ resources,
      and the name &quot;mc&quot; is in fact hard-coded in the MUSB core
      in order for the controller driver to retrieve this IRQ resource
      by querying it by its name.
    </para>
    <para>
      Finally, the jz4740_udc_device platform device structure (line 21)
      describes the UDC itself.
    </para>
    <para>
      The &quot;musb-jz4740&quot; name (line 22) defines the MUSB
      driver that is used for this device; remember this is in fact
      the name that we used in the jz4740_driver platform driver
      structure in <link linkend="linux-musb-basics">Chapter
      2</link>. The id field (line 23) is set to -1 (equivalent to
      PLATFORM_DEVID_NONE) since we do not need an id for the device:
      the MUSB controller driver was already set to allocate an
      automatic id in <link linkend="linux-musb-basics">Chapter
      2</link>. In the dev field we care for DMA related information
      here. The dma_mask field (line 25) defines the width of the DMA
      mask that is going to be used, and coherent_dma_mask (line 26)
      has the same purpose but for the alloc_coherent DMA mappings: in
      both cases we are using a 32 bits mask. Then the resource field
      (line 29) is simply a pointer to the resource structure defined
      before, while the num_resources field (line 28) keeps track of
      the number of arrays defined in the resource structure (in this
      case there were two resource arrays defined before).
    </para>
    <para>
      With this quick overview of the UDC platform data at the arch/
      level now done, let's get back to the MUSB glue layer specific
      platform data in drivers/usb/musb/jz4740.c:
    </para>
    <programlisting linenumbering="numbered">
 static struct musb_hdrc_config jz4740_musb_config = {
 	/* Silicon does not implement USB OTG. */
 	.multipoint = 0,
 	/* Max EPs scanned, driver will decide which EP can be used. */
 	.num_eps    = 4,
 	/* RAMbits needed to configure EPs from table */
 	.ram_bits   = 9,
 	.fifo_cfg = jz4740_musb_fifo_cfg,
 	.fifo_cfg_size = ARRAY_SIZE(jz4740_musb_fifo_cfg),
 };
 static struct musb_hdrc_platform_data jz4740_musb_platform_data = {
 	.mode   = MUSB_PERIPHERAL,
 	.config = &amp;jz4740_musb_config,
 };
    </programlisting>
    <para>
      First the glue layer configures some aspects of the controller
      driver operation related to the controller hardware specifics.
      This is done through the jz4740_musb_config musb_hdrc_config
      structure.
    </para>
    <para>
      Defining the OTG capability of the controller hardware, the
      multipoint member (line 3) is set to 0 (equivalent to false)
      since the JZ4740 UDC is not OTG compatible. Then num_eps (line
      5) defines the number of USB endpoints of the controller
      hardware, including endpoint 0: here we have 3 endpoints +
      endpoint 0. Next is ram_bits (line 7) which is the width of the
      RAM address bus for the MUSB controller hardware. This
      information is needed when the controller driver cannot
      automatically configure endpoints by reading the relevant
      controller hardware registers. This issue will be discussed when
      we get to device quirks in <link linkend="device-quirks">Chapter
      5</link>. Last two fields (line 8 and 9) are also about device
      quirks: fifo_cfg points to the USB endpoints configuration table
      and fifo_cfg_size keeps track of the number of
      entries in that configuration table. More on that later in <link
      linkend="device-quirks">Chapter 5</link>.
    </para>
    <para>
      Then this configuration is embedded inside
      jz4740_musb_platform_data musb_hdrc_platform_data structure (line
      11): config is a pointer to the configuration structure itself,
      and mode tells the controller driver if the controller hardware
      may be used as MUSB_HOST only, MUSB_PERIPHERAL only or MUSB_OTG
      which is a dual mode.
    </para>
    <para>
      Remember that jz4740_musb_platform_data is then used to convey
      platform data information as we have seen in the probe function
      in <link linkend="linux-musb-basics">Chapter 2</link>.
    </para>
  </chapter>
  <chapter id="device-quirks">
    <title>Device Quirks</title>
    <para>
      Completing the platform data specific to your device, you may also
      need to write some code in the glue layer to work around some
      device specific limitations. These quirks may be due to some
      hardware bugs, or simply be the result of an incomplete
      implementation of the USB On-the-Go specification.
    </para>
    <para>
      The JZ4740 UDC exhibits such quirks, some of which we will discuss
      here for the sake of insight even though these might not be found
      in the controller hardware you are working on.
    </para>
    <para>
      Let's get back to the init function first:
    </para>
    <programlisting linenumbering="numbered">
 static int jz4740_musb_init(struct musb *musb)
 {
 	musb->xceiv = usb_get_phy(USB_PHY_TYPE_USB2);
 	if (!musb->xceiv) {
 		pr_err("HS UDC: no transceiver configured\n");
 		return -ENODEV;
 	}
 	/* Silicon does not implement ConfigData register.
 	 * Set dyn_fifo to avoid reading EP config from hardware.
 	 */
 	musb->dyn_fifo = true;
 	musb->isr = jz4740_musb_interrupt;
 	return 0;
 }
    </programlisting>
    <para>
      Instruction on line 12 helps the MUSB controller driver to work
      around the fact that the controller hardware is missing registers
      that are used for USB endpoints configuration.
    </para>
    <para>
      Without these registers, the controller driver is unable to read
      the endpoints configuration from the hardware, so we use line 12
      instruction to bypass reading the configuration from silicon, and
      rely on a hard-coded table that describes the endpoints
      configuration instead:
    </para>
    <programlisting linenumbering="numbered">
 static struct musb_fifo_cfg jz4740_musb_fifo_cfg[] = {
 { .hw_ep_num = 1, .style = FIFO_TX, .maxpacket = 512, },
 { .hw_ep_num = 1, .style = FIFO_RX, .maxpacket = 512, },
 { .hw_ep_num = 2, .style = FIFO_TX, .maxpacket = 64, },
 };
    </programlisting>
    <para>
      Looking at the configuration table above, we see that each
      endpoint is described by three fields: hw_ep_num is the endpoint
      number, style is its direction (either FIFO_TX for the controller
      driver to send packets in the controller hardware, or FIFO_RX to
      receive packets from hardware), and maxpacket defines the maximum
      size of each data packet that can be transmitted over that
      endpoint. Reading from the table, the controller driver knows that
      endpoint 1 can be used to send and receive USB data packets of 512
      bytes at once (this is in fact a bulk in/out endpoint), and
      endpoint 2 can be used to send data packets of 64 bytes at once
      (this is in fact an interrupt endpoint).
    </para>
    <para>
      Note that there is no information about endpoint 0 here: that one
      is implemented by default in every silicon design, with a
      predefined configuration according to the USB specification. For
      more examples of endpoint configuration tables, see musb_core.c.
    </para>
    <para>
      Let's now get back to the interrupt handler function:
    </para>
    <programlisting linenumbering="numbered">
 static irqreturn_t jz4740_musb_interrupt(int irq, void *__hci)
 {
 	unsigned long   flags;
 	irqreturn_t     retval = IRQ_NONE;
 	struct musb     *musb = __hci;
 	spin_lock_irqsave(&amp;musb->lock, flags);
 	musb->int_usb = musb_readb(musb->mregs, MUSB_INTRUSB);
 	musb->int_tx = musb_readw(musb->mregs, MUSB_INTRTX);
 	musb->int_rx = musb_readw(musb->mregs, MUSB_INTRRX);
 	/*
 	 * The controller is gadget only, the state of the host mode IRQ bits is
 	 * undefined. Mask them to make sure that the musb driver core will
 	 * never see them set
 	 */
 	musb->int_usb &amp;= MUSB_INTR_SUSPEND | MUSB_INTR_RESUME |
 	    MUSB_INTR_RESET | MUSB_INTR_SOF;
 	if (musb->int_usb || musb->int_tx || musb->int_rx)
 		retval = musb_interrupt(musb);
 	spin_unlock_irqrestore(&amp;musb->lock, flags);
 	return retval;
 }
    </programlisting>
    <para>
      Instruction on line 18 above is a way for the controller driver to
      work around the fact that some interrupt bits used for USB host
      mode operation are missing in the MUSB_INTRUSB register, thus left
      in an undefined hardware state, since this MUSB controller
      hardware is used in peripheral mode only. As a consequence, the
      glue layer masks these missing bits out to avoid parasite
      interrupts by doing a logical AND operation between the value read
      from MUSB_INTRUSB and the bits that are actually implemented in
      the register.
    </para>
    <para>
      These are only a couple of the quirks found in the JZ4740 USB
      device controller. Some others were directly addressed in the MUSB
      core since the fixes were generic enough to provide a better
      handling of the issues for other controller hardware eventually.
    </para>
  </chapter>
  <chapter id="conclusion">
    <title>Conclusion</title>
    <para>
      Writing a Linux MUSB glue layer should be a more accessible task,
      as this documentation tries to show the ins and outs of this
      exercise.
    </para>
    <para>
      The JZ4740 USB device controller being fairly simple, I hope its
      glue layer serves as a good example for the curious mind. Used
      with the current MUSB glue layers, this documentation should
      provide enough guidance to get started; should anything get out
      of hand, the linux-usb mailing list archive is another helpful
      resource to browse through.
    </para>
  </chapter>
  <chapter id="acknowledgements">
    <title>Acknowledgements</title>
    <para>
      Many thanks to Lars-Peter Clausen and Maarten ter Huurne for
      answering my questions while I was writing the JZ4740 glue layer
      and for helping me out getting the code in good shape.
    </para>
    <para>
      I would also like to thank the Qi-Hardware community at large for
      its cheerful guidance and support.
    </para>
  </chapter>
  <chapter id="resources">
    <title>Resources</title>
    <para>
      USB Home Page:
      <ulink url="http://www.usb.org">http://www.usb.org</ulink>
    </para>
    <para>
      linux-usb Mailing List Archives:
      <ulink url="http://marc.info/?l=linux-usb">http://marc.info/?l=linux-usb</ulink>
    </para>
    <para>
      USB On-the-Go Basics:
      <ulink url="http://www.maximintegrated.com/app-notes/index.mvp/id/1822">http://www.maximintegrated.com/app-notes/index.mvp/id/1822</ulink>
    </para>
    <para>
      Writing USB Device Drivers:
      <ulink url="https://www.kernel.org/doc/htmldocs/writing_usb_driver/index.html">https://www.kernel.org/doc/htmldocs/writing_usb_driver/index.html</ulink>
    </para>
    <para>
      Texas Instruments USB Configuration Wiki Page:
      <ulink url="http://processors.wiki.ti.com/index.php/Usbgeneralpage">http://processors.wiki.ti.com/index.php/Usbgeneralpage</ulink>
    </para>
    <para>
      Analog Devices Blackfin MUSB Configuration:
      <ulink url="http://docs.blackfin.uclinux.org/doku.php?id=linux-kernel:drivers:musb">http://docs.blackfin.uclinux.org/doku.php?id=linux-kernel:drivers:musb</ulink>
    </para>
  </chapter>
 </book>
--- a/Documentation/DocBook/writing_usb_driver.tmpl
+++ b/Documentation/DocBook/writing_usb_driver.tmpl
@@ -1,412 +0,0 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
 	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
 <book id="USBDeviceDriver">
 <bookinfo>
  <title>Writing USB Device Drivers</title>
  <authorgroup>
   <author>
    <firstname>Greg</firstname>
    <surname>Kroah-Hartman</surname>
    <affiliation>
     <address>
      <email>greg@kroah.com</email>
     </address>
    </affiliation>
   </author>
  </authorgroup>
  <copyright>
   <year>2001-2002</year>
   <holder>Greg Kroah-Hartman</holder>
  </copyright>
  <legalnotice>
   <para>
     This documentation is free software; you can redistribute
     it and/or modify it under the terms of the GNU General Public
     License as published by the Free Software Foundation; either
     version 2 of the License, or (at your option) any later
     version.
   </para>
   <para>
     This program is distributed in the hope that it will be
     useful, but WITHOUT ANY WARRANTY; without even the implied
     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
     See the GNU General Public License for more details.
   </para>
   <para>
     You should have received a copy of the GNU General Public
     License along with this program; if not, write to the Free
     Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
     MA 02111-1307 USA
   </para>
   <para>
     For more details see the file COPYING in the source
     distribution of Linux.
   </para>
   <para>
     This documentation is based on an article published in 
     Linux Journal Magazine, October 2001, Issue 90.
   </para>
  </legalnotice>
 </bookinfo>
 <toc></toc>
  <chapter id="intro">
      <title>Introduction</title>
  <para>
      The Linux USB subsystem has grown from supporting only two different
      types of devices in the 2.2.7 kernel (mice and keyboards), to over 20
      different types of devices in the 2.4 kernel. Linux currently supports
      almost all USB class devices (standard types of devices like keyboards,
      mice, modems, printers and speakers) and an ever-growing number of
      vendor-specific devices (such as USB to serial converters, digital
      cameras, Ethernet devices and MP3 players). For a full list of the
      different USB devices currently supported, see Resources.
  </para>
  <para>
      The remaining kinds of USB devices that do not have support on Linux are
      almost all vendor-specific devices. Each vendor decides to implement a
      custom protocol to talk to their device, so a custom driver usually needs
      to be created. Some vendors are open with their USB protocols and help
      with the creation of Linux drivers, while others do not publish them, and
      developers are forced to reverse-engineer. See Resources for some links
      to handy reverse-engineering tools.
  </para>
  <para>
      Because each different protocol causes a new driver to be created, I have
      written a generic USB driver skeleton, modelled after the pci-skeleton.c
      file in the kernel source tree upon which many PCI network drivers have
      been based. This USB skeleton can be found at drivers/usb/usb-skeleton.c
      in the kernel source tree. In this article I will walk through the basics
      of the skeleton driver, explaining the different pieces and what needs to
      be done to customize it to your specific device.
  </para>
  </chapter>
  <chapter id="basics">
      <title>Linux USB Basics</title>
  <para>
      If you are going to write a Linux USB driver, please become familiar with
      the USB protocol specification. It can be found, along with many other
      useful documents, at the USB home page (see Resources). An excellent
      introduction to the Linux USB subsystem can be found at the USB Working
      Devices List (see Resources). It explains how the Linux USB subsystem is
      structured and introduces the reader to the concept of USB urbs
      (USB Request Blocks), which are essential to USB drivers.
  </para>
  <para>
      The first thing a Linux USB driver needs to do is register itself with
      the Linux USB subsystem, giving it some information about which devices
      the driver supports and which functions to call when a device supported
      by the driver is inserted or removed from the system. All of this
      information is passed to the USB subsystem in the usb_driver structure.
      The skeleton driver declares a usb_driver as:
  </para>
  <programlisting>
 static struct usb_driver skel_driver = {
        .name        = "skeleton",
        .probe       = skel_probe,
        .disconnect  = skel_disconnect,
        .fops        = &amp;skel_fops,
        .minor       = USB_SKEL_MINOR_BASE,
        .id_table    = skel_table,
 };
  </programlisting>
  <para>
      The variable name is a string that describes the driver. It is used in
      informational messages printed to the system log. The probe and
      disconnect function pointers are called when a device that matches the
      information provided in the id_table variable is either seen or removed.
  </para>
  <para>
      The fops and minor variables are optional. Most USB drivers hook into
      another kernel subsystem, such as the SCSI, network or TTY subsystem.
      These types of drivers register themselves with the other kernel
      subsystem, and any user-space interactions are provided through that
      interface. But for drivers that do not have a matching kernel subsystem,
      such as MP3 players or scanners, a method of interacting with user space
      is needed. The USB subsystem provides a way to register a minor device
      number and a set of file_operations function pointers that enable this
      user-space interaction. The skeleton driver needs this kind of interface,
      so it provides a minor starting number and a pointer to its
      file_operations functions.
  </para>
  <para>
      The USB driver is then registered with a call to usb_register, usually in
      the driver's init function, as shown here:
  </para>
  <programlisting>
 static int __init usb_skel_init(void)
 {
        int result;
        /* register this driver with the USB subsystem */
        result = usb_register(&amp;skel_driver);
        if (result &lt; 0) {
                err(&quot;usb_register failed for the &quot;__FILE__ &quot;driver.&quot;
                    &quot;Error number %d&quot;, result);
                return -1;
        }
        return 0;
 }
 module_init(usb_skel_init);
  </programlisting>
  <para>
      When the driver is unloaded from the system, it needs to deregister
      itself with the USB subsystem. This is done with the usb_deregister
      function:
  </para>
  <programlisting>
 static void __exit usb_skel_exit(void)
 {
        /* deregister this driver with the USB subsystem */
        usb_deregister(&amp;skel_driver);
 }
 module_exit(usb_skel_exit);
  </programlisting>
  <para>
     To enable the linux-hotplug system to load the driver automatically when
     the device is plugged in, you need to create a MODULE_DEVICE_TABLE. The
     following code tells the hotplug scripts that this module supports a
     single device with a specific vendor and product ID:
  </para>
  <programlisting>
 /* table of devices that work with this driver */
 static struct usb_device_id skel_table [] = {
        { USB_DEVICE(USB_SKEL_VENDOR_ID, USB_SKEL_PRODUCT_ID) },
        { }                      /* Terminating entry */
 };
 MODULE_DEVICE_TABLE (usb, skel_table);
  </programlisting>
  <para>
     There are other macros that can be used in describing a usb_device_id for
     drivers that support a whole class of USB devices. See usb.h for more
     information on this.
  </para>
  </chapter>
  <chapter id="device">
      <title>Device operation</title>
  <para>
     When a device is plugged into the USB bus that matches the device ID
     pattern that your driver registered with the USB core, the probe function
     is called. The usb_device structure, interface number and the interface ID
     are passed to the function:
  </para>
  <programlisting>
 static int skel_probe(struct usb_interface *interface,
    const struct usb_device_id *id)
  </programlisting>
  <para>
     The driver now needs to verify that this device is actually one that it
     can accept. If so, it returns 0.
     If not, or if any error occurs during initialization, an error code
     (such as <literal>-ENOMEM</literal> or <literal>-ENODEV</literal>)
     is returned from the probe function.
  </para>
  <para>
     In the skeleton driver, we determine what end points are marked as bulk-in
     and bulk-out. We create buffers to hold the data that will be sent and
     received from the device, and a USB urb to write data to the device is
     initialized.
  </para>
  <para>
     Conversely, when the device is removed from the USB bus, the disconnect
     function is called with the device pointer. The driver needs to clean any
     private data that has been allocated at this time and to shut down any
     pending urbs that are in the USB system.
  </para>
  <para>
     Now that the device is plugged into the system and the driver is bound to
     the device, any of the functions in the file_operations structure that
     were passed to the USB subsystem will be called from a user program trying
     to talk to the device. The first function called will be open, as the
     program tries to open the device for I/O. We increment our private usage
     count and save a pointer to our internal structure in the file
     structure. This is done so that future calls to file operations will
     enable the driver to determine which device the user is addressing.  All
     of this is done with the following code:
  </para>
  <programlisting>
 /* increment our usage count for the module */
 ++skel->open_count;
 /* save our object in the file's private structure */
 file->private_data = dev;
  </programlisting>
  <para>
     After the open function is called, the read and write functions are called
     to receive and send data to the device. In the skel_write function, we
     receive a pointer to some data that the user wants to send to the device
     and the size of the data. The function determines how much data it can
     send to the device based on the size of the write urb it has created (this
     size depends on the size of the bulk out end point that the device has).
     Then it copies the data from user space to kernel space, points the urb to
     the data and submits the urb to the USB subsystem.  This can be seen in
     the following code:
  </para>
  <programlisting>
 /* we can only write as much as 1 urb will hold */
 bytes_written = (count > skel->bulk_out_size) ? skel->bulk_out_size : count;
 /* copy the data from user space into our urb */
 copy_from_user(skel->write_urb->transfer_buffer, buffer, bytes_written);
 /* set up our urb */
 usb_fill_bulk_urb(skel->write_urb,
                  skel->dev,
                  usb_sndbulkpipe(skel->dev, skel->bulk_out_endpointAddr),
                  skel->write_urb->transfer_buffer,
                  bytes_written,
                  skel_write_bulk_callback,
                  skel);
 /* send the data out the bulk port */
 result = usb_submit_urb(skel->write_urb);
 if (result) {
        err(&quot;Failed submitting write urb, error %d&quot;, result);
 }
  </programlisting>
  <para>
     When the write urb is filled up with the proper information using the
     usb_fill_bulk_urb function, we point the urb's completion callback to call our
     own skel_write_bulk_callback function. This function is called when the
     urb is finished by the USB subsystem. The callback function is called in
     interrupt context, so caution must be taken not to do very much processing
     at that time. Our implementation of skel_write_bulk_callback merely
     reports if the urb was completed successfully or not and then returns.
  </para>
  <para>
     The read function works a bit differently from the write function in that
     we do not use an urb to transfer data from the device to the driver.
     Instead we call the usb_bulk_msg function, which can be used to send or
     receive data from a device without having to create urbs and handle
     urb completion callback functions. We call the usb_bulk_msg function,
     giving it a buffer into which to place any data received from the device
     and a timeout value. If the timeout period expires without receiving any
     data from the device, the function will fail and return an error code.
     This can be shown with the following code:
  </para>
  <programlisting>
 /* do an immediate bulk read to get data from the device */
 retval = usb_bulk_msg (skel->dev,
                       usb_rcvbulkpipe (skel->dev,
                       skel->bulk_in_endpointAddr),
                       skel->bulk_in_buffer,
                       skel->bulk_in_size,
                       &amp;count, HZ*10);
 /* if the read was successful, copy the data to user space */
 if (!retval) {
        if (copy_to_user (buffer, skel->bulk_in_buffer, count))
                retval = -EFAULT;
        else
                retval = count;
 }
  </programlisting>
  <para>
     The usb_bulk_msg function can be very useful for doing single reads or
     writes to a device; however, if you need to read or write constantly to a
     device, it is recommended to set up your own urbs and submit them to the
     USB subsystem.
  </para>
  <para>
     When the user program releases the file handle that it has been using to
     talk to the device, the release function in the driver is called. In this
     function we decrement our private usage count and wait for possible
     pending writes:
  </para>
  <programlisting>
 /* decrement our usage count for the device */
 --skel->open_count;
  </programlisting>
  <para>
     One of the more difficult problems that USB drivers must be able to handle
     smoothly is the fact that the USB device may be removed from the system at
     any point in time, even if a program is currently talking to it. It needs
     to be able to shut down any current reads and writes and notify the
     user-space programs that the device is no longer there. The following
     code (function <function>skel_delete</function>)
     is an example of how to do this: </para>
  <programlisting>
 static inline void skel_delete (struct usb_skel *dev)
 {
    kfree (dev->bulk_in_buffer);
    if (dev->bulk_out_buffer != NULL)
        usb_free_coherent (dev->udev, dev->bulk_out_size,
            dev->bulk_out_buffer,
            dev->write_urb->transfer_dma);
    usb_free_urb (dev->write_urb);
    kfree (dev);
 }
  </programlisting>
  <para>
     If a program currently has an open handle to the device, we reset the flag
     <literal>device_present</literal>. For
     every read, write, release and other functions that expect a device to be
     present, the driver first checks this flag to see if the device is
     still present. If not, it realizes that the device has disappeared, and a
     -ENODEV error is returned to the user-space program. When the release
     function is eventually called, it determines if there is no device
     and if not, it does the cleanup that the skel_disconnect
     function normally does if there are no open files on the device (see
     Listing 5).
  </para>
  </chapter>
  <chapter id="iso">
      <title>Isochronous Data</title>
  <para>
     This usb-skeleton driver does not have any examples of interrupt or
     isochronous data being sent to or from the device. Interrupt data is sent
     almost exactly as bulk data is, with a few minor exceptions.  Isochronous
     data works differently with continuous streams of data being sent to or
     from the device. The audio and video camera drivers are very good examples
     of drivers that handle isochronous data and will be useful if you also
     need to do this.
  </para>
  </chapter>
  <chapter id="Conclusion">
      <title>Conclusion</title>
  <para>
     Writing Linux USB device drivers is not a difficult task as the
     usb-skeleton driver shows. This driver, combined with the other current
     USB drivers, should provide enough examples to help a beginning author
     create a working driver in a minimal amount of time. The linux-usb-devel
     mailing list archives also contain a lot of helpful information.
  </para>
  </chapter>
  <chapter id="resources">
      <title>Resources</title>
  <para>
     The Linux USB Project: <ulink url="http://www.linux-usb.org">http://www.linux-usb.org/</ulink>
  </para>
  <para>
     Linux Hotplug Project: <ulink url="http://linux-hotplug.sourceforge.net">http://linux-hotplug.sourceforge.net/</ulink>
  </para>
  <para>
     Linux USB Working Devices List: <ulink url="http://www.qbik.ch/usb/devices">http://www.qbik.ch/usb/devices/</ulink>
  </para>
  <para>
     linux-usb-devel Mailing List Archives: <ulink url="http://marc.theaimsgroup.com/?l=linux-usb-devel">http://marc.theaimsgroup.com/?l=linux-usb-devel</ulink>
  </para>
  <para>
     Programming Guide for Linux USB Device Drivers: <ulink url="http://usb.cs.tum.edu/usbdoc">http://usb.cs.tum.edu/usbdoc</ulink>
  </para>
  <para>
     USB Home Page: <ulink url="http://www.usb.org">http://www.usb.org</ulink>
  </para>
  </chapter>
 </book>
--- a/Documentation/EDID/edid.S
+++ b/Documentation/EDID/edid.S
@@ -59,9 +59,9 @@
 /* Fixed header pattern */
 header:		.byte	0x00,0xff,0xff,0xff,0xff,0xff,0xff,0x00
-mfg_id:		.word	swap16(mfgname2id(MFG_LNX1, MFG_LNX2, MFG_LNX3))
+mfg_id:		.hword	swap16(mfgname2id(MFG_LNX1, MFG_LNX2, MFG_LNX3))
-prod_code:	.word	0
+prod_code:	.hword	0
 /* Serial number. 32 bits, little endian. */
 serial_number:	.long	SERIAL
@@ -177,7 +177,7 @@ std_vres:	.byte	(XY_RATIO<<6)+VFREQ-60
 descriptor1:
 /* Pixel clock in 10 kHz units. (0.-655.35 MHz, little-endian) */
-clock:		.word	CLOCK/10
+clock:		.hword	CLOCK/10
 /* Horizontal active pixels 8 lsbits (0-4095) */
 x_act_lsb:	.byte	XPIX&0xff
--- a/Documentation/PCI/00-INDEX
+++ b/Documentation/PCI/00-INDEX
@@ -12,3 +12,13 @@ pci.txt
 	- info on the PCI subsystem for device driver authors
 pcieaer-howto.txt
 	- the PCI Express Advanced Error Reporting Driver Guide HOWTO
 endpoint/pci-endpoint.txt
 	- guide to add endpoint controller driver and endpoint function driver.
 endpoint/pci-endpoint-cfs.txt
 	- guide to use configfs to configure the PCI endpoint function.
 endpoint/pci-test-function.txt
 	- specification of *PCI test* function device.
 endpoint/pci-test-howto.txt
 	- userguide for PCI endpoint test function.
 endpoint/function/binding/
 	- binding documentation for PCI endpoint function
--- a/Documentation/PCI/endpoint/function/binding/pci-test.txt
+++ b/Documentation/PCI/endpoint/function/binding/pci-test.txt
@@ -0,0 +1,17 @@
 PCI TEST ENDPOINT FUNCTION
 name: Should be "pci_epf_test" to bind to the pci_epf_test driver.
 Configurable Fields:
 vendorid	 : should be 0x104c
 deviceid	 : should be 0xb500 for DRA74x and 0xb501 for DRA72x
 revid		 : don't care
 progif_code	 : don't care
 subclass_code	 : don't care
 baseclass_code	 : should be 0xff
 cache_line_size	 : don't care
 subsys_vendor_id : don't care
 subsys_id	 : don't care
 interrupt_pin	 : Should be 1 - INTA, 2 - INTB, 3 - INTC, 4 -INTD
 msi_interrupts	 : Should be 1 to 32 depending on the number of MSI interrupts
 		   to test
--- a/Documentation/PCI/endpoint/pci-endpoint-cfs.txt
+++ b/Documentation/PCI/endpoint/pci-endpoint-cfs.txt
@@ -0,0 +1,105 @@
                   CONFIGURING PCI ENDPOINT USING CONFIGFS
                    Kishon Vijay Abraham I <kishon@ti.com>
 The PCI Endpoint Core exposes configfs entry (pci_ep) to configure the
 PCI endpoint function and to bind the endpoint function
 with the endpoint controller. (For introducing other mechanisms to
 configure the PCI Endpoint Function refer to [1]).
 *) Mounting configfs
 The PCI Endpoint Core layer creates pci_ep directory in the mounted configfs
 directory. configfs can be mounted using the following command.
 	mount -t configfs none /sys/kernel/config
 *) Directory Structure
 The pci_ep configfs has two directories at its root: controllers and
 functions. Every EPC device present in the system will have an entry in
 the *controllers* directory and every EPF driver present in the system
 will have an entry in the *functions* directory.
 /sys/kernel/config/pci_ep/
 	.. controllers/
 	.. functions/
 *) Creating EPF Device
 Every registered EPF driver will be listed in functions directory. The
 entries corresponding to EPF driver will be created by the EPF core.
 /sys/kernel/config/pci_ep/functions/
 	.. <EPF Driver1>/
 		... <EPF Device 11>/
 		... <EPF Device 21>/
 	.. <EPF Driver2>/
 		... <EPF Device 12>/
 		... <EPF Device 22>/
 In order to create a <EPF device> of the type probed by <EPF Driver>, the
 user has to create a directory inside <EPF DriverN>.
 Every <EPF device> directory consists of the following entries that can be
 used to configure the standard configuration header of the endpoint function.
 (These entries are created by the framework when any new <EPF Device> is
 created)
 	.. <EPF Driver1>/
 		... <EPF Device 11>/
 			... vendorid
 			... deviceid
 			... revid
 			... progif_code
 			... subclass_code
 			... baseclass_code
 			... cache_line_size
 			... subsys_vendor_id
 			... subsys_id
 			... interrupt_pin
 *) EPC Device
 Every registered EPC device will be listed in controllers directory. The
 entries corresponding to EPC device will be created by the EPC core.
 /sys/kernel/config/pci_ep/controllers/
 	.. <EPC Device1>/
 		... <Symlink EPF Device11>/
 		... <Symlink EPF Device12>/
 		... start
 	.. <EPC Device2>/
 		... <Symlink EPF Device21>/
 		... <Symlink EPF Device22>/
 		... start
 The <EPC Device> directory will have a list of symbolic links to
 <EPF Device>. These symbolic links should be created by the user to
 represent the functions present in the endpoint device.
 The <EPC Device> directory will also have a *start* field. Once
 "1" is written to this field, the endpoint device will be ready to
 establish the link with the host. This is usually done after
 all the EPF devices are created and linked with the EPC device.
 			 | controllers/
 				| <Directory: EPC name>/
 					| <Symbolic Link: Function>
 					| start
 			 | functions/
 				| <Directory: EPF driver>/
 					| <Directory: EPF device>/
 						| vendorid
 						| deviceid
 						| revid
 						| progif_code
 						| subclass_code
 						| baseclass_code
 						| cache_line_size
 						| subsys_vendor_id
 						| subsys_id
 						| interrupt_pin
 						| function
 [1] -> Documentation/PCI/endpoint/pci-endpoint.txt
--- a/Documentation/PCI/endpoint/pci-endpoint.txt
+++ b/Documentation/PCI/endpoint/pci-endpoint.txt
@@ -0,0 +1,215 @@
 			    PCI ENDPOINT FRAMEWORK
 		    Kishon Vijay Abraham I <kishon@ti.com>
 This document is a guide to use the PCI Endpoint Framework in order to create
 an endpoint controller driver and an endpoint function driver, and to use the
 configfs interface to bind the function driver to the controller driver.
 1. Introduction
 Linux has a comprehensive PCI subsystem to support PCI controllers that
 operate in Root Complex mode. The subsystem has capability to scan PCI bus,
 assign memory resources and IRQ resources, load PCI driver (based on
 vendor ID, device ID), support other services like hot-plug, power management,
 advanced error reporting and virtual channels.
 However the PCI controller IP integrated in some SoCs is capable of operating
 either in Root Complex mode or Endpoint mode. PCI Endpoint Framework will
 add endpoint mode support in Linux. This will help to run Linux in an
 EP system which can have a wide variety of use cases from testing or
 validation, co-processor accelerator, etc.
 2. PCI Endpoint Core
 The PCI Endpoint Core layer comprises 3 components: the Endpoint Controller
 library, the Endpoint Function library, and the configfs layer to bind the
 endpoint function with the endpoint controller.
 2.1 PCI Endpoint Controller(EPC) Library
 The EPC library provides APIs to be used by the controller that can operate
 in endpoint mode. It also provides APIs to be used by function driver/library
 in order to implement a particular endpoint function.
 2.1.1 APIs for the PCI controller Driver
 This section lists the APIs that the PCI Endpoint core provides to be used
 by the PCI controller driver.
 *) devm_pci_epc_create()/pci_epc_create()
   The PCI controller driver should implement the following ops:
 	 * write_header: ops to populate configuration space header
 	 * set_bar: ops to configure the BAR
 	 * clear_bar: ops to reset the BAR
 	 * alloc_addr_space: ops to allocate in PCI controller address space
 	 * free_addr_space: ops to free the allocated address space
 	 * raise_irq: ops to raise a legacy or MSI interrupt
 	 * start: ops to start the PCI link
 	 * stop: ops to stop the PCI link
   The PCI controller driver can then create a new EPC device by invoking
   devm_pci_epc_create()/pci_epc_create().
 *) devm_pci_epc_destroy()/pci_epc_destroy()
   The PCI controller driver can destroy the EPC device created by either
   devm_pci_epc_create() or pci_epc_create() using devm_pci_epc_destroy() or
   pci_epc_destroy().
 *) pci_epc_linkup()
   In order to notify all the function devices that the EPC device to which
   they are linked has established a link with the host, the PCI controller
   driver should invoke pci_epc_linkup().
 *) pci_epc_mem_init()
   Initialize the pci_epc_mem structure used for allocating EPC addr space.
 *) pci_epc_mem_exit()
   Cleanup the pci_epc_mem structure allocated during pci_epc_mem_init().
 2.1.2 APIs for the PCI Endpoint Function Driver
 This section lists the APIs that the PCI Endpoint core provides to be used
 by the PCI endpoint function driver.
 *) pci_epc_write_header()
   The PCI endpoint function driver should use pci_epc_write_header() to
   write the standard configuration header to the endpoint controller.
 *) pci_epc_set_bar()
   The PCI endpoint function driver should use pci_epc_set_bar() to configure
   the Base Address Register in order for the host to assign PCI addr space.
   Register space of the function driver is usually configured
   using this API.
 *) pci_epc_clear_bar()
   The PCI endpoint function driver should use pci_epc_clear_bar() to reset
   the BAR.
 *) pci_epc_raise_irq()
   The PCI endpoint function driver should use pci_epc_raise_irq() to raise
   Legacy Interrupt or MSI Interrupt.
 *) pci_epc_mem_alloc_addr()
   The PCI endpoint function driver should use pci_epc_mem_alloc_addr() to
   allocate memory address from EPC addr space which is required to access
   RC's buffer.
 *) pci_epc_mem_free_addr()
   The PCI endpoint function driver should use pci_epc_mem_free_addr() to
   free the memory space allocated using pci_epc_mem_alloc_addr().
 2.1.3 Other APIs
 There are other APIs provided by the EPC library. These are used for binding
 the EPF device with EPC device. pci-ep-cfs.c can be used as reference for
 using these APIs.
 *) pci_epc_get()
   Get a reference to the PCI endpoint controller based on the device name of
   the controller.
 *) pci_epc_put()
   Release the reference to the PCI endpoint controller obtained using
   pci_epc_get()
 *) pci_epc_add_epf()
   Add a PCI endpoint function to a PCI endpoint controller. A PCIe device
   can have up to 8 functions according to the specification.
 *) pci_epc_remove_epf()
   Remove the PCI endpoint function from PCI endpoint controller.
 *) pci_epc_start()
   The PCI endpoint function driver should invoke pci_epc_start() once it
   has configured the endpoint function and wants to start the PCI link.
 *) pci_epc_stop()
   The PCI endpoint function driver should invoke pci_epc_stop() to stop
   the PCI LINK.
 2.2 PCI Endpoint Function(EPF) Library
 The EPF library provides APIs to be used by the function driver and the EPC
 library to provide endpoint mode functionality.
 2.2.1 APIs for the PCI Endpoint Function Driver
 This section lists the APIs that the PCI Endpoint core provides to be used
 by the PCI endpoint function driver.
 *) pci_epf_register_driver()
   The PCI Endpoint Function driver should implement the following ops:
 	 * bind: ops to perform when a EPC device has been bound to EPF device
 	 * unbind: ops to perform when a binding has been lost between a EPC
 	   device and EPF device
 	 * linkup: ops to perform when the EPC device has established a
 	   connection with a host system
  The PCI Function driver can then register the PCI EPF driver by using
  pci_epf_register_driver().
 *) pci_epf_unregister_driver()
  The PCI Function driver can unregister the PCI EPF driver by using
  pci_epf_unregister_driver().
 *) pci_epf_alloc_space()
  The PCI Function driver can allocate space for a particular BAR using
  pci_epf_alloc_space().
 *) pci_epf_free_space()
  The PCI Function driver can free the allocated space
  (using pci_epf_alloc_space) by invoking pci_epf_free_space().
 2.2.2 APIs for the PCI Endpoint Controller Library
 This section lists the APIs that the PCI Endpoint core provides to be used
 by the PCI endpoint controller library.
 *) pci_epf_linkup()
   The PCI endpoint controller library invokes pci_epf_linkup() when the
   EPC device has established the connection to the host.
 2.2.3 Other APIs
 There are other APIs provided by the EPF library. These are used to notify
 the function driver when the EPF device is bound to the EPC device.
 pci-ep-cfs.c can be used as reference for using these APIs.
 *) pci_epf_create()
   Create a new PCI EPF device by passing the name of the PCI EPF device.
   This name will be used to bind the EPF device to an EPF driver.
 *) pci_epf_destroy()
   Destroy the created PCI EPF device.
 *) pci_epf_bind()
   pci_epf_bind() should be invoked when the EPF device has been bound to
   an EPC device.
 *) pci_epf_unbind()
   pci_epf_unbind() should be invoked when the binding between EPC device
   and EPF device is lost.
--- a/Documentation/PCI/endpoint/pci-test-function.txt
+++ b/Documentation/PCI/endpoint/pci-test-function.txt
@@ -0,0 +1,66 @@
 				PCI TEST
 		    Kishon Vijay Abraham I <kishon@ti.com>
 Traditionally PCI RC has always been validated by using standard
 PCI cards like ethernet PCI cards or USB PCI cards or SATA PCI cards.
 However with the addition of EP-core in linux kernel, it is possible
 to configure a PCI controller that can operate in EP mode to work as
 a test device.
 The PCI endpoint test device is a virtual device (defined in software)
 used to test the endpoint functionality and serve as a sample driver
 for other PCI endpoint devices (to use the EP framework).
 The PCI endpoint test device has the following registers:
 	1) PCI_ENDPOINT_TEST_MAGIC
 	2) PCI_ENDPOINT_TEST_COMMAND
 	3) PCI_ENDPOINT_TEST_STATUS
 	4) PCI_ENDPOINT_TEST_SRC_ADDR
 	5) PCI_ENDPOINT_TEST_DST_ADDR
 	6) PCI_ENDPOINT_TEST_SIZE
 	7) PCI_ENDPOINT_TEST_CHECKSUM
 *) PCI_ENDPOINT_TEST_MAGIC
 This register will be used to test BAR0. A known pattern will be written
 and read back from MAGIC register to verify BAR0.
 *) PCI_ENDPOINT_TEST_COMMAND:
 This register will be used by the host driver to indicate the function
 that the endpoint device must perform.
 Bitfield Description:
  Bit 0		: raise legacy IRQ
  Bit 1		: raise MSI IRQ
  Bit 2 - 7	: MSI interrupt number
  Bit 8		: read command (read data from RC buffer)
  Bit 9		: write command (write data to RC buffer)
  Bit 10	: copy command (copy data from one RC buffer to another
 		  RC buffer)
 *) PCI_ENDPOINT_TEST_STATUS
 This register reflects the status of the PCI endpoint device.
 Bitfield Description:
  Bit 0		: read success
  Bit 1		: read fail
  Bit 2		: write success
  Bit 3		: write fail
  Bit 4		: copy success
  Bit 5		: copy fail
  Bit 6		: IRQ raised
  Bit 7		: source address is invalid
  Bit 8		: destination address is invalid
 *) PCI_ENDPOINT_TEST_SRC_ADDR
 This register contains the source address (RC buffer address) for the
 COPY/READ command.
 *) PCI_ENDPOINT_TEST_DST_ADDR
 This register contains the destination address (RC buffer address) for
 the COPY/WRITE command.
--- a/Documentation/PCI/endpoint/pci-test-howto.txt
+++ b/Documentation/PCI/endpoint/pci-test-howto.txt
@@ -0,0 +1,179 @@
 			    PCI TEST USERGUIDE
 		    Kishon Vijay Abraham I <kishon@ti.com>
 This document is a guide to help users use pci-epf-test function driver
 and pci_endpoint_test host driver for testing PCI. The list of steps to
 be followed in the host side and EP side is given below.
 1. Endpoint Device
 1.1 Endpoint Controller Devices
 To find the list of endpoint controller devices in the system:
 	# ls /sys/class/pci_epc/
 	  51000000.pcie_ep
 If PCI_ENDPOINT_CONFIGFS is enabled
 	# ls /sys/kernel/config/pci_ep/controllers
 	  51000000.pcie_ep
 1.2 Endpoint Function Drivers
 To find the list of endpoint function drivers in the system:
 	# ls /sys/bus/pci-epf/drivers
 	  pci_epf_test
 If PCI_ENDPOINT_CONFIGFS is enabled
 	# ls /sys/kernel/config/pci_ep/functions
 	  pci_epf_test
 1.3 Creating pci-epf-test Device
 PCI endpoint function device can be created using the configfs. To create
 pci-epf-test device, the following commands can be used
 	# mount -t configfs none /sys/kernel/config
 	# cd /sys/kernel/config/pci_ep/
 	# mkdir functions/pci_epf_test/func1
 The "mkdir func1" above creates the pci-epf-test function device that will
 be probed by pci_epf_test driver.
 The PCI endpoint framework populates the directory with the following
 configurable fields.
 	# ls functions/pci_epf_test/func1
 	  baseclass_code	interrupt_pin	revid		subsys_vendor_id
 	  cache_line_size	msi_interrupts	subclass_code	vendorid
 	  deviceid          	progif_code	subsys_id
 The PCI endpoint function driver populates these entries with default values
 when the device is bound to the driver. The pci-epf-test driver populates
 vendorid with 0xffff and interrupt_pin with 0x0001
 	# cat functions/pci_epf_test/func1/vendorid
 	  0xffff
 	# cat functions/pci_epf_test/func1/interrupt_pin
 	  0x0001
 1.4 Configuring pci-epf-test Device
 The user can configure the pci-epf-test device using configfs entry. In order
 to change the vendorid and the number of MSI interrupts used by the function
 device, the following commands can be used.
 	# echo 0x104c > functions/pci_epf_test/func1/vendorid
 	# echo 0xb500 > functions/pci_epf_test/func1/deviceid
 	# echo 16 > functions/pci_epf_test/func1/msi_interrupts
 1.5 Binding pci-epf-test Device to EP Controller
 In order for the endpoint function device to be useful, it has to be bound to
 a PCI endpoint controller driver. Use the configfs to bind the function
 device to one of the controller drivers present in the system.
 	# ln -s functions/pci_epf_test/func1 controllers/51000000.pcie_ep/
 Once the above step is completed, the PCI endpoint is ready to establish a link
 with the host.
 1.6 Start the Link
 In order for the endpoint device to establish a link with the host, the _start_
 field should be populated with '1'.
 	# echo 1 > controllers/51000000.pcie_ep/start
 2. RootComplex Device
 2.1 lspci Output
 Note that the devices listed here correspond to the value populated in 1.4 above
 	00:00.0 PCI bridge: Texas Instruments Device 8888 (rev 01)
 	01:00.0 Unassigned class [ff00]: Texas Instruments Device b500
 2.2 Using Endpoint Test function Device
 pcitest.sh added in tools/pci/ can be used to run all the default PCI endpoint
 tests. Before pcitest.sh can be used pcitest.c should be compiled using the
 following commands.
 	cd <kernel-dir>
 	make headers_install ARCH=arm
 	arm-linux-gnueabihf-gcc -Iusr/include tools/pci/pcitest.c -o pcitest
 	cp pcitest  <rootfs>/usr/sbin/
 	cp tools/pci/pcitest.sh <rootfs>
 2.2.1 pcitest.sh Output
 	# ./pcitest.sh
 	BAR tests
 	BAR0:           OKAY
 	BAR1:           OKAY
 	BAR2:           OKAY
 	BAR3:           OKAY
 	BAR4:           NOT OKAY
 	BAR5:           NOT OKAY
 	Interrupt tests
 	LEGACY IRQ:     NOT OKAY
 	MSI1:           OKAY
 	MSI2:           OKAY
 	MSI3:           OKAY
 	MSI4:           OKAY
 	MSI5:           OKAY
 	MSI6:           OKAY
 	MSI7:           OKAY
 	MSI8:           OKAY
 	MSI9:           OKAY
 	MSI10:          OKAY
 	MSI11:          OKAY
 	MSI12:          OKAY
 	MSI13:          OKAY
 	MSI14:          OKAY
 	MSI15:          OKAY
 	MSI16:          OKAY
 	MSI17:          NOT OKAY
 	MSI18:          NOT OKAY
 	MSI19:          NOT OKAY
 	MSI20:          NOT OKAY
 	MSI21:          NOT OKAY
 	MSI22:          NOT OKAY
 	MSI23:          NOT OKAY
 	MSI24:          NOT OKAY
 	MSI25:          NOT OKAY
 	MSI26:          NOT OKAY
 	MSI27:          NOT OKAY
 	MSI28:          NOT OKAY
 	MSI29:          NOT OKAY
 	MSI30:          NOT OKAY
 	MSI31:          NOT OKAY
 	MSI32:          NOT OKAY
 	Read Tests
 	READ (      1 bytes):           OKAY
 	READ (   1024 bytes):           OKAY
 	READ (   1025 bytes):           OKAY
 	READ (1024000 bytes):           OKAY
 	READ (1024001 bytes):           OKAY
 	Write Tests
 	WRITE (      1 bytes):          OKAY
 	WRITE (   1024 bytes):          OKAY
 	WRITE (   1025 bytes):          OKAY
 	WRITE (1024000 bytes):          OKAY
 	WRITE (1024001 bytes):          OKAY
 	Copy Tests
 	COPY (      1 bytes):           OKAY
 	COPY (   1024 bytes):           OKAY
 	COPY (   1025 bytes):           OKAY
 	COPY (1024000 bytes):           OKAY
 	COPY (1024001 bytes):           OKAY
--- a/Documentation/PCI/pci-error-recovery.txt
+++ b/Documentation/PCI/pci-error-recovery.txt
@@ -11,7 +11,7 @@
 Many PCI bus controllers are able to detect a variety of hardware
 PCI errors on the bus, such as parity errors on the data and address
-busses, as well as SERR and PERR errors.  Some of the more advanced
+buses, as well as SERR and PERR errors.  Some of the more advanced
 chipsets are able to deal with these errors; these include PCI-E chipsets,
 and the PCI-host bridges found on IBM Power4, Power5 and Power6-based
 pSeries boxes. A typical action taken is to disconnect the affected device,
@@ -173,7 +173,7 @@ is STEP 6 (Permanent Failure).
 >>> a value of 0xff on read, and writes will be dropped. If more than
 >>> EEH_MAX_FAILS I/O's are attempted to a frozen adapter, EEH
 >>> assumes that the device driver has gone into an infinite loop
->>> and prints an error to syslog.  A reboot is then required to 
+>>> and prints an error to syslog.  A reboot is then required to
 >>> get the device working again.
 STEP 2: MMIO Enabled
@@ -231,14 +231,14 @@ proceeds to STEP 4 (Slot Reset)
 STEP 3: Link Reset
 ------------------
 The platform resets the link.  This is a PCI-Express specific step
-and is done whenever a non-fatal error has been detected that can be
+and is done whenever a fatal error has been detected that can be
 "solved" by resetting the link.
 STEP 4: Slot Reset
 ------------------
 In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
-the platform will perform a slot reset on the requesting PCI device(s). 
+platform will perform a slot reset on the requesting PCI device(s).
 The actual steps taken by a platform to perform a slot reset
 will be platform-dependent. Upon completion of slot reset, the
 platform will call the device slot_reset() callback.
@@ -258,7 +258,7 @@ configuration registers to initialize to their default conditions.
 For most PCI devices, a soft reset will be sufficient for recovery.
 Optional fundamental reset is provided to support a limited number
-of PCI Express PCI devices  for which a soft reset is not sufficient
+of PCI Express devices for which a soft reset is not sufficient
 for recovery.
 If the platform supports PCI hotplug, then the reset might be
@@ -303,7 +303,7 @@ driver performs device init only from PCI function 0:
 		Same as above.
 Drivers for PCI Express cards that require a fundamental reset must
-set the needs_freset bit in the pci_dev structure in their probe function.  
+set the needs_freset bit in the pci_dev structure in their probe function.
 For example, the QLogic qla2xxx driver sets the needs_freset bit for certain
 PCI card types:
--- a/Documentation/PCI/pci-iov-howto.txt
+++ b/Documentation/PCI/pci-iov-howto.txt
@@ -68,6 +68,18 @@ To disable SR-IOV capability:
 	echo  0 > \
        /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_numvfs
 To enable auto probing VFs by a compatible driver on the host, run
 command below before enabling SR-IOV capabilities. This is the
 default behavior.
 	echo 1 > \
        /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_drivers_autoprobe
 To disable auto probing VFs by a compatible driver on the host, run
 command below before enabling SR-IOV capabilities. Updating this
 entry will not affect VFs which are already probed.
 	echo  0 > \
        /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_drivers_autoprobe
 3.2 Usage example
 Following piece of code illustrates the usage of the SR-IOV API.
--- a/Documentation/RCU/00-INDEX
+++ b/Documentation/RCU/00-INDEX
@@ -17,7 +17,7 @@ rcu_dereference.txt
 rcubarrier.txt
 	- RCU and Unloadable Modules
 rculist_nulls.txt
-	- RCU list primitives for use with SLAB_DESTROY_BY_RCU
+	- RCU list primitives for use with SLAB_TYPESAFE_BY_RCU
 rcuref.txt
 	- Reference-count design for elements of lists/arrays protected by RCU
 rcu.txt
--- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html
+++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html
@@ -19,6 +19,8 @@ to each other.
 	The <tt>rcu_state</tt> Structure</a>
 <li>	<a href="#The rcu_node Structure">
 	The <tt>rcu_node</tt> Structure</a>
 <li>	<a href="#The rcu_segcblist Structure">
 	The <tt>rcu_segcblist</tt> Structure</a>
 <li>	<a href="#The rcu_data Structure">
 	The <tt>rcu_data</tt> Structure</a>
 <li>	<a href="#The rcu_dynticks Structure">
@@ -841,6 +843,134 @@ for lockdep lock-class names.
 Finally, lines&nbsp;64-66 produce an error if the maximum number of
 CPUs is too large for the specified fanout.
 <h3><a name="The rcu_segcblist Structure">
 The <tt>rcu_segcblist</tt> Structure</a></h3>
 The <tt>rcu_segcblist</tt> structure maintains a segmented list of
 callbacks as follows:
 <pre>
 1 #define RCU_DONE_TAIL        0
 2 #define RCU_WAIT_TAIL        1
 3 #define RCU_NEXT_READY_TAIL  2
 4 #define RCU_NEXT_TAIL        3
 5 #define RCU_CBLIST_NSEGS     4
 6
 7 struct rcu_segcblist {
 8   struct rcu_head *head;
 9   struct rcu_head **tails[RCU_CBLIST_NSEGS];
 10   unsigned long gp_seq[RCU_CBLIST_NSEGS];
 11   long len;
 12   long len_lazy;
 13 };
 </pre>
 <p>
 The segments are as follows:
 <ol>
 <li>	<tt>RCU_DONE_TAIL</tt>: Callbacks whose grace periods have elapsed.
 	These callbacks are ready to be invoked.
 <li>	<tt>RCU_WAIT_TAIL</tt>: Callbacks that are waiting for the
 	current grace period.
 	Note that different CPUs can have different ideas about which
 	grace period is current, hence the <tt>-&gt;gp_seq</tt> field.
 <li>	<tt>RCU_NEXT_READY_TAIL</tt>: Callbacks waiting for the next
 	grace period to start.
 <li>	<tt>RCU_NEXT_TAIL</tt>: Callbacks that have not yet been
 	associated with a grace period.
 </ol>
 <p>
 The <tt>-&gt;head</tt> pointer references the first callback or
 is <tt>NULL</tt> if the list contains no callbacks (which is
 <i>not</i> the same as being empty).
 Each element of the <tt>-&gt;tails[]</tt> array references the
 <tt>-&gt;next</tt> pointer of the last callback in the corresponding
 segment of the list, or the list's <tt>-&gt;head</tt> pointer if
 that segment and all previous segments are empty.
 If the corresponding segment is empty but some previous segment is
 not empty, then the array element is identical to its predecessor.
 Older callbacks are closer to the head of the list, and new callbacks
 are added at the tail.
 This relationship between the <tt>-&gt;head</tt> pointer, the
 <tt>-&gt;tails[]</tt> array, and the callbacks is shown in this
 diagram:
 </p><p><img src="nxtlist.svg" alt="nxtlist.svg" width="40%">
 </p><p>In this figure, the <tt>-&gt;head</tt> pointer references the
 first
 RCU callback in the list.
 The <tt>-&gt;tails[RCU_DONE_TAIL]</tt> array element references
 the <tt>-&gt;head</tt> pointer itself, indicating that none
 of the callbacks is ready to invoke.
 The <tt>-&gt;tails[RCU_WAIT_TAIL]</tt> array element references callback
 CB&nbsp;2's <tt>-&gt;next</tt> pointer, which indicates that
 CB&nbsp;1 and CB&nbsp;2 are both waiting on the current grace period,
 give or take possible disagreements about exactly which grace period
 is the current one.
 The <tt>-&gt;tails[RCU_NEXT_READY_TAIL]</tt> array element
 references the same RCU callback that <tt>-&gt;tails[RCU_WAIT_TAIL]</tt>
 does, which indicates that there are no callbacks waiting on the next
 RCU grace period.
 The <tt>-&gt;tails[RCU_NEXT_TAIL]</tt> array element references
 CB&nbsp;4's <tt>-&gt;next</tt> pointer, indicating that all the
 remaining RCU callbacks have not yet been assigned to an RCU grace
 period.
 Note that the <tt>-&gt;tails[RCU_NEXT_TAIL]</tt> array element
 always references the last RCU callback's <tt>-&gt;next</tt> pointer
 unless the callback list is empty, in which case it references
 the <tt>-&gt;head</tt> pointer.
 <p>
 There is one additional important special case for the
 <tt>-&gt;tails[RCU_NEXT_TAIL]</tt> array element: It can be <tt>NULL</tt>
 when this list is <i>disabled</i>.
 Lists are disabled when the corresponding CPU is offline or when
 the corresponding CPU's callbacks are offloaded to a kthread,
 both of which are described elsewhere.
 </p><p>CPUs advance their callbacks from the
 <tt>RCU_NEXT_TAIL</tt> to the <tt>RCU_NEXT_READY_TAIL</tt> to the
 <tt>RCU_WAIT_TAIL</tt> to the <tt>RCU_DONE_TAIL</tt> list segments
 as grace periods advance.
 </p><p>The <tt>-&gt;gp_seq[]</tt> array records grace-period
 numbers corresponding to the list segments.
 This is what allows different CPUs to have different ideas as to
 which is the current grace period while still avoiding premature
 invocation of their callbacks.
 In particular, this allows CPUs that go idle for extended periods
 to determine which of their callbacks are ready to be invoked after
 reawakening.
 </p><p>The <tt>-&gt;len</tt> counter contains the number of
 callbacks in <tt>-&gt;head</tt>, and the
 <tt>-&gt;len_lazy</tt> contains the number of those callbacks that
 are known to only free memory, and whose invocation can therefore
 be safely deferred.
 <p><b>Important note</b>: It is the <tt>-&gt;len</tt> field that
 determines whether or not there are callbacks associated with
 this <tt>rcu_segcblist</tt> structure, <i>not</i> the <tt>-&gt;head</tt>
 pointer.
 The reason for this is that all the ready-to-invoke callbacks
 (that is, those in the <tt>RCU_DONE_TAIL</tt> segment) are extracted
 all at once at callback-invocation time.
 If callback invocation must be postponed, for example, because a
 high-priority process just woke up on this CPU, then the remaining
 callbacks are placed back on the <tt>RCU_DONE_TAIL</tt> segment.
 Either way, the <tt>-&gt;len</tt> and <tt>-&gt;len_lazy</tt> counts
 are adjusted after the corresponding callbacks have been invoked, and so
 again it is the <tt>-&gt;len</tt> count that accurately reflects whether
 or not there are callbacks associated with this <tt>rcu_segcblist</tt>
 structure.
 Of course, off-CPU sampling of the <tt>-&gt;len</tt> count requires
 the use of appropriate synchronization, for example, memory barriers.
 This synchronization can be a bit subtle, particularly in the case
 of <tt>rcu_barrier()</tt>.
 <h3><a name="The rcu_data Structure">
 The <tt>rcu_data</tt> Structure</a></h3>
@@ -983,62 +1113,18 @@ choice.
 as follows:
 <pre>
- 1 struct rcu_head *nxtlist;
+ 1 struct rcu_segcblist cblist;
- 2 struct rcu_head **nxttail[RCU_NEXT_SIZE];
+ 2 long qlen_last_fqs_check;
- 3 unsigned long nxtcompleted[RCU_NEXT_SIZE];
+ 3 unsigned long n_cbs_invoked;
- 4 long qlen_lazy;
+ 4 unsigned long n_nocbs_invoked;
- 5 long qlen;
+ 5 unsigned long n_cbs_orphaned;
- 6 long qlen_last_fqs_check;
+ 6 unsigned long n_cbs_adopted;
 7 unsigned long n_force_qs_snap;
- 8 unsigned long n_cbs_invoked;
+ 8 long blimit;
 9 unsigned long n_cbs_orphaned;
 10 unsigned long n_cbs_adopted;
 11 long blimit;
 </pre>
-<p>The <tt>-&gt;nxtlist</tt> pointer and the
+<p>The <tt>-&gt;cblist</tt> structure is the segmented callback list
-<tt>-&gt;nxttail[]</tt> array form a four-segment list with
+described earlier.
 older callbacks near the head and newer ones near the tail.
 Each segment contains callbacks with the corresponding relationship
 to the current grace period.
 The pointer out of the end of each of the four segments is referenced
 by the element of the <tt>-&gt;nxttail[]</tt> array indexed by
 <tt>RCU_DONE_TAIL</tt> (for callbacks handled by a prior grace period),
 <tt>RCU_WAIT_TAIL</tt> (for callbacks waiting on the current grace period),
 <tt>RCU_NEXT_READY_TAIL</tt> (for callbacks that will wait on the next
 grace period), and
 <tt>RCU_NEXT_TAIL</tt> (for callbacks that are not yet associated
 with a specific grace period)
 respectively, as shown in the following figure.
 </p><p><img src="nxtlist.svg" alt="nxtlist.svg" width="40%">
 </p><p>In this figure, the <tt>-&gt;nxtlist</tt> pointer references the
 first
 RCU callback in the list.
 The <tt>-&gt;nxttail[RCU_DONE_TAIL]</tt> array element references
 the <tt>-&gt;nxtlist</tt> pointer itself, indicating that none
 of the callbacks is ready to invoke.
 The <tt>-&gt;nxttail[RCU_WAIT_TAIL]</tt> array element references callback
 CB&nbsp;2's <tt>-&gt;next</tt> pointer, which indicates that
 CB&nbsp;1 and CB&nbsp;2 are both waiting on the current grace period.
 The <tt>-&gt;nxttail[RCU_NEXT_READY_TAIL]</tt> array element
 references the same RCU callback that <tt>-&gt;nxttail[RCU_WAIT_TAIL]</tt>
 does, which indicates that there are no callbacks waiting on the next
 RCU grace period.
 The <tt>-&gt;nxttail[RCU_NEXT_TAIL]</tt> array element references
 CB&nbsp;4's <tt>-&gt;next</tt> pointer, indicating that all the
 remaining RCU callbacks have not yet been assigned to an RCU grace
 period.
 Note that the <tt>-&gt;nxttail[RCU_NEXT_TAIL]</tt> array element
 always references the last RCU callback's <tt>-&gt;next</tt> pointer
 unless the callback list is empty, in which case it references
 the <tt>-&gt;nxtlist</tt> pointer.
 </p><p>CPUs advance their callbacks from the
 <tt>RCU_NEXT_TAIL</tt> to the <tt>RCU_NEXT_READY_TAIL</tt> to the
 <tt>RCU_WAIT_TAIL</tt> to the <tt>RCU_DONE_TAIL</tt> list segments
 as grace periods advance.
 The CPU advances the callbacks in its <tt>rcu_data</tt> structure
 whenever it notices that another RCU grace period has completed.
 The CPU detects the completion of an RCU grace period by noticing
@@ -1049,16 +1135,7 @@ Recall that each <tt>rcu_node</tt> structure's
 <tt>-&gt;completed</tt> field is updated at the end of each
 grace period.
-</p><p>The <tt>-&gt;nxtcompleted[]</tt> array records grace-period
+<p>
 numbers corresponding to the list segments.
 This allows CPUs that go idle for extended periods to determine
 which of their callbacks are ready to be invoked after reawakening.
 </p><p>The <tt>-&gt;qlen</tt> counter contains the number of
 callbacks in <tt>-&gt;nxtlist</tt>, and the
 <tt>-&gt;qlen_lazy</tt> contains the number of those callbacks that
 are known to only free memory, and whose invocation can therefore
 be safely deferred.
 The <tt>-&gt;qlen_last_fqs_check</tt> and
 <tt>-&gt;n_force_qs_snap</tt> coordinate the forcing of quiescent
 states from <tt>call_rcu()</tt> and friends when callback
@@ -1069,6 +1146,10 @@ lists grow excessively long.
 fields count the number of callbacks invoked,
 sent to other CPUs when this CPU goes offline,
 and received from other CPUs when those other CPUs go offline.
 The <tt>-&gt;n_nocbs_invoked</tt> is used when the CPU's callbacks
 are offloaded to a kthread.
 <p>
 Finally, the <tt>-&gt;blimit</tt> counter is the maximum number of
 RCU callbacks that may be invoked at a given time.
@@ -1104,6 +1185,9 @@ Its fields are as follows:
  1   int dynticks_nesting;
  2   int dynticks_nmi_nesting;
  3   atomic_t dynticks;
  4   bool rcu_need_heavy_qs;
  5   unsigned long rcu_qs_ctr;
  6   bool rcu_urgent_qs;
 </pre>
 <p>The <tt>-&gt;dynticks_nesting</tt> field counts the
@@ -1117,11 +1201,32 @@ NMIs are counted by the <tt>-&gt;dynticks_nmi_nesting</tt>
 field, except that NMIs that interrupt non-dyntick-idle execution
 are not counted.
-</p><p>Finally, the <tt>-&gt;dynticks</tt> field counts the corresponding
+</p><p>The <tt>-&gt;dynticks</tt> field counts the corresponding
 CPU's transitions to and from dyntick-idle mode, so that this counter
 has an even value when the CPU is in dyntick-idle mode and an odd
 value otherwise.
 </p><p>The <tt>-&gt;rcu_need_heavy_qs</tt> field is used
 to record the fact that the RCU core code would really like to
 see a quiescent state from the corresponding CPU, so much so that
 it is willing to call for heavy-weight dyntick-counter operations.
 This flag is checked by RCU's context-switch and <tt>cond_resched()</tt>
 code, which provide a momentary idle sojourn in response.
 </p><p>The <tt>-&gt;rcu_qs_ctr</tt> field is used to record
 quiescent states from <tt>cond_resched()</tt>.
 Because <tt>cond_resched()</tt> can execute quite frequently, this
 must be quite lightweight, as in a non-atomic increment of this
 per-CPU field.
 </p><p>Finally, the <tt>-&gt;rcu_urgent_qs</tt> field is used to record
 the fact that the RCU core code would really like to see a quiescent
 state from the corresponding CPU, with the various other fields indicating
 just how badly RCU wants this quiescent state.
 This flag is checked by RCU's context-switch and <tt>cond_resched()</tt>
 code, which, if nothing else, non-atomically increment <tt>-&gt;rcu_qs_ctr</tt>
 in response.
 <table>
 <tr><th>&nbsp;</th></tr>
 <tr><th align="left">Quick Quiz:</th></tr>
--- a/Documentation/RCU/Design/Data-Structures/nxtlist.svg
+++ b/Documentation/RCU/Design/Data-Structures/nxtlist.svg
@@ -19,7 +19,7 @@
   id="svg2"
   version="1.1"
   inkscape:version="0.48.4 r9939"
-   sodipodi:docname="nxtlist.fig">
+   sodipodi:docname="segcblist.svg">
  <metadata
     id="metadata94">
    <rdf:RDF>
@@ -28,7 +28,7 @@
        <dc:format>image/svg+xml</dc:format>
        <dc:type
           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-        <dc:title></dc:title>
+        <dc:title />
      </cc:Work>
    </rdf:RDF>
  </metadata>
@@ -241,61 +241,51 @@
       xml:space="preserve"
       x="225"
       y="675"
       fill="#000000"
       font-family="Courier"
       font-style="normal"
       font-weight="bold"
       font-size="324"
-       text-anchor="start"
+       id="text64"
-       id="text64">nxtlist</text>
+       style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;head</text>
    <!-- Text -->
    <text
       xml:space="preserve"
       x="225"
       y="1800"
       fill="#000000"
       font-family="Courier"
       font-style="normal"
       font-weight="bold"
       font-size="324"
-       text-anchor="start"
+       id="text66"
-       id="text66">nxttail[RCU_DONE_TAIL]</text>
+       style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;tails[RCU_DONE_TAIL]</text>
    <!-- Text -->
    <text
       xml:space="preserve"
       x="225"
       y="2925"
       fill="#000000"
       font-family="Courier"
       font-style="normal"
       font-weight="bold"
       font-size="324"
-       text-anchor="start"
+       id="text68"
-       id="text68">nxttail[RCU_WAIT_TAIL]</text>
+       style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;tails[RCU_WAIT_TAIL]</text>
    <!-- Text -->
    <text
       xml:space="preserve"
       x="225"
       y="4050"
       fill="#000000"
       font-family="Courier"
       font-style="normal"
       font-weight="bold"
       font-size="324"
-       text-anchor="start"
+       id="text70"
-       id="text70">nxttail[RCU_NEXT_READY_TAIL]</text>
+       style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;tails[RCU_NEXT_READY_TAIL]</text>
    <!-- Text -->
    <text
       xml:space="preserve"
       x="225"
       y="5175"
       fill="#000000"
       font-family="Courier"
       font-style="normal"
       font-weight="bold"
       font-size="324"
-       text-anchor="start"
+       id="text72"
-       id="text72">nxttail[RCU_NEXT_TAIL]</text>
+       style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;tails[RCU_NEXT_TAIL]</text>
    <!-- Text -->
    <text
       xml:space="preserve"
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html
@@ -284,6 +284,7 @@ Expedited Grace Period Refinements</a></h2>
 	Funnel locking and wait/wakeup</a>.
 <li>	<a href="#Use of Workqueues">Use of Workqueues</a>.
 <li>	<a href="#Stall Warnings">Stall warnings</a>.
 <li>	<a href="#Mid-Boot Operation">Mid-boot operation</a>.
 </ol>
 <h3><a name="Idle-CPU Checks">Idle-CPU Checks</a></h3>
@@ -524,7 +525,7 @@ their grace periods and carrying out their wakeups.
 In earlier implementations, the task requesting the expedited
 grace period also drove it to completion.
 This straightforward approach had the disadvantage of needing to
-account for signals sent to user tasks,
+account for POSIX signals sent to user tasks,
 so more recent implementations use the Linux kernel's
 <a href="https://www.kernel.org/doc/Documentation/workqueue.txt">workqueues</a>.
@@ -533,8 +534,8 @@ The requesting task still does counter snapshotting and funnel-lock
 processing, but the task reaching the top of the funnel lock
 does a <tt>schedule_work()</tt> (from <tt>_synchronize_rcu_expedited()</tt>)
 so that a workqueue kthread does the actual grace-period processing.
-Because workqueue kthreads do not accept signals, grace-period-wait
+Because workqueue kthreads do not accept POSIX signals, grace-period-wait
-processing need not allow for signals.
+processing need not allow for POSIX signals.
 In addition, this approach allows wakeups for the previous expedited
 grace period to be overlapped with processing for the next expedited
@@ -586,6 +587,46 @@ blocking the current grace period are printed.
 Each stall warning results in another pass through the loop, but the
 second and subsequent passes use longer stall times.
 <h3><a name="Mid-Boot Operation">Mid-boot operation</a></h3>
 <p>
 The use of workqueues has the advantage that the expedited
 grace-period code need not worry about POSIX signals.
 Unfortunately, it has the
 corresponding disadvantage that workqueues cannot be used until
 they are initialized, which does not happen until some time after
 the scheduler spawns the first task.
 Given that there are parts of the kernel that really do want to
 execute grace periods during this mid-boot &ldquo;dead zone&rdquo;,
 expedited grace periods must do something else during this time.
 <p>
 What they do is to fall back to the old practice of requiring that the
 requesting task drive the expedited grace period, as was the case
 before the use of workqueues.
 However, the requesting task is only required to drive the grace period
 during the mid-boot dead zone.
 Before mid-boot, a synchronous grace period is a no-op.
 Some time after mid-boot, workqueues are used.
 <p>
 Non-expedited non-SRCU synchronous grace periods must also operate
 normally during mid-boot.
 This is handled by causing non-expedited grace periods to take the
 expedited code path during mid-boot.
 <p>
 The current code assumes that there are no POSIX signals during
 the mid-boot dead zone.
 However, if an overwhelming need for POSIX signals somehow arises,
 appropriate adjustments can be made to the expedited stall-warning code.
 One such adjustment would reinstate the pre-workqueue stall-warning
 checks, but only during the mid-boot dead zone.
 <p>
 With this refinement, synchronous grace periods can now be used from
 task context pretty much any time during the life of the kernel.
 <h3><a name="Summary">
 Summary</a></h3>
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -659,8 +659,9 @@ systems with more than one CPU:
 	In other words, a given instance of <tt>synchronize_rcu()</tt>
 	can avoid waiting on a given RCU read-side critical section only
 	if it can prove that <tt>synchronize_rcu()</tt> started first.
 	</font>
-	<p>
+	<p><font color="ffffff">
 	A related question is &ldquo;When <tt>rcu_read_lock()</tt>
 	doesn't generate any code, why does it matter how it relates
 	to a grace period?&rdquo;
@@ -675,8 +676,9 @@ systems with more than one CPU:
 	within the critical section, in which case none of the accesses
 	within the critical section may observe the effects of any
 	access following the grace period.
 	</font>
-	<p>
+	<p><font color="ffffff">
 	As of late 2016, mathematical models of RCU take this
 	viewpoint, for example, see slides&nbsp;62 and&nbsp;63
 	of the
@@ -1616,8 +1618,8 @@ CPUs should at least make reasonable forward progress.
 In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt>
 is permitted to impose modest degradation of real-time latency
 on non-idle online CPUs.
-That said, it will likely be necessary to take further steps to reduce this
+Here, &ldquo;modest&rdquo; means roughly the same latency
-degradation, hopefully to roughly that of a scheduling-clock interrupt.
+degradation as a scheduling-clock interrupt.
 <p>
 There are a number of situations where even
@@ -1913,12 +1915,9 @@ This requirement is another factor driving batching of grace periods,
 but it is also the driving force behind the checks for large numbers
 of queued RCU callbacks in the <tt>call_rcu()</tt> code path.
 Finally, high update rates should not delay RCU read-side critical
-sections, although some read-side delays can occur when using
+sections, although some small read-side delays can occur when using
 <tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use
-of <tt>try_stop_cpus()</tt>.
+of <tt>smp_call_function_single()</tt>.
 (In the future, <tt>synchronize_rcu_expedited()</tt> will be
 converted to use lighter-weight inter-processor interrupts (IPIs),
 but this will still disturb readers, though to a much smaller degree.)
 <p>
 Although all three of these corner cases were understood in the early
@@ -2154,7 +2153,8 @@ as will <tt>rcu_assign_pointer()</tt>.
 <p>
 Although <tt>call_rcu()</tt> may be invoked at any
 time during boot, callbacks are not guaranteed to be invoked until after
-the scheduler is fully up and running.
+all of RCU's kthreads have been spawned, which occurs at
 <tt>early_initcall()</tt> time.
 This delay in callback invocation is due to the fact that RCU does not
 invoke callbacks until it is fully initialized, and this full initialization
 cannot occur until after the scheduler has initialized itself to the
@@ -2167,8 +2167,10 @@ on what operations those callbacks could invoke.
 Perhaps surprisingly, <tt>synchronize_rcu()</tt>,
 <a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a>
 (<a href="#Bottom-Half Flavor">discussed below</a>),
-and
+<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>,
-<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>
+<tt>synchronize_rcu_expedited()</tt>,
 <tt>synchronize_rcu_bh_expedited()</tt>, and
 <tt>synchronize_sched_expedited()</tt>
 will all operate normally
 during very early boot, the reason being that there is only one CPU
 and preemption is disabled.
@@ -2178,45 +2180,59 @@ state and thus a grace period, so the early-boot implementation can
 be a no-op.
 <p>
-Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt>
+However, once the scheduler has spawned its first kthread, this early
-continue to operate normally through the remainder of boot, courtesy
+boot trick fails for <tt>synchronize_rcu()</tt> (as well as for
-of the fact that preemption is disabled across their RCU read-side
+<tt>synchronize_rcu_expedited()</tt>) in <tt>CONFIG_PREEMPT=y</tt>
-critical sections and also courtesy of the fact that there is still
+kernels.
-only one CPU.
+The reason is that an RCU read-side critical section might be preempted,
-However, once the scheduler starts initializing, preemption is enabled.
+which means that a subsequent <tt>synchronize_rcu()</tt> really does have
-There is still only a single CPU, but the fact that preemption is enabled
+to wait for something, as opposed to simply returning immediately.
-means that the no-op implementation of <tt>synchronize_rcu()</tt> no
+Unfortunately, <tt>synchronize_rcu()</tt> can't do this until all of
-longer works in <tt>CONFIG_PREEMPT=y</tt> kernels.
+its kthreads are spawned, which doesn't happen until some time during
-Therefore, as soon as the scheduler starts initializing, the early-boot
+<tt>early_initcall()</tt> time.
-fastpath is disabled.
+But this is no excuse:  RCU is nevertheless required to correctly handle
-This means that <tt>synchronize_rcu()</tt> switches to its runtime
+synchronous grace periods during this time period.
-mode of operation where it posts callbacks, which in turn means that
+Once all of its kthreads are up and running, RCU starts running
-any call to <tt>synchronize_rcu()</tt> will block until the corresponding
+normally.
 callback is invoked.
 Unfortunately, the callback cannot be invoked until RCU's runtime
 grace-period machinery is up and running, which cannot happen until
 the scheduler has initialized itself sufficiently to allow RCU's
 kthreads to be spawned.
 Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler
 initialization can result in deadlock.
 <table>
 <tr><th>&nbsp;</th></tr>
 <tr><th align="left">Quick Quiz:</th></tr>
 <tr><td>
-	So what happens with <tt>synchronize_rcu()</tt> during
+	How can RCU possibly handle grace periods before all of its
-	scheduler initialization for <tt>CONFIG_PREEMPT=n</tt>
+	kthreads have been spawned???
 	kernels?
 </td></tr>
 <tr><th align="left">Answer:</th></tr>
 <tr><td bgcolor="#ffffff"><font color="ffffff">
-	In <tt>CONFIG_PREEMPT=n</tt> kernel, <tt>synchronize_rcu()</tt>
+	Very carefully!
-	maps directly to <tt>synchronize_sched()</tt>.
+	</font>
-	Therefore, <tt>synchronize_rcu()</tt> works normally throughout
+
-	boot in <tt>CONFIG_PREEMPT=n</tt> kernels.
+	<p><font color="ffffff">
-	However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels,
+	During the &ldquo;dead zone&rdquo; between the time that the
-	so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt>
+	scheduler spawns the first task and the time that all of RCU's
-	during scheduler initialization.
+	kthreads have been spawned, all synchronous grace periods are
 	handled by the expedited grace-period mechanism.
 	At runtime, this expedited mechanism relies on workqueues, but
 	during the dead zone the requesting task itself drives the
 	desired expedited grace period.
 	Because dead-zone execution takes place within task context,
 	everything works.
 	Once the dead zone ends, expedited grace periods go back to
 	using workqueues, as is required to avoid problems that would
 	otherwise occur when a user task received a POSIX signal while
 	driving an expedited grace period.
 	</font>
 	<p><font color="ffffff">
 	And yes, this does mean that it is unhelpful to send POSIX
 	signals to random tasks between the time that the scheduler
 	spawns its first kthread and the time that RCU's kthreads
 	have all been spawned.
 	If there ever turns out to be a good reason for sending POSIX
 	signals during that time, appropriate adjustments will be made.
 	(If it turns out that POSIX signals are sent during this time for
 	no good reason, other adjustments will be made, appropriate
 	or otherwise.)
 </font></td></tr>
 <tr><td>&nbsp;</td></tr>
 </table>
@@ -2295,12 +2311,61 @@ situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU.
 The need for <tt>rcu_barrier()</tt> for module unloading became
 apparent later.
 <p>
 <b>Important note</b>: The <tt>rcu_barrier()</tt> function is not,
 repeat, <i>not</i>, obligated to wait for a grace period.
 It is instead only required to wait for RCU callbacks that have
 already been posted.
 Therefore, if there are no RCU callbacks posted anywhere in the system,
 <tt>rcu_barrier()</tt> is within its rights to return immediately.
 Even if there are callbacks posted, <tt>rcu_barrier()</tt> does not
 necessarily need to wait for a grace period.
 <table>
 <tr><th>&nbsp;</th></tr>
 <tr><th align="left">Quick Quiz:</th></tr>
 <tr><td>
 	Wait a minute!
 	Each RCU callback must wait for a grace period to complete,
 	and <tt>rcu_barrier()</tt> must wait for each pre-existing
 	callback to be invoked.
 	Doesn't <tt>rcu_barrier()</tt> therefore need to wait for
 	a full grace period if there is even one callback posted anywhere
 	in the system?
 </td></tr>
 <tr><th align="left">Answer:</th></tr>
 <tr><td bgcolor="#ffffff"><font color="ffffff">
 	Absolutely not!!!
 	</font>
 	<p><font color="ffffff">
 	Yes, each RCU callback must wait for a grace period to complete,
 	but it might well be partly (or even completely) finished waiting
 	by the time <tt>rcu_barrier()</tt> is invoked.
 	In that case, <tt>rcu_barrier()</tt> need only wait for the
 	remaining portion of the grace period to elapse.
 	So even if there are quite a few callbacks posted,
 	<tt>rcu_barrier()</tt> might well return quite quickly.
 	</font>
 	<p><font color="ffffff">
 	So if you need to wait for a grace period as well as for all
 	pre-existing callbacks, you will need to invoke both
 	<tt>synchronize_rcu()</tt> and <tt>rcu_barrier()</tt>.
 	If latency is a concern, you can always use workqueues
 	to invoke them concurrently.
 </font></td></tr>
 <tr><td>&nbsp;</td></tr>
 </table>
 <h3><a name="Hotplug CPU">Hotplug CPU</a></h3>
 <p>
 The Linux kernel supports CPU hotplug, which means that CPUs
 can come and go.
-It is of course illegal to use any RCU API member from an offline CPU.
+It is of course illegal to use any RCU API member from an offline CPU,
 with the exception of <a href="#Sleepable RCU">SRCU</a> read-side
 critical sections.
 This requirement was present from day one in DYNIX/ptx, but
 on the other hand, the Linux kernel's CPU-hotplug implementation
 is &ldquo;interesting.&rdquo;
@@ -2310,19 +2375,18 @@ The Linux-kernel CPU-hotplug implementation has notifiers that
 are used to allow the various kernel subsystems (including RCU)
 to respond appropriately to a given CPU-hotplug operation.
 Most RCU operations may be invoked from CPU-hotplug notifiers,
-including even normal synchronous grace-period operations
+including even synchronous grace-period operations such as
-such as <tt>synchronize_rcu()</tt>.
+<tt>synchronize_rcu()</tt> and <tt>synchronize_rcu_expedited()</tt>.
 However, expedited grace-period operations such as
 <tt>synchronize_rcu_expedited()</tt> are not supported,
 due to the fact that current implementations block CPU-hotplug
 operations, which could result in deadlock.
 <p>
-In addition, all-callback-wait operations such as
+However, all-callback-wait operations such as
 <tt>rcu_barrier()</tt> are also not supported, due to the
 fact that there are phases of CPU-hotplug operations where
 the outgoing CPU's callbacks will not be invoked until after
 the CPU-hotplug operation ends, which could also result in deadlock.
 Furthermore, <tt>rcu_barrier()</tt> blocks CPU-hotplug operations
 during its execution, which results in another type of deadlock
 when invoked from a CPU-hotplug notifier.
 <h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3>
@@ -2863,6 +2927,27 @@ It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt>
 API, which, in combination with <tt>srcu_read_unlock()</tt>,
 guarantees a full memory barrier.
 <p>
 Also unlike other RCU flavors, SRCU's callbacks-wait function
 <tt>srcu_barrier()</tt> may be invoked from CPU-hotplug notifiers,
 though this is not necessarily a good idea.
 The reason that this is possible is that SRCU is insensitive
 to whether or not a CPU is online, which means that <tt>srcu_barrier()</tt>
 need not exclude CPU-hotplug operations.
 <p>
 As of v4.12, SRCU's callbacks are maintained per-CPU, eliminating
 a locking bottleneck present in prior kernel versions.
 Although this will allow users to put much heavier stress on
 <tt>call_srcu()</tt>, it is important to note that SRCU does not
 yet take any special steps to deal with callback flooding.
 So if you are posting (say) 10,000 SRCU callbacks per second per CPU,
 you are probably totally OK, but if you intend to post (say) 1,000,000
 SRCU callbacks per second per CPU, please run some tests first.
 SRCU just might need a few adjustments to deal with that sort of load.
 Of course, your mileage may vary based on the speed of your CPUs and
 the size of your memory.
 <p>
 The
 <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a>
@@ -3021,8 +3106,8 @@ to do some redesign to avoid this scalability problem.
 <p>
 RCU disables CPU hotplug in a few places, perhaps most notably in the
-expedited grace-period and <tt>rcu_barrier()</tt> operations.
+<tt>rcu_barrier()</tt> operations.
-If there is a strong reason to use expedited grace periods in CPU-hotplug
+If there is a strong reason to use <tt>rcu_barrier()</tt> in CPU-hotplug
 notifiers, it will be necessary to avoid disabling CPU hotplug.
 This would introduce some complexity, so there had better be a <i>very</i>
 good reason.
@@ -3096,9 +3181,5 @@ Andy Lutomirski for their help in rendering
 this article human readable, and to Michelle Rankin for her support
 of this effort.
 Other contributions are acknowledged in the Linux kernel's git archive.
 The cartoon is copyright (c) 2013 by Melissa Broussard,
 and is provided
 under the terms of the Creative Commons Attribution-Share Alike 3.0
 United States license.
 </body></html>
--- a/Documentation/RCU/rcu_dereference.txt
+++ b/Documentation/RCU/rcu_dereference.txt
@@ -138,6 +138,15 @@ o	Be very careful about comparing pointers obtained from
 		This sort of comparison occurs frequently when scanning
 		RCU-protected circular linked lists.
 		Note that if checks for being within an RCU read-side
 		critical section are not required and the pointer is never
 		dereferenced, rcu_access_pointer() should be used in place
 		of rcu_dereference(). The rcu_access_pointer() primitive
 		does not require an enclosing read-side critical section,
 		and also omits the smp_read_barrier_depends() included in
 		rcu_dereference(), which in turn should provide a small
 		performance gain in some CPUs (e.g., the DEC Alpha).
 	o	The comparison is against a pointer that references memory
 		that was initialized "a long time ago."  The reason
 		this is safe is that even if misordering occurs, the
--- a/Documentation/RCU/rculist_nulls.txt
+++ b/Documentation/RCU/rculist_nulls.txt
@@ -1,5 +1,5 @@
 Using hlist_nulls to protect read-mostly linked lists and
-objects using SLAB_DESTROY_BY_RCU allocations.
+objects using SLAB_TYPESAFE_BY_RCU allocations.
 Please read the basics in Documentation/RCU/listRCU.txt
@@ -7,7 +7,7 @@ Using special makers (called 'nulls') is a convenient way
 to solve following problem :
 A typical RCU linked list managing objects which are
-allocated with SLAB_DESTROY_BY_RCU kmem_cache can
+allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can
 use following algos :
 1) Lookup algo
@@ -96,7 +96,7 @@ unlock_chain(); // typically a spin_unlock()
 3) Remove algo
 --------------
 Nothing special here, we can use a standard RCU hlist deletion.
-But thanks to SLAB_DESTROY_BY_RCU, beware a deleted object can be reused
+But thanks to SLAB_TYPESAFE_BY_RCU, beware a deleted object can be reused
 very very fast (before the end of RCU grace period)
 if (put_last_reference_on(obj) {
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -1,9 +1,102 @@
 Using RCU's CPU Stall Detector
-The rcu_cpu_stall_suppress module parameter enables RCU's CPU stall
+This document first discusses what sorts of issues RCU's CPU stall
-detector, which detects conditions that unduly delay RCU grace periods.
+detector can locate, and then discusses kernel parameters and Kconfig
-This module parameter enables CPU stall detection by default, but
+options that can be used to fine-tune the detector's operation.  Finally,
-may be overridden via boot-time parameter or at runtime via sysfs.
+this document explains the stall detector's "splat" format.
 What Causes RCU CPU Stall Warnings?
 So your kernel printed an RCU CPU stall warning.  The next question is
 "What caused it?"  The following problems can result in RCU CPU stall
 warnings:
 o	A CPU looping in an RCU read-side critical section.
 o	A CPU looping with interrupts disabled.
 o	A CPU looping with preemption disabled.  This condition can
 	result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh
 	stalls.
 o	A CPU looping with bottom halves disabled.  This condition can
 	result in RCU-sched and RCU-bh stalls.
 o	For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the
 	kernel without invoking schedule().  Note that cond_resched()
 	does not necessarily prevent RCU CPU stall warnings.  Therefore,
 	if the looping in the kernel is really expected and desirable
 	behavior, you might need to replace some of the cond_resched()
 	calls with calls to cond_resched_rcu_qs().
 o	Booting Linux using a console connection that is too slow to
 	keep up with the boot-time console-message rate.  For example,
 	a 115Kbaud serial console can be -way- too slow to keep up
 	with boot-time message rates, and will frequently result in
 	RCU CPU stall warning messages.  Especially if you have added
 	debug printk()s.
 o	Anything that prevents RCU's grace-period kthreads from running.
 	This can result in the "All QSes seen" console-log message.
 	This message will include information on when the kthread last
 	ran and how often it should be expected to run.
 o	A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
 	happen to preempt a low-priority task in the middle of an RCU
 	read-side critical section.   This is especially damaging if
 	that low-priority task is not permitted to run on any other CPU,
 	in which case the next RCU grace period can never complete, which
 	will eventually cause the system to run out of memory and hang.
 	While the system is in the process of running itself out of
 	memory, you might see stall-warning messages.
 o	A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
 	is running at a higher priority than the RCU softirq threads.
 	This will prevent RCU callbacks from ever being invoked,
 	and in a CONFIG_PREEMPT_RCU kernel will further prevent
 	RCU grace periods from ever completing.  Either way, the
 	system will eventually run out of memory and hang.  In the
 	CONFIG_PREEMPT_RCU case, you might see stall-warning
 	messages.
 o	A hardware or software issue shuts off the scheduler-clock
 	interrupt on a CPU that is not in dyntick-idle mode.  This
 	problem really has happened, and seems to be most likely to
 	result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels.
 o	A bug in the RCU implementation.
 o	A hardware failure.  This is quite unlikely, but has occurred
 	at least once in real life.  A CPU failed in a running system,
 	becoming unresponsive, but not causing an immediate crash.
 	This resulted in a series of RCU CPU stall warnings, eventually
 	leading to the realization that the CPU had failed.
 The RCU, RCU-sched, RCU-bh, and RCU-tasks implementations have CPU stall
 warnings.  Note that SRCU does -not- have CPU stall warnings.  Please note
 that RCU only detects CPU stalls when there is a grace period in progress.
 No grace period, no CPU stall warnings.
 To diagnose the cause of the stall, inspect the stack traces.
 The offending function will usually be near the top of the stack.
 If you have a series of stall warnings from a single extended stall,
 comparing the stack traces can often help determine where the stall
 is occurring, which will usually be in the function nearest the top of
 that portion of the stack which remains the same from trace to trace.
 If you can reliably trigger the stall, ftrace can be quite helpful.
 RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE
 and with RCU's event tracing.  For information on RCU's event tracing,
 see include/trace/events/rcu.h.
 Fine-Tuning the RCU CPU Stall Detector
 The rcupdate.rcu_cpu_stall_suppress module parameter disables RCU's
 CPU stall detector, which detects conditions that unduly delay RCU grace
 periods.  This module parameter enables CPU stall detection by default,
 but may be overridden via boot-time parameter or at runtime via sysfs.
 The stall detector's idea of what constitutes "unduly delayed" is
 controlled by a set of kernel configuration variables and cpp macros:
@@ -56,6 +149,9 @@ rcupdate.rcu_task_stall_timeout
 	And continues with the output of sched_show_task() for each
 	task stalling the current RCU-tasks grace period.
 Interpreting RCU's CPU Stall-Detector "Splats"
 For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling,
 it will print a message similar to the following:
@@ -178,89 +274,3 @@ grace period is in flight.
 It is entirely possible to see stall warnings from normal and from
 expedited grace periods at about the same time from the same run.
 What Causes RCU CPU Stall Warnings?
 So your kernel printed an RCU CPU stall warning.  The next question is
 "What caused it?"  The following problems can result in RCU CPU stall
 warnings:
 o	A CPU looping in an RCU read-side critical section.
 o	A CPU looping with interrupts disabled.  This condition can
 	result in RCU-sched and RCU-bh stalls.
 o	A CPU looping with preemption disabled.  This condition can
 	result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh
 	stalls.
 o	A CPU looping with bottom halves disabled.  This condition can
 	result in RCU-sched and RCU-bh stalls.
 o	For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the
 	kernel without invoking schedule().  Note that cond_resched()
 	does not necessarily prevent RCU CPU stall warnings.  Therefore,
 	if the looping in the kernel is really expected and desirable
 	behavior, you might need to replace some of the cond_resched()
 	calls with calls to cond_resched_rcu_qs().
 o	Booting Linux using a console connection that is too slow to
 	keep up with the boot-time console-message rate.  For example,
 	a 115Kbaud serial console can be -way- too slow to keep up
 	with boot-time message rates, and will frequently result in
 	RCU CPU stall warning messages.  Especially if you have added
 	debug printk()s.
 o	Anything that prevents RCU's grace-period kthreads from running.
 	This can result in the "All QSes seen" console-log message.
 	This message will include information on when the kthread last
 	ran and how often it should be expected to run.
 o	A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
 	happen to preempt a low-priority task in the middle of an RCU
 	read-side critical section.   This is especially damaging if
 	that low-priority task is not permitted to run on any other CPU,
 	in which case the next RCU grace period can never complete, which
 	will eventually cause the system to run out of memory and hang.
 	While the system is in the process of running itself out of
 	memory, you might see stall-warning messages.
 o	A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
 	is running at a higher priority than the RCU softirq threads.
 	This will prevent RCU callbacks from ever being invoked,
 	and in a CONFIG_PREEMPT_RCU kernel will further prevent
 	RCU grace periods from ever completing.  Either way, the
 	system will eventually run out of memory and hang.  In the
 	CONFIG_PREEMPT_RCU case, you might see stall-warning
 	messages.
 o	A hardware or software issue shuts off the scheduler-clock
 	interrupt on a CPU that is not in dyntick-idle mode.  This
 	problem really has happened, and seems to be most likely to
 	result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels.
 o	A bug in the RCU implementation.
 o	A hardware failure.  This is quite unlikely, but has occurred
 	at least once in real life.  A CPU failed in a running system,
 	becoming unresponsive, but not causing an immediate crash.
 	This resulted in a series of RCU CPU stall warnings, eventually
 	leading to the realization that the CPU had failed.
 The RCU, RCU-sched, RCU-bh, and RCU-tasks implementations have CPU stall
 warnings.  Note that SRCU does -not- have CPU stall warnings.  Please note
 that RCU only detects CPU stalls when there is a grace period in progress.
 No grace period, no CPU stall warnings.
 To diagnose the cause of the stall, inspect the stack traces.
 The offending function will usually be near the top of the stack.
 If you have a series of stall warnings from a single extended stall,
 comparing the stack traces can often help determine where the stall
 is occurring, which will usually be in the function nearest the top of
 that portion of the stack which remains the same from trace to trace.
 If you can reliably trigger the stall, ftrace can be quite helpful.
 RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE
 and with RCU's event tracing.  For information on RCU's event tracing,
 see include/trace/events/rcu.h.
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -562,7 +562,9 @@ This section presents a "toy" RCU implementation that is based on
 familiar locking primitives.  Its overhead makes it a non-starter for
 real-life use, as does its lack of scalability.  It is also unsuitable
 for realtime use, since it allows scheduling latency to "bleed" from
-one read-side critical section to another.
+one read-side critical section to another.  It also assumes recursive
 reader-writer locks:  If you try this with non-recursive locks, and
 you allow nested rcu_read_lock() calls, you can deadlock.
 However, it is probably the easiest implementation to relate to, so is
 a good starting point.
@@ -587,20 +589,21 @@ It is extremely simple:
 		write_unlock(&rcu_gp_mutex);
 	}
-[You can ignore rcu_assign_pointer() and rcu_dereference() without
+[You can ignore rcu_assign_pointer() and rcu_dereference() without missing
-missing much.  But here they are anyway.  And whatever you do, don't
+much.  But here are simplified versions anyway.  And whatever you do,
-forget about them when submitting patches making use of RCU!]
+don't forget about them when submitting patches making use of RCU!]
-	#define rcu_assign_pointer(p, v)	({ \
+	#define rcu_assign_pointer(p, v) \
-							smp_wmb(); \
+	({ \
-							(p) = (v); \
+		smp_store_release(&(p), (v)); \
-						})
+	})
-	#define rcu_dereference(p)     ({ \
+	#define rcu_dereference(p) \
-					typeof(p) _________p1 = p; \
+	({ \
-					smp_read_barrier_depends(); \
+		typeof(p) _________p1 = p; \
-					(_________p1); \
+		smp_read_barrier_depends(); \
-					})
+		(_________p1); \
 	})
 The rcu_read_lock() and rcu_read_unlock() primitive read-acquire
@@ -925,7 +928,8 @@ d.	Do you need RCU grace periods to complete even in the face
 e.	Is your workload too update-intensive for normal use of
 	RCU, but inappropriate for other synchronization mechanisms?
-	If so, consider SLAB_DESTROY_BY_RCU.  But please be careful!
+	If so, consider SLAB_TYPESAFE_BY_RCU (which was originally
 	named SLAB_DESTROY_BY_RCU).  But please be careful!
 f.	Do you need read-side critical sections that are respected
 	even though they are in the middle of the idle loop, during
--- a/Documentation/acpi/acpi-lid.txt
+++ b/Documentation/acpi/acpi-lid.txt
@@ -59,20 +59,28 @@ button driver uses the following 3 modes in order not to trigger issues.
 If the userspace hasn't been prepared to ignore the unreliable "opened"
 events and the unreliable initial state notification, Linux users can use
 the following kernel parameters to handle the possible issues:
-A. button.lid_init_state=open:
+A. button.lid_init_state=method:
   When this option is specified, the ACPI button driver reports the
   initial lid state using the returning value of the _LID control method
   and whether the "opened"/"closed" events are paired fully relies on the
   firmware implementation.
   This option can be used to fix some platforms where the returning value
   of the _LID control method is reliable but the initial lid state
   notification is missing.
   This option is the default behavior during the period the userspace
   isn't ready to handle the buggy AML tables.
 B. button.lid_init_state=open:
   When this option is specified, the ACPI button driver always reports the
   initial lid state as "opened" and whether the "opened"/"closed" events
   are paired fully relies on the firmware implementation.
   This may fix some platforms where the returning value of the _LID
   control method is not reliable and the initial lid state notification is
   missing.
   This option is the default behavior during the period the userspace
   isn't ready to handle the buggy AML tables.
 If the userspace has been prepared to ignore the unreliable "opened" events
 and the unreliable initial state notification, Linux users should always
 use the following kernel parameter:
-B. button.lid_init_state=ignore:
+C. button.lid_init_state=ignore:
   When this option is specified, the ACPI button driver never reports the
   initial lid state and there is a compensation mechanism implemented to
   ensure that the reliable "closed" notifications can always be delivered
--- a/Documentation/acpi/aml-debugger.txt
+++ b/Documentation/acpi/aml-debugger.txt
@@ -15,7 +15,7 @@ kernel.
   CONFIG_ACPI_DEBUGGER=y
   CONFIG_ACPI_DEBUGGER_USER=m
-   The userspace utlities can be built from the kernel source tree using
+   The userspace utilities can be built from the kernel source tree using
   the following commands:
   $ cd tools
--- a/Documentation/acpi/dsd/graph.txt
+++ b/Documentation/acpi/dsd/graph.txt
@@ -0,0 +1,162 @@
 Graphs
 _DSD
 ----
 _DSD (Device Specific Data) [7] is a predefined ACPI device
 configuration object that can be used to convey information on
 hardware features which are not specifically covered by the ACPI
 specification [1][6]. There are two _DSD extensions that are relevant
 for graphs: property [4] and hierarchical data extensions [5]. The
 property extension provides generic key-value pairs whereas the
 hierarchical data extension supports nodes with references to other
 nodes, forming a tree. The nodes in the tree may contain properties as
 defined by the property extension. The two extensions together provide
 a tree-like structure with zero or more properties (key-value pairs)
 in each node of the tree.
 The data structure may be accessed at runtime by using the device_*
 and fwnode_* functions defined in include/linux/fwnode.h .
 Fwnode represents a generic firmware node object. It is independent on
 the firmware type. In ACPI, fwnodes are _DSD hierarchical data
 extensions objects. A device's _DSD object is represented by an
 fwnode.
 The data structure may be referenced elsewhere in the ACPI tables
 by using a hard reference to the device itself and an index to the
 hierarchical data extension array on each depth.
 Ports and endpoints
 -------------------
 The port and endpoint concepts are very similar to those in Devicetree
 [3]. A port represents an interface in a device, and an endpoint
 represents a connection to that interface.
 All port nodes are located under the device's "_DSD" node in the
 hierarchical data extension tree. The property extension related to
 each port node must contain the key "port" and an integer value which
 is the number of the port. The object it refers to should be called "PRTX",
 where "X" is the number of the port.
 Further on, endpoints are located under the individual port nodes. The
 first hierarchical data extension package list entry of the endpoint
 nodes must begin with "endpoint" and must be followed by the number
 of the endpoint. The object it refers to should be called "EPXY", where
 "X" is the number of the port and "Y" is the number of the endpoint.
 Each port node contains a property extension key "port", the value of
 which is the number of the port node. Each endpoint is similarly numbered
 with a property extension key "endpoint". Port numbers must be unique within a
 device and endpoint numbers must be unique within a port.
 The endpoint reference uses property extension with "remote-endpoint" property
 name followed by a reference in the same package. Such references consist of the
 remote device reference, number of the port in the device and finally the
 number of the endpoint in that port. Individual references thus appear as:
    Package() { device, port_number, endpoint_number }
 The references to endpoints must always be done both ways, to the
 remote endpoint and back from the referred remote endpoint node.
 A simple example of this is shown below:
    Scope (\_SB.PCI0.I2C2)
    {
 	Device (CAM0)
 	{
 	    Name (_DSD, Package () {
 		ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
 		Package () {
 		    Package () { "compatible", Package () { "nokia,smia" } },
 		},
 		ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
 		Package () {
 		    Package () { "port0", "PRT0" },
 		}
 	    })
 	    Name (PRT0, Package() {
 		ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
 		Package () {
 		    Package () { "port", 0 },
 		},
 		ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
 		Package () {
 		    Package () { "endpoint0", "EP00" },
 		}
 	    })
 	    Name (EP00, Package() {
 		ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
 		Package () {
 		    Package () { "endpoint", 0 },
 		    Package () { "remote-endpoint", Package() { \_SB.PCI0.ISP, 4, 0 } },
 		}
 	    })
 	}
    }
    Scope (\_SB.PCI0)
    {
 	Device (ISP)
 	{
 	    Name (_DSD, Package () {
 		ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
 		Package () {
 		    Package () { "port4", "PRT4" },
 		}
 	    })
 	    Name (PRT4, Package() {
 		ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
 		Package () {
 		    Package () { "port", 4 }, /* CSI-2 port number */
 		},
 		ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
 		Package () {
 		    Package () { "endpoint0", "EP40" },
 		}
 	    })
 	    Name (EP40, Package() {
 		ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
 		Package () {
 		    Package () { "endpoint", 0 },
 		    Package () { "remote-endpoint", Package () { \_SB.PCI0.I2C2.CAM0, 0, 0 } },
 		}
 	    })
 	}
    }
 Here, the port 0 of the "CAM0" device is connected to the port 4 of
 the "ISP" device and vice versa.
 References
 ----------
 [1] _DSD (Device Specific Data) Implementation Guide.
    <URL:http://www.uefi.org/sites/default/files/resources/_DSD-implementation-guide-toplevel-1_1.htm>,
    referenced 2016-10-03.
 [2] Devicetree. <URL:http://www.devicetree.org>, referenced 2016-10-03.
 [3] Documentation/devicetree/bindings/graph.txt
 [4] Device Properties UUID For _DSD.
    <URL:http://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf>,
    referenced 2016-10-04.
 [5] Hierarchical Data Extension UUID For _DSD.
    <URL:http://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.pdf>,
    referenced 2016-10-04.
 [6] Advanced Configuration and Power Interface Specification.
    <URL:http://www.uefi.org/sites/default/files/resources/ACPI_6_1.pdf>,
    referenced 2016-10-04.
 [7] _DSD Device Properties Usage Rules.
    Documentation/acpi/DSD-properties-rules.txt
--- a/Documentation/acpi/enumeration.txt
+++ b/Documentation/acpi/enumeration.txt
@@ -367,10 +367,10 @@ resulting child platform device.
 Device Tree namespace link device ID
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The Device Tree protocol uses device indentification based on the "compatible"
+The Device Tree protocol uses device identification based on the "compatible"
 property whose value is a string or an array of strings recognized as device
 identifiers by drivers and the driver core.  The set of all those strings may be
-regarded as a device indentification namespace analogous to the ACPI/PNP device
+regarded as a device identification namespace analogous to the ACPI/PNP device
 ID namespace.  Consequently, in principle it should not be necessary to allocate
 a new (and arguably redundant) ACPI/PNP device ID for a device with an existing
 identification string in the Device Tree (DT) namespace, especially if that ID
@@ -381,7 +381,7 @@ In ACPI, the device identification object called _CID (Compatible ID) is used to
 list the IDs of devices the given one is compatible with, but those IDs must
 belong to one of the namespaces prescribed by the ACPI specification (see
 Section 6.1.2 of ACPI 6.0 for details) and the DT namespace is not one of them.
-Moreover, the specification mandates that either a _HID or an _ADR identificaion
+Moreover, the specification mandates that either a _HID or an _ADR identification
 object be present for all ACPI objects representing devices (Section 6.1 of ACPI
 6.0).  For non-enumerable bus types that object must be _HID and its value must
 be a device ID from one of the namespaces prescribed by the specification too.
--- a/Documentation/acpi/linuxized-acpica.txt
+++ b/Documentation/acpi/linuxized-acpica.txt
@@ -24,7 +24,7 @@ upstream.
   The homepage of ACPICA project is: www.acpica.org, it is maintained and
   supported by Intel Corporation.
-   The following figure depicts the Linux ACPI subystem where the ACPICA
+   The following figure depicts the Linux ACPI subsystem where the ACPICA
   adaptation is included:
      +---------------------------------------------------------+
@@ -110,7 +110,7 @@ upstream.
   Linux patches.  The patches generated by this process are referred to as
   "linuxized ACPICA patches".  The release process is carried out on a local
   copy of the ACPICA git repository.  Each commit in the monthly release is
-   converted into a linuxized ACPICA patch.  Together, they form the montly
+   converted into a linuxized ACPICA patch.  Together, they form the monthly
   ACPICA release patchset for the Linux ACPI community.  This process is
   illustrated in the following figure:
@@ -165,7 +165,7 @@ upstream.
       <http://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git>.
   Before the linuxized ACPICA patches are sent to the Linux ACPI community
-   for review, there is a quality ensurance build test process to reduce
+   for review, there is a quality assurance build test process to reduce
   porting issues.  Currently this build process only takes care of the
   following kernel configuration options:
   CONFIG_ACPI/CONFIG_ACPI_DEBUG/CONFIG_ACPI_DEBUGGER
@@ -195,12 +195,12 @@ upstream.
      release utilities (please refer to Section 4 below for the details).
   3. Linux specific features - Sometimes it's impossible to use the
      current ACPICA APIs to implement features required by the Linux kernel,
-      so Linux developers occasionaly have to change ACPICA code directly.
+      so Linux developers occasionally have to change ACPICA code directly.
      Those changes may not be acceptable by ACPICA upstream and in such cases
      they are left as committed ACPICA divergences unless the ACPICA side can
      implement new mechanisms as replacements for them.
   4. ACPICA release fixups - ACPICA only tests commits using a set of the
-      user space simulation utilies, thus the linuxized ACPICA patches may
+      user space simulation utilities, thus the linuxized ACPICA patches may
      break the Linux kernel, leaving us build/boot failures.  In order to
      avoid breaking Linux bisection, fixes are applied directly to the
      linuxized ACPICA patches during the release process.  When the release
--- a/Documentation/admin-guide/README.rst
+++ b/Documentation/admin-guide/README.rst
@@ -27,7 +27,7 @@ On what hardware does it run?
  today Linux also runs on (at least) the Compaq Alpha AXP, Sun SPARC and
  UltraSPARC, Motorola 68000, PowerPC, PowerPC64, ARM, Hitachi SuperH, Cell,
  IBM S/390, MIPS, HP PA-RISC, Intel IA-64, DEC VAX, AMD x86-64, AXIS CRIS,
-  Xtensa, Tilera TILE, AVR32, ARC and Renesas M32R architectures.
+  Xtensa, Tilera TILE, ARC and Renesas M32R architectures.
  Linux is easily portable to most general-purpose 32- or 64-bit architectures
  as long as they have a paged memory management unit (PMMU) and a port of the
@@ -362,7 +362,7 @@ If something goes wrong
   as is, otherwise you will have to use the ``ksymoops`` program to make
   sense of the dump (but compiling with CONFIG_KALLSYMS is usually preferred).
   This utility can be downloaded from
-   ftp://ftp.<country>.kernel.org/pub/linux/utils/kernel/ksymoops/ .
+   https://www.kernel.org/pub/linux/utils/kernel/ksymoops/ .
   Alternatively, you can do the dump lookup by hand:
 - In debugging dumps like the above, it helps enormously if you can
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -60,6 +60,7 @@ configure specific aspects of kernel behavior to your liking.
   mono
   java
   ras
   pm/index
 .. only::  subproject and html
--- a/Documentation/admin-guide/kernel-parameters.rst
+++ b/Documentation/admin-guide/kernel-parameters.rst
@@ -1,3 +1,5 @@
 .. _kernelparameters:
 The kernel's command-line parameters
 ====================================
@@ -86,7 +88,6 @@ parameter is applicable::
 	APIC	APIC support is enabled.
 	APM	Advanced Power Management support is enabled.
 	ARM	ARM architecture is enabled.
 	AVR32	AVR32 architecture is enabled.
 	AX25	Appropriate AX.25 support is enabled.
 	BLACKFIN Blackfin architecture is enabled.
 	CLK	Common clock infrastructure is enabled.
@@ -197,7 +198,7 @@ and is between 256 and 4096 characters. It is defined in the file
 Finally, the [KMG] suffix is commonly described after a number of kernel
 parameter values. These 'K', 'M', and 'G' letters represent the _binary_
-multipliers 'Kilo', 'Mega', and 'Giga', equalling 2^10, 2^20, and 2^30
+multipliers 'Kilo', 'Mega', and 'Giga', equaling 2^10, 2^20, and 2^30
 bytes respectively. Such letter suffixes can also be entirely omitted:
 .. include:: kernel-parameters.txt
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -531,7 +531,6 @@
 			[ACPI] acpi_pm
 			[ARM] imx_timer1,OSTS,netx_timer,mpu_timer2,
 				pxa_timer,timer3,32k_counter,timer0_1
 			[AVR32] avr32
 			[X86-32] pit,hpet,tsc;
 				scx200_hrt on Geode; cyclone on IBM x440
 			[MIPS] MIPS
@@ -867,6 +866,15 @@
 	dscc4.setup=	[NET]
 	dt_cpu_ftrs=	[PPC]
 			Format: {"off" | "known"}
 			Control how the dt_cpu_ftrs device-tree binding is
 			used for CPU feature discovery and setup (if it
 			exists).
 			off: Do not use it, fall back to legacy cpu table.
 			known: Do not pass through unknown features to guests
 			or userspace, only those that the kernel is aware of.
 	dump_apple_properties	[X86]
 			Dump name and content of EFI device properties on
 			x86 Macs.  Useful for driver authors to determine
@@ -973,7 +981,7 @@
 			A valid base address must be provided, and the serial
 			port must already be setup and configured.
-		armada3700_uart,<addr>
+		ar3700_uart,<addr>
 			Start an early, polled-mode console on the
 			Armada 3700 serial port at the specified
 			address. The serial port must already be setup
@@ -989,6 +997,7 @@
 			earlyprintk=ttySn[,baudrate]
 			earlyprintk=dbgp[debugController#]
 			earlyprintk=pciserial,bus:device.function[,baudrate]
 			earlyprintk=xdbc[xhciController#]
 			earlyprintk is useful when the kernel crashes before
 			the normal console is initialized. It is not enabled by
@@ -1578,6 +1587,15 @@
 			extended tables themselves, and also PASID support. With
 			this option set, extended tables will not be used even
 			on hardware which claims to support them.
 		tboot_noforce [Default Off]
 			Do not force the Intel IOMMU enabled under tboot.
 			By default, tboot will force Intel IOMMU on, which
 			could harm performance of some high-throughput
 			devices like 40GBit network cards, even if identity
 			mapping is enabled.
 			Note that using this option lowers the security
 			provided by tboot because it makes the system
 			vulnerable to DMA attacks.
 	intel_idle.max_cstate=	[KNL,HW,ACPI,X86]
 			0	disables intel_idle and fall back on acpi_idle.
@@ -1644,6 +1662,12 @@
 		nobypass	[PPC/POWERNV]
 			Disable IOMMU bypass, using IOMMU for PCI devices.
 	iommu.passthrough=
 			[ARM64] Configure DMA to bypass the IOMMU by default.
 			Format: { "0" | "1" }
 			0 - Use IOMMU translation for DMA.
 			1 - Bypass the IOMMU for DMA.
 			unset - Use IOMMU translation for DMA.
 	io7=		[HW] IO7 for Marvel based alpha systems
 			See comment before marvel_specify_io7 in
@@ -2419,13 +2443,7 @@
 			and gids from such clients.  This is intended to ease
 			migration from NFSv2/v3.
-	objlayoutdriver.osd_login_prog=
+	nmi_debug=	[KNL,SH] Specify one or more actions to take
 			[NFS] [OBJLAYOUT] sets the pathname to the program which
 			is used to automatically discover and login into new
 			osd-targets. Please see:
 			Documentation/filesystems/pnfs.txt for more explanations
 	nmi_debug=	[KNL,AVR32,SH] Specify one or more actions to take
 			when a NMI is triggered.
 			Format: [state][,regs][,debounce][,die]
@@ -3178,6 +3196,12 @@
 	ramdisk_size=	[RAM] Sizes of RAM disks in kilobytes
 			See Documentation/blockdev/ramdisk.txt.
 	ras=option[,option,...]	[KNL] RAS-specific options
 		cec_disable	[X86]
 				Disable the Correctable Errors Collector,
 				see CONFIG_RAS_CEC help text.
 	rcu_nocbs=	[KNL]
 			The argument is a cpu list, as described above.
@@ -3779,6 +3803,21 @@
 	spia_pedr=
 	spia_peddr=
 	srcutree.exp_holdoff [KNL]
 			Specifies how many nanoseconds must elapse
 			since the end of the last SRCU grace period for
 			a given srcu_struct until the next normal SRCU
 			grace period will be considered for automatic
 			expediting.  Set to zero to disable automatic
 			expediting.
 	stack_guard_gap=	[MM]
 			override the default stack gap protection. The value
 			is in page units and it defines how many pages prior
 			to (for stacks growing down) resp. after (for stacks
 			growing up) the main stack are reserved for no other
 			mapping. Default value is 256 pages.
 	stacktrace	[FTRACE]
 			Enabled the stack tracer on boot up.
@@ -4121,6 +4160,9 @@
 	usbhid.mousepoll=
 			[USBHID] The interval which mice are to be polled at.
 	usbhid.jspoll=
 			[USBHID] The interval which joysticks are to be polled at.
 	usb-storage.delay_use=
 			[UMS] The delay in seconds before a new device is
 			scanned for Logical Units (default 1).
--- a/Documentation/admin-guide/md.rst
+++ b/Documentation/admin-guide/md.rst
@@ -276,14 +276,14 @@ All md devices contain:
     array creation it will default to 0, though starting the array as
     ``clean`` will set it much larger.
-   new_dev
+  new_dev
     This file can be written but not read.  The value written should
     be a block device number as major:minor.  e.g. 8:0
     This will cause that device to be attached to the array, if it is
     available.  It will then appear at md/dev-XXX (depending on the
     name of the device) and further configuration is then possible.
-   safe_mode_delay
+  safe_mode_delay
     When an md array has seen no write requests for a certain period
     of time, it will be marked as ``clean``.  When another write
     request arrives, the array is marked as ``dirty`` before the write
@@ -292,7 +292,7 @@ All md devices contain:
     period as a number of seconds.  The default is 200msec (0.200).
     Writing a value of 0 disables safemode.
-   array_state
+  array_state
     This file contains a single word which describes the current
     state of the array.  In many cases, the state can be set by
     writing the word for the desired state, however some states
@@ -401,7 +401,30 @@ All md devices contain:
     once the array becomes non-degraded, and this fact has been
     recorded in the metadata.
  consistency_policy
     This indicates how the array maintains consistency in case of unexpected
     shutdown. It can be:
     none
       Array has no redundancy information, e.g. raid0, linear.
     resync
       Full resync is performed and all redundancy is regenerated when the
       array is started after unclean shutdown.
     bitmap
       Resync assisted by a write-intent bitmap.
     journal
       For raid4/5/6, journal device is used to log transactions and replay
       after unclean shutdown.
     ppl
       For raid5 only, Partial Parity Log is used to close the write hole and
       eliminate resync.
     The accepted values when writing to this file are ``ppl`` and ``resync``,
     used to enable and disable PPL.
 As component devices are added to an md array, they appear in the ``md``
@@ -563,6 +586,9 @@ Each directory contains:
 	adds bad blocks without acknowledging them. This is largely
 	for testing.
      ppl_sector, ppl_size
        Location and size (in sectors) of the space used for Partial Parity Log
        on this device.
 An active md device will also contain an entry for each active device
--- a/Documentation/admin-guide/pm/cpufreq.rst
+++ b/Documentation/admin-guide/pm/cpufreq.rst
@@ -0,0 +1,701 @@
 .. |struct cpufreq_policy| replace:: :c:type:`struct cpufreq_policy <cpufreq_policy>`
 .. |intel_pstate| replace:: :doc:`intel_pstate <intel_pstate>`
 =======================
 CPU Performance Scaling
 =======================
 ::
 Copyright (c) 2017 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 The Concept of CPU Performance Scaling
 ======================================
 The majority of modern processors are capable of operating in a number of
 different clock frequency and voltage configurations, often referred to as
 Operating Performance Points or P-states (in ACPI terminology).  As a rule,
 the higher the clock frequency and the higher the voltage, the more instructions
 can be retired by the CPU over a unit of time, but also the higher the clock
 frequency and the higher the voltage, the more energy is consumed over a unit of
 time (or the more power is drawn) by the CPU in the given P-state.  Therefore
 there is a natural tradeoff between the CPU capacity (the number of instructions
 that can be executed over a unit of time) and the power drawn by the CPU.
 In some situations it is desirable or even necessary to run the program as fast
 as possible and then there is no reason to use any P-states different from the
 highest one (i.e. the highest-performance frequency/voltage configuration
 available).  In some other cases, however, it may not be necessary to execute
 instructions so quickly and maintaining the highest available CPU capacity for a
 relatively long time without utilizing it entirely may be regarded as wasteful.
 It also may not be physically possible to maintain maximum CPU capacity for too
 long for thermal or power supply capacity reasons or similar.  To cover those
 cases, there are hardware interfaces allowing CPUs to be switched between
 different frequency/voltage configurations or (in the ACPI terminology) to be
 put into different P-states.
 Typically, they are used along with algorithms to estimate the required CPU
 capacity, so as to decide which P-states to put the CPUs into.  Of course, since
 the utilization of the system generally changes over time, that has to be done
 repeatedly on a regular basis.  The activity by which this happens is referred
 to as CPU performance scaling or CPU frequency scaling (because it involves
 adjusting the CPU clock frequency).
 CPU Performance Scaling in Linux
 ================================
 The Linux kernel supports CPU performance scaling by means of the ``CPUFreq``
 (CPU Frequency scaling) subsystem that consists of three layers of code: the
 core, scaling governors and scaling drivers.
 The ``CPUFreq`` core provides the common code infrastructure and user space
 interfaces for all platforms that support CPU performance scaling.  It defines
 the basic framework in which the other components operate.
 Scaling governors implement algorithms to estimate the required CPU capacity.
 As a rule, each governor implements one, possibly parametrized, scaling
 algorithm.
 Scaling drivers talk to the hardware.  They provide scaling governors with
 information on the available P-states (or P-state ranges in some cases) and
 access platform-specific hardware interfaces to change CPU P-states as requested
 by scaling governors.
 In principle, all available scaling governors can be used with every scaling
 driver.  That design is based on the observation that the information used by
 performance scaling algorithms for P-state selection can be represented in a
 platform-independent form in the majority of cases, so it should be possible
 to use the same performance scaling algorithm implemented in exactly the same
 way regardless of which scaling driver is used.  Consequently, the same set of
 scaling governors should be suitable for every supported platform.
 However, that observation may not hold for performance scaling algorithms
 based on information provided by the hardware itself, for example through
 feedback registers, as that information is typically specific to the hardware
 interface it comes from and may not be easily represented in an abstract,
 platform-independent way.  For this reason, ``CPUFreq`` allows scaling drivers
 to bypass the governor layer and implement their own performance scaling
 algorithms.  That is done by the |intel_pstate| scaling driver.
 ``CPUFreq`` Policy Objects
 ==========================
 In some cases the hardware interface for P-state control is shared by multiple
 CPUs.  That is, for example, the same register (or set of registers) is used to
 control the P-state of multiple CPUs at the same time and writing to it affects
 all of those CPUs simultaneously.
 Sets of CPUs sharing hardware P-state control interfaces are represented by
 ``CPUFreq`` as |struct cpufreq_policy| objects.  For consistency,
 |struct cpufreq_policy| is also used when there is only one CPU in the given
 set.
 The ``CPUFreq`` core maintains a pointer to a |struct cpufreq_policy| object for
 every CPU in the system, including CPUs that are currently offline.  If multiple
 CPUs share the same hardware P-state control interface, all of the pointers
 corresponding to them point to the same |struct cpufreq_policy| object.
 ``CPUFreq`` uses |struct cpufreq_policy| as its basic data type and the design
 of its user space interface is based on the policy concept.
 CPU Initialization
 ==================
 First of all, a scaling driver has to be registered for ``CPUFreq`` to work.
 It is only possible to register one scaling driver at a time, so the scaling
 driver is expected to be able to handle all CPUs in the system.
 The scaling driver may be registered before or after CPU registration.  If
 CPUs are registered earlier, the driver core invokes the ``CPUFreq`` core to
 take a note of all of the already registered CPUs during the registration of the
 scaling driver.  In turn, if any CPUs are registered after the registration of
 the scaling driver, the ``CPUFreq`` core will be invoked to take note of them
 at their registration time.
 In any case, the ``CPUFreq`` core is invoked to take note of any logical CPU it
 has not seen so far as soon as it is ready to handle that CPU.  [Note that the
 logical CPU may be a physical single-core processor, or a single core in a
 multicore processor, or a hardware thread in a physical processor or processor
 core.  In what follows "CPU" always means "logical CPU" unless explicitly stated
 otherwise and the word "processor" is used to refer to the physical part
 possibly including multiple logical CPUs.]
 Once invoked, the ``CPUFreq`` core checks if the policy pointer is already set
 for the given CPU and if so, it skips the policy object creation.  Otherwise,
 a new policy object is created and initialized, which involves the creation of
 a new policy directory in ``sysfs``, and the policy pointer corresponding to
 the given CPU is set to the new policy object's address in memory.
 Next, the scaling driver's ``->init()`` callback is invoked with the policy
 pointer of the new CPU passed to it as the argument.  That callback is expected
 to initialize the performance scaling hardware interface for the given CPU (or,
 more precisely, for the set of CPUs sharing the hardware interface it belongs
 to, represented by its policy object) and, if the policy object it has been
 called for is new, to set parameters of the policy, like the minimum and maximum
 frequencies supported by the hardware, the table of available frequencies (if
 the set of supported P-states is not a continuous range), and the mask of CPUs
 that belong to the same policy (including both online and offline CPUs).  That
 mask is then used by the core to populate the policy pointers for all of the
 CPUs in it.
 The next major initialization step for a new policy object is to attach a
 scaling governor to it (to begin with, that is the default scaling governor
 determined by the kernel configuration, but it may be changed later
 via ``sysfs``).  First, a pointer to the new policy object is passed to the
 governor's ``->init()`` callback which is expected to initialize all of the
 data structures necessary to handle the given policy and, possibly, to add
 a governor ``sysfs`` interface to it.  Next, the governor is started by
 invoking its ``->start()`` callback.
 That callback is expected to register per-CPU utilization update callbacks for
 all of the online CPUs belonging to the given policy with the CPU scheduler.
 The utilization update callbacks will be invoked by the CPU scheduler on
 important events, like task enqueue and dequeue, on every iteration of the
 scheduler tick or generally whenever the CPU utilization may change (from the
 scheduler's perspective).  They are expected to carry out computations needed
 to determine the P-state to use for the given policy going forward and to
 invoke the scaling driver to make changes to the hardware in accordance with
 the P-state selection.  The scaling driver may be invoked directly from
 scheduler context or asynchronously, via a kernel thread or workqueue, depending
 on the configuration and capabilities of the scaling driver and the governor.
 Similar steps are taken for policy objects that are not new, but were "inactive"
 previously, meaning that all of the CPUs belonging to them were offline.  The
 only practical difference in that case is that the ``CPUFreq`` core will attempt
 to use the scaling governor previously used with the policy that became
 "inactive" (and is re-initialized now) instead of the default governor.
 In turn, if a previously offline CPU is being brought back online, but some
 other CPUs sharing the policy object with it are online already, there is no
 need to re-initialize the policy object at all.  In that case, it only is
 necessary to restart the scaling governor so that it can take the new online CPU
 into account.  That is achieved by invoking the governor's ``->stop()`` and
 ``->start()`` callbacks, in this order, for the entire policy.
 As mentioned before, the |intel_pstate| scaling driver bypasses the scaling
 governor layer of ``CPUFreq`` and provides its own P-state selection algorithms.
 Consequently, if |intel_pstate| is used, scaling governors are not attached to
 new policy objects.  Instead, the driver's ``->setpolicy()`` callback is invoked
 to register per-CPU utilization update callbacks for each policy.  These
 callbacks are invoked by the CPU scheduler in the same way as for scaling
 governors, but in the |intel_pstate| case they both determine the P-state to
 use and change the hardware configuration accordingly in one go from scheduler
 context.
 The policy objects created during CPU initialization and other data structures
 associated with them are torn down when the scaling driver is unregistered
 (which happens when the kernel module containing it is unloaded, for example) or
 when the last CPU belonging to the given policy is unregistered.
 Policy Interface in ``sysfs``
 =============================
 During the initialization of the kernel, the ``CPUFreq`` core creates a
 ``sysfs`` directory (kobject) called ``cpufreq`` under
 :file:`/sys/devices/system/cpu/`.
 That directory contains a ``policyX`` subdirectory (where ``X`` represents an
 integer number) for every policy object maintained by the ``CPUFreq`` core.
 Each ``policyX`` directory is pointed to by ``cpufreq`` symbolic links
 under :file:`/sys/devices/system/cpu/cpuY/` (where ``Y`` represents an integer
 that may be different from the one represented by ``X``) for all of the CPUs
 associated with (or belonging to) the given policy.  The ``policyX`` directories
 in :file:`/sys/devices/system/cpu/cpufreq` each contain policy-specific
 attributes (files) to control ``CPUFreq`` behavior for the corresponding policy
 objects (that is, for all of the CPUs associated with them).
 Some of those attributes are generic.  They are created by the ``CPUFreq`` core
 and their behavior generally does not depend on what scaling driver is in use
 and what scaling governor is attached to the given policy.  Some scaling drivers
 also add driver-specific attributes to the policy directories in ``sysfs`` to
 control policy-specific aspects of driver behavior.
 The generic attributes under :file:`/sys/devices/system/cpu/cpufreq/policyX/`
 are the following:
 ``affected_cpus``
 	List of online CPUs belonging to this policy (i.e. sharing the hardware
 	performance scaling interface represented by the ``policyX`` policy
 	object).
 ``bios_limit``
 	If the platform firmware (BIOS) tells the OS to apply an upper limit to
 	CPU frequencies, that limit will be reported through this attribute (if
 	present).
 	The existence of the limit may be a result of some (often unintentional)
 	BIOS settings, restrictions coming from a service processor or other
 	BIOS/HW-based mechanisms.
 	This does not cover ACPI thermal limitations which can be discovered
 	through a generic thermal driver.
 	This attribute is not present if the scaling driver in use does not
 	support it.
 ``cpuinfo_max_freq``
 	Maximum possible operating frequency the CPUs belonging to this policy
 	can run at (in kHz).
 ``cpuinfo_min_freq``
 	Minimum possible operating frequency the CPUs belonging to this policy
 	can run at (in kHz).
 ``cpuinfo_transition_latency``
 	The time it takes to switch the CPUs belonging to this policy from one
 	P-state to another, in nanoseconds.
 	If unknown or if known to be so high that the scaling driver does not
 	work with the `ondemand`_ governor, -1 (:c:macro:`CPUFREQ_ETERNAL`)
 	will be returned by reads from this attribute.
 ``related_cpus``
 	List of all (online and offline) CPUs belonging to this policy.
 ``scaling_available_governors``
 	List of ``CPUFreq`` scaling governors present in the kernel that can
 	be attached to this policy or (if the |intel_pstate| scaling driver is
 	in use) list of scaling algorithms provided by the driver that can be
 	applied to this policy.
 	[Note that some governors are modular and it may be necessary to load a
 	kernel module for the governor held by it to become available and be
 	listed by this attribute.]
 ``scaling_cur_freq``
 	Current frequency of all of the CPUs belonging to this policy (in kHz).
 	For the majority of scaling drivers, this is the frequency of the last
 	P-state requested by the driver from the hardware using the scaling
 	interface provided by it, which may or may not reflect the frequency
 	the CPU is actually running at (due to hardware design and other
 	limitations).
 	Some scaling drivers (e.g. |intel_pstate|) attempt to provide
 	information more precisely reflecting the current CPU frequency through
 	this attribute, but that still may not be the exact current CPU
 	frequency as seen by the hardware at the moment.
 ``scaling_driver``
 	The scaling driver currently in use.
 ``scaling_governor``
 	The scaling governor currently attached to this policy or (if the
 	|intel_pstate| scaling driver is in use) the scaling algorithm
 	provided by the driver that is currently applied to this policy.
 	This attribute is read-write and writing to it will cause a new scaling
 	governor to be attached to this policy or a new scaling algorithm
 	provided by the scaling driver to be applied to it (in the
 	|intel_pstate| case), as indicated by the string written to this
 	attribute (which must be one of the names listed by the
 	``scaling_available_governors`` attribute described above).
 ``scaling_max_freq``
 	Maximum frequency the CPUs belonging to this policy are allowed to be
 	running at (in kHz).
 	This attribute is read-write and writing a string representing an
 	integer to it will cause a new limit to be set (it must not be lower
 	than the value of the ``scaling_min_freq`` attribute).
 ``scaling_min_freq``
 	Minimum frequency the CPUs belonging to this policy are allowed to be
 	running at (in kHz).
 	This attribute is read-write and writing a string representing a
 	non-negative integer to it will cause a new limit to be set (it must not
 	be higher than the value of the ``scaling_max_freq`` attribute).
 ``scaling_setspeed``
 	This attribute is functional only if the `userspace`_ scaling governor
 	is attached to the given policy.
 	It returns the last frequency requested by the governor (in kHz) or can
 	be written to in order to set a new frequency for the policy.
 Generic Scaling Governors
 =========================
 ``CPUFreq`` provides generic scaling governors that can be used with all
 scaling drivers.  As stated before, each of them implements a single, possibly
 parametrized, performance scaling algorithm.
 Scaling governors are attached to policy objects and different policy objects
 can be handled by different scaling governors at the same time (although that
 may lead to suboptimal results in some cases).
 The scaling governor for a given policy object can be changed at any time with
 the help of the ``scaling_governor`` policy attribute in ``sysfs``.
 Some governors expose ``sysfs`` attributes to control or fine-tune the scaling
 algorithms implemented by them.  Those attributes, referred to as governor
 tunables, can be either global (system-wide) or per-policy, depending on the
 scaling driver in use.  If the driver requires governor tunables to be
 per-policy, they are located in a subdirectory of each policy directory.
 Otherwise, they are located in a subdirectory under
 :file:`/sys/devices/system/cpu/cpufreq/`.  In either case the name of the
 subdirectory containing the governor tunables is the name of the governor
 providing them.
 ``performance``
 ---------------
 When attached to a policy object, this governor causes the highest frequency,
 within the ``scaling_max_freq`` policy limit, to be requested for that policy.
 The request is made once at the time the governor for the policy is set to
 ``performance`` and whenever the ``scaling_max_freq`` or ``scaling_min_freq``
 policy limits change after that.
 ``powersave``
 -------------
 When attached to a policy object, this governor causes the lowest frequency,
 within the ``scaling_min_freq`` policy limit, to be requested for that policy.
 The request is made once at the time the governor for the policy is set to
 ``powersave`` and whenever the ``scaling_max_freq`` or ``scaling_min_freq``
 policy limits change after that.
 ``userspace``
 -------------
 This governor does not do anything by itself.  Instead, it allows user space
 to set the CPU frequency for the policy it is attached to by writing to the
 ``scaling_setspeed`` attribute of that policy.
 ``schedutil``
 -------------
 This governor uses CPU utilization data available from the CPU scheduler.  It
 generally is regarded as a part of the CPU scheduler, so it can access the
 scheduler's internal data structures directly.
 It runs entirely in scheduler context, although in some cases it may need to
 invoke the scaling driver asynchronously when it decides that the CPU frequency
 should be changed for a given policy (that depends on whether or not the driver
 is capable of changing the CPU frequency from scheduler context).
 The actions of this governor for a particular CPU depend on the scheduling class
 invoking its utilization update callback for that CPU.  If it is invoked by the
 RT or deadline scheduling classes, the governor will increase the frequency to
 the allowed maximum (that is, the ``scaling_max_freq`` policy limit).  In turn,
 if it is invoked by the CFS scheduling class, the governor will use the
 Per-Entity Load Tracking (PELT) metric for the root control group of the
 given CPU as the CPU utilization estimate (see the `Per-entity load tracking`_
 LWN.net article for a description of the PELT mechanism).  Then, the new
 CPU frequency to apply is computed in accordance with the formula
 	f = 1.25 * ``f_0`` * ``util`` / ``max``
 where ``util`` is the PELT number, ``max`` is the theoretical maximum of
 ``util``, and ``f_0`` is either the maximum possible CPU frequency for the given
 policy (if the PELT number is frequency-invariant), or the current CPU frequency
 (otherwise).
 This governor also employs a mechanism allowing it to temporarily bump up the
 CPU frequency for tasks that have been waiting on I/O most recently, called
 "IO-wait boosting".  That happens when the :c:macro:`SCHED_CPUFREQ_IOWAIT` flag
 is passed by the scheduler to the governor callback which causes the frequency
 to go up to the allowed maximum immediately and then draw back to the value
 returned by the above formula over time.
 This governor exposes only one tunable:
 ``rate_limit_us``
 	Minimum time (in microseconds) that has to pass between two consecutive
 	runs of governor computations (default: 1000 times the scaling driver's
 	transition latency).
 	The purpose of this tunable is to reduce the scheduler context overhead
 	of the governor which might be excessive without it.
 This governor generally is regarded as a replacement for the older `ondemand`_
 and `conservative`_ governors (described below), as it is simpler and more
 tightly integrated with the CPU scheduler, its overhead in terms of CPU context
 switches and similar is less significant, and it uses the scheduler's own CPU
 utilization metric, so in principle its decisions should not contradict the
 decisions made by the other parts of the scheduler.
 ``ondemand``
 ------------
 This governor uses CPU load as a CPU frequency selection metric.
 In order to estimate the current CPU load, it measures the time elapsed between
 consecutive invocations of its worker routine and computes the fraction of that
 time in which the given CPU was not idle.  The ratio of the non-idle (active)
 time to the total CPU time is taken as an estimate of the load.
 If this governor is attached to a policy shared by multiple CPUs, the load is
 estimated for all of them and the greatest result is taken as the load estimate
 for the entire policy.
 The worker routine of this governor has to run in process context, so it is
 invoked asynchronously (via a workqueue) and CPU P-states are updated from
 there if necessary.  As a result, the scheduler context overhead from this
 governor is minimal, but it causes additional CPU context switches to happen
 relatively often and the CPU P-state updates triggered by it can be relatively
 irregular.  Also, it affects its own CPU load metric by running code that
 reduces the CPU idle time (even though the CPU idle time is only reduced very
 slightly by it).
 It generally selects CPU frequencies proportional to the estimated load, so that
 the value of the ``cpuinfo_max_freq`` policy attribute corresponds to the load of
 1 (or 100%), and the value of the ``cpuinfo_min_freq`` policy attribute
 corresponds to the load of 0, unless the load exceeds a (configurable)
 speedup threshold, in which case it will go straight for the highest frequency
 it is allowed to use (the ``scaling_max_freq`` policy limit).
 This governor exposes the following tunables:
 ``sampling_rate``
 	This is how often the governor's worker routine should run, in
 	microseconds.
 	Typically, it is set to values of the order of 10000 (10 ms).  Its
 	default value is equal to the value of ``cpuinfo_transition_latency``
 	for each policy this governor is attached to (but since the unit here
 	is greater by 1000, this means that the time represented by
 	``sampling_rate`` is 1000 times greater than the transition latency by
 	default).
 	If this tunable is per-policy, the following shell command sets the time
 	represented by it to be 750 times as high as the transition latency::
 	# echo $(($(cat cpuinfo_transition_latency) * 750 / 1000)) > ondemand/sampling_rate
 ``min_sampling_rate``
 	The minimum value of ``sampling_rate``.
 	Equal to 10000 (10 ms) if :c:macro:`CONFIG_NO_HZ_COMMON` and
 	:c:data:`tick_nohz_active` are both set or to 20 times the value of
 	:c:data:`jiffies` in microseconds otherwise.
 ``up_threshold``
 	If the estimated CPU load is above this value (in percent), the governor
 	will set the frequency to the maximum value allowed for the policy.
 	Otherwise, the selected frequency will be proportional to the estimated
 	CPU load.
 ``ignore_nice_load``
 	If set to 1 (default 0), it will cause the CPU load estimation code to
 	treat the CPU time spent on executing tasks with "nice" levels greater
 	than 0 as CPU idle time.
 	This may be useful if there are tasks in the system that should not be
 	taken into account when deciding what frequency to run the CPUs at.
 	Then, to make that happen it is sufficient to increase the "nice" level
 	of those tasks above 0 and set this attribute to 1.
 ``sampling_down_factor``
 	Temporary multiplier, between 1 (default) and 100 inclusive, to apply to
 	the ``sampling_rate`` value if the CPU load goes above ``up_threshold``.
 	This causes the next execution of the governor's worker routine (after
 	setting the frequency to the allowed maximum) to be delayed, so the
 	frequency stays at the maximum level for a longer time.
 	Frequency fluctuations in some bursty workloads may be avoided this way
 	at the cost of additional energy spent on maintaining the maximum CPU
 	capacity.
 ``powersave_bias``
 	Reduction factor to apply to the original frequency target of the
 	governor (including the maximum value used when the ``up_threshold``
 	value is exceeded by the estimated CPU load) or sensitivity threshold
 	for the AMD frequency sensitivity powersave bias driver
 	(:file:`drivers/cpufreq/amd_freq_sensitivity.c`), between 0 and 1000
 	inclusive.
 	If the AMD frequency sensitivity powersave bias driver is not loaded,
 	the effective frequency to apply is given by
 		f * (1 - ``powersave_bias`` / 1000)
 	where f is the governor's original frequency target.  The default value
 	of this attribute is 0 in that case.
 	If the AMD frequency sensitivity powersave bias driver is loaded, the
 	value of this attribute is 400 by default and it is used in a different
 	way.
 	On Family 16h (and later) AMD processors there is a mechanism to get a
 	measured workload sensitivity, between 0 and 100% inclusive, from the
 	hardware.  That value can be used to estimate how the performance of the
 	workload running on a CPU will change in response to frequency changes.
 	The performance of a workload with the sensitivity of 0 (memory-bound or
 	IO-bound) is not expected to increase at all as a result of increasing
 	the CPU frequency, whereas workloads with the sensitivity of 100%
 	(CPU-bound) are expected to perform much better if the CPU frequency is
 	increased.
 	If the workload sensitivity is less than the threshold represented by
 	the ``powersave_bias`` value, the sensitivity powersave bias driver
 	will cause the governor to select a frequency lower than its original
 	target, so as to avoid over-provisioning workloads that will not benefit
 	from running at higher CPU frequencies.
 ``conservative``
 ----------------
 This governor uses CPU load as a CPU frequency selection metric.
 It estimates the CPU load in the same way as the `ondemand`_ governor described
 above, but the CPU frequency selection algorithm implemented by it is different.
 Namely, it avoids changing the frequency significantly over short time intervals
 which may not be suitable for systems with limited power supply capacity (e.g.
 battery-powered).  To achieve that, it changes the frequency in relatively
 small steps, one step at a time, up or down - depending on whether or not a
 (configurable) threshold has been exceeded by the estimated CPU load.
 This governor exposes the following tunables:
 ``freq_step``
 	Frequency step in percent of the maximum frequency the governor is
 	allowed to set (the ``scaling_max_freq`` policy limit), between 0 and
 	100 (5 by default).
 	This is how much the frequency is allowed to change in one go.  Setting
 	it to 0 will cause the default frequency step (5 percent) to be used
 	and setting it to 100 effectively causes the governor to periodically
 	switch the frequency between the ``scaling_min_freq`` and
 	``scaling_max_freq`` policy limits.
 ``down_threshold``
 	Threshold value (in percent, 20 by default) used to determine the
 	frequency change direction.
 	If the estimated CPU load is greater than this value, the frequency will
 	go up (by ``freq_step``).  If the load is less than this value (and the
 	``sampling_down_factor`` mechanism is not in effect), the frequency will
 	go down.  Otherwise, the frequency will not be changed.
 ``sampling_down_factor``
 	Frequency decrease deferral factor, between 1 (default) and 10
 	inclusive.
 	It effectively causes the frequency to go down ``sampling_down_factor``
 	times slower than it ramps up.
 Frequency Boost Support
 =======================
 Background
 ----------
 Some processors support a mechanism to raise the operating frequency of some
 cores in a multicore package temporarily (and above the sustainable frequency
 threshold for the whole package) under certain conditions, for example if the
 whole chip is not fully utilized and below its intended thermal or power budget.
 Different names are used by different vendors to refer to this functionality.
 For Intel processors it is referred to as "Turbo Boost", AMD calls it
 "Turbo-Core" or (in technical documentation) "Core Performance Boost" and so on.
 As a rule, it also is implemented differently by different vendors.  The simple
 term "frequency boost" is used here for brevity to refer to all of those
 implementations.
 The frequency boost mechanism may be either hardware-based or software-based.
 If it is hardware-based (e.g. on x86), the decision to trigger the boosting is
 made by the hardware (although in general it requires the hardware to be put
 into a special state in which it can control the CPU frequency within certain
 limits).  If it is software-based (e.g. on ARM), the scaling driver decides
 whether or not to trigger boosting and when to do that.
 The ``boost`` File in ``sysfs``
 -------------------------------
 This file is located under :file:`/sys/devices/system/cpu/cpufreq/` and controls
 the "boost" setting for the whole system.  It is not present if the underlying
 scaling driver does not support the frequency boost mechanism (or supports it,
 but provides a driver-specific interface for controlling it, like
 |intel_pstate|).
 If the value in this file is 1, the frequency boost mechanism is enabled.  This
 means that either the hardware can be put into states in which it is able to
 trigger boosting (in the hardware-based case), or the software is allowed to
 trigger boosting (in the software-based case).  It does not mean that boosting
 is actually in use at the moment on any CPUs in the system.  It only means a
 permission to use the frequency boost mechanism (which still may never be used
 for other reasons).
 If the value in this file is 0, the frequency boost mechanism is disabled and
 cannot be used at all.
 The only values that can be written to this file are 0 and 1.
 Rationale for Boost Control Knob
 --------------------------------
 The frequency boost mechanism is generally intended to help to achieve optimum
 CPU performance on time scales below software resolution (e.g. below the
 scheduler tick interval) and it is demonstrably suitable for many workloads, but
 it may lead to problems in certain situations.
 For this reason, many systems make it possible to disable the frequency boost
 mechanism in the platform firmware (BIOS) setup, but that requires the system to
 be restarted for the setting to be adjusted as desired, which may not be
 practical at least in some cases.  For example:
  1. Boosting means overclocking the processor, although under controlled
     conditions.  Generally, the processor's energy consumption increases
     as a result of increasing its frequency and voltage, even temporarily.
     That may not be desirable on systems that switch to power sources of
     limited capacity, such as batteries, so the ability to disable the boost
     mechanism while the system is running may help there (but that depends on
     the workload too).
  2. In some situations deterministic behavior is more important than
     performance or energy consumption (or both) and the ability to disable
     boosting while the system is running may be useful then.
  3. To examine the impact of the frequency boost mechanism itself, it is useful
     to be able to run tests with and without boosting, preferably without
     restarting the system in the meantime.
  4. Reproducible results are important when running benchmarks.  Since
     the boosting functionality depends on the load of the whole package,
     single-thread performance may vary because of it which may lead to
     unreproducible results sometimes.  That can be avoided by disabling the
     frequency boost mechanism before running benchmarks sensitive to that
     issue.
 Legacy AMD ``cpb`` Knob
 -----------------------
 The AMD powernow-k8 scaling driver supports a ``sysfs`` knob very similar to
 the global ``boost`` one.  It is used for disabling/enabling the "Core
 Performance Boost" feature of some AMD processors.
 If present, that knob is located in every ``CPUFreq`` policy directory in
 ``sysfs`` (:file:`/sys/devices/system/cpu/cpufreq/policyX/`) and is called
 ``cpb``, which indicates a more fine-grained control interface.  The actual
 implementation, however, works on the system-wide basis and setting that knob
 for one policy causes the same value of it to be set for all of the other
 policies at the same time.
 That knob is still supported on AMD processors that support its underlying
 hardware feature, but it may be configured out of the kernel (via the
 :c:macro:`CONFIG_X86_ACPI_CPUFREQ_CPB` configuration option) and the global
 ``boost`` knob is present regardless.  Thus it is always possible to use the
 ``boost`` knob instead of the ``cpb`` one which is highly recommended, as that
 is more consistent with what all of the other systems do (and the ``cpb`` knob
 may not be supported any more in the future).
 The ``cpb`` knob is never present for any processors without the underlying
 hardware feature (e.g. all Intel ones), even if the
 :c:macro:`CONFIG_X86_ACPI_CPUFREQ_CPB` configuration option is set.
 .. _Per-entity load tracking: https://lwn.net/Articles/531853/
--- a/Documentation/admin-guide/pm/index.rst
+++ b/Documentation/admin-guide/pm/index.rst
@@ -0,0 +1,16 @@
 ================
 Power Management
 ================
 .. toctree::
   :maxdepth: 2
   cpufreq
   intel_pstate
 .. only::  subproject and html
   Indices
   =======
   * :ref:`genindex`
--- a/Documentation/admin-guide/pm/intel_pstate.rst
+++ b/Documentation/admin-guide/pm/intel_pstate.rst
@@ -0,0 +1,755 @@
 ===============================================
 ``intel_pstate`` CPU Performance Scaling Driver
 ===============================================
 ::
 Copyright (c) 2017 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 General Information
 ===================
 ``intel_pstate`` is a part of the
 :doc:`CPU performance scaling subsystem <cpufreq>` in the Linux kernel
 (``CPUFreq``).  It is a scaling driver for the Sandy Bridge and later
 generations of Intel processors.  Note, however, that some of those processors
 may not be supported.  [To understand ``intel_pstate`` it is necessary to know
 how ``CPUFreq`` works in general, so this is the time to read :doc:`cpufreq` if
 you have not done that yet.]
 For the processors supported by ``intel_pstate``, the P-state concept is broader
 than just an operating frequency or an operating performance point (see the
 `LinuxCon Europe 2015 presentation by Kristen Accardi <LCEU2015_>`_ for more
 information about that).  For this reason, the representation of P-states used
 by ``intel_pstate`` internally follows the hardware specification (for details
 refer to `Intel® 64 and IA-32 Architectures Software Developer’s Manual
 Volume 3: System Programming Guide <SDM_>`_).  However, the ``CPUFreq`` core
 uses frequencies for identifying operating performance points of CPUs and
 frequencies are involved in the user space interface exposed by it, so
 ``intel_pstate`` maps its internal representation of P-states to frequencies too
 (fortunately, that mapping is unambiguous).  At the same time, it would not be
 practical for ``intel_pstate`` to supply the ``CPUFreq`` core with a table of
 available frequencies due to the possible size of it, so the driver does not do
 that.  Some functionality of the core is limited by that.
 Since the hardware P-state selection interface used by ``intel_pstate`` is
 available at the logical CPU level, the driver always works with individual
 CPUs.  Consequently, if ``intel_pstate`` is in use, every ``CPUFreq`` policy
 object corresponds to one logical CPU and ``CPUFreq`` policies are effectively
 equivalent to CPUs.  In particular, this means that they become "inactive" every
 time the corresponding CPU is taken offline and need to be re-initialized when
 it goes back online.
 ``intel_pstate`` is not modular, so it cannot be unloaded, which means that the
 only way to pass early-configuration-time parameters to it is via the kernel
 command line.  However, its configuration can be adjusted via ``sysfs`` to a
 great extent.  In some configurations it even is possible to unregister it via
 ``sysfs`` which allows another ``CPUFreq`` scaling driver to be loaded and
 registered (see `below <status_attr_>`_).
 Operation Modes
 ===============
 ``intel_pstate`` can operate in three different modes: in the active mode with
 or without hardware-managed P-states support and in the passive mode.  Which of
 them will be in effect depends on what kernel command line options are used and
 on the capabilities of the processor.
 Active Mode
 -----------
 This is the default operation mode of ``intel_pstate``.  If it works in this
 mode, the ``scaling_driver`` policy attribute in ``sysfs`` for all ``CPUFreq``
 policies contains the string "intel_pstate".
 In this mode the driver bypasses the scaling governors layer of ``CPUFreq`` and
 provides its own scaling algorithms for P-state selection.  Those algorithms
 can be applied to ``CPUFreq`` policies in the same way as generic scaling
 governors (that is, through the ``scaling_governor`` policy attribute in
 ``sysfs``).  [Note that different P-state selection algorithms may be chosen for
 different policies, but that is not recommended.]
 They are not generic scaling governors, but their names are the same as the
 names of some of those governors.  Moreover, confusingly enough, they generally
 do not work in the same way as the generic governors they share the names with.
 For example, the ``powersave`` P-state selection algorithm provided by
 ``intel_pstate`` is not a counterpart of the generic ``powersave`` governor
 (roughly, it corresponds to the ``schedutil`` and ``ondemand`` governors).
 There are two P-state selection algorithms provided by ``intel_pstate`` in the
 active mode: ``powersave`` and ``performance``.  The way they both operate
 depends on whether or not the hardware-managed P-states (HWP) feature has been
 enabled in the processor and possibly on the processor model.
 Which of the P-state selection algorithms is used by default depends on the
 :c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option.
 Namely, if that option is set, the ``performance`` algorithm will be used by
 default, and the other one will be used by default if it is not set.
 Active Mode With HWP
 ~~~~~~~~~~~~~~~~~~~~
 If the processor supports the HWP feature, it will be enabled during the
 processor initialization and cannot be disabled after that.  It is possible
 to avoid enabling it by passing the ``intel_pstate=no_hwp`` argument to the
 kernel in the command line.
 If the HWP feature has been enabled, ``intel_pstate`` relies on the processor to
 select P-states by itself, but still it can give hints to the processor's
 internal P-state selection logic.  What those hints are depends on which P-state
 selection algorithm has been applied to the given policy (or to the CPU it
 corresponds to).
 Even though the P-state selection is carried out by the processor automatically,
 ``intel_pstate`` registers utilization update callbacks with the CPU scheduler
 in this mode.  However, they are not used for running a P-state selection
 algorithm, but for periodic updates of the current CPU frequency information to
 be made available from the ``scaling_cur_freq`` policy attribute in ``sysfs``.
 HWP + ``performance``
 .....................
 In this configuration ``intel_pstate`` will write 0 to the processor's
 Energy-Performance Preference (EPP) knob (if supported) or its
 Energy-Performance Bias (EPB) knob (otherwise), which means that the processor's
 internal P-state selection logic is expected to focus entirely on performance.
 This will override the EPP/EPB setting coming from the ``sysfs`` interface
 (see `Energy vs Performance Hints`_ below).
 Also, in this configuration the range of P-states available to the processor's
 internal P-state selection logic is always restricted to the upper boundary
 (that is, the maximum P-state that the driver is allowed to use).
 HWP + ``powersave``
 ...................
 In this configuration ``intel_pstate`` will set the processor's
 Energy-Performance Preference (EPP) knob (if supported) or its
 Energy-Performance Bias (EPB) knob (otherwise) to whatever value it was
 previously set to via ``sysfs`` (or whatever default value it was
 set to by the platform firmware).  This usually causes the processor's
 internal P-state selection logic to be less performance-focused.
 Active Mode Without HWP
 ~~~~~~~~~~~~~~~~~~~~~~~
 This is the default operation mode for processors that do not support the HWP
 feature.  It also is used by default with the ``intel_pstate=no_hwp`` argument
 in the kernel command line.  However, in this mode ``intel_pstate`` may refuse
 to work with the given processor if it does not recognize it.  [Note that
 ``intel_pstate`` will never refuse to work with any processor with the HWP
 feature enabled.]
 In this mode ``intel_pstate`` registers utilization update callbacks with the
 CPU scheduler in order to run a P-state selection algorithm, either
 ``powersave`` or ``performance``, depending on the ``scaling_governor`` policy
 setting in ``sysfs``.  The current CPU frequency information to be made
 available from the ``scaling_cur_freq`` policy attribute in ``sysfs`` is
 periodically updated by those utilization update callbacks too.
 ``performance``
 ...............
 Without HWP, this P-state selection algorithm is always the same regardless of
 the processor model and platform configuration.
 It selects the maximum P-state it is allowed to use, subject to limits set via
 ``sysfs``, every time the P-state selection computations are carried out by the
 driver's utilization update callback for the given CPU (that does not happen
 more often than every 10 ms), but the hardware configuration will not be changed
 if the new P-state is the same as the current one.
 This is the default P-state selection algorithm if the
 :c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option
 is set.
 ``powersave``
 .............
 Without HWP, this P-state selection algorithm generally depends on the
 processor model and/or the system profile setting in the ACPI tables and there
 are two variants of it.
 One of them is used with processors from the Atom line and (regardless of the
 processor model) on platforms with the system profile in the ACPI tables set to
 "mobile" (laptops mostly), "tablet", "appliance PC", "desktop", or
 "workstation".  It is also used with processors supporting the HWP feature if
 that feature has not been enabled (that is, with the ``intel_pstate=no_hwp``
 argument in the kernel command line).  It is similar to the algorithm
 implemented by the generic ``schedutil`` scaling governor except that the
 utilization metric used by it is based on numbers coming from feedback
 registers of the CPU.  It generally selects P-states proportional to the
 current CPU utilization, so it is referred to as the "proportional" algorithm.
 The second variant of the ``powersave`` P-state selection algorithm, used in all
 of the other cases (generally, on processors from the Core line, so it is
 referred to as the "Core" algorithm), is based on the values read from the APERF
 and MPERF feedback registers and the previously requested target P-state.
 It does not really take CPU utilization into account explicitly, but as a rule
 it causes the CPU P-state to ramp up very quickly in response to increased
 utilization which is generally desirable in server environments.
 Regardless of the variant, this algorithm is run by the driver's utilization
 update callback for the given CPU when it is invoked by the CPU scheduler, but
 not more often than every 10 ms (that can be tweaked via ``debugfs`` in `this
 particular case <Tuning Interface in debugfs_>`_).  Like in the ``performance``
 case, the hardware configuration is not touched if the new P-state turns out to
 be the same as the current one.
 This is the default P-state selection algorithm if the
 :c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option
 is not set.
 Passive Mode
 ------------
 This mode is used if the ``intel_pstate=passive`` argument is passed to the
 kernel in the command line (it implies the ``intel_pstate=no_hwp`` setting too).
 Like in the active mode without HWP support, in this mode ``intel_pstate`` may
 refuse to work with the given processor if it does not recognize it.
 If the driver works in this mode, the ``scaling_driver`` policy attribute in
 ``sysfs`` for all ``CPUFreq`` policies contains the string "intel_cpufreq".
 Then, the driver behaves like a regular ``CPUFreq`` scaling driver.  That is,
 it is invoked by generic scaling governors when necessary to talk to the
 hardware in order to change the P-state of a CPU (in particular, the
 ``schedutil`` governor can invoke it directly from scheduler context).
 While in this mode, ``intel_pstate`` can be used with all of the (generic)
 scaling governors listed by the ``scaling_available_governors`` policy attribute
 in ``sysfs`` (and the P-state selection algorithms described above are not
 used).  Then, it is responsible for the configuration of policy objects
 corresponding to CPUs and provides the ``CPUFreq`` core (and the scaling
 governors attached to the policy objects) with accurate information on the
 maximum and minimum operating frequencies supported by the hardware (including
 the so-called "turbo" frequency ranges).  In other words, in the passive mode
 the entire range of available P-states is exposed by ``intel_pstate`` to the
 ``CPUFreq`` core.  However, in this mode the driver does not register
 utilization update callbacks with the CPU scheduler and the ``scaling_cur_freq``
 information comes from the ``CPUFreq`` core (and is the last frequency selected
 by the current scaling governor for the given policy).
 .. _turbo:
 Turbo P-states Support
 ======================
 In the majority of cases, the entire range of P-states available to
 ``intel_pstate`` can be divided into two sub-ranges that correspond to
 different types of processor behavior, above and below a boundary that
 will be referred to as the "turbo threshold" in what follows.
 The P-states above the turbo threshold are referred to as "turbo P-states" and
 the whole sub-range of P-states they belong to is referred to as the "turbo
 range".  These names are related to the Turbo Boost technology allowing a
 multicore processor to opportunistically increase the P-state of one or more
 cores if there is enough power to do that and if that is not going to cause the
 thermal envelope of the processor package to be exceeded.
 Specifically, if software sets the P-state of a CPU core within the turbo range
 (that is, above the turbo threshold), the processor is permitted to take over
 performance scaling control for that core and put it into turbo P-states of its
 choice going forward.  However, that permission is interpreted differently by
 different processor generations.  Namely, the Sandy Bridge generation of
 processors will never use any P-states above the last one set by software for
 the given core, even if it is within the turbo range, whereas all of the later
 processor generations will take it as a license to use any P-states from the
 turbo range, even above the one set by software.  In other words, on those
 processors setting any P-state from the turbo range will enable the processor
 to put the given core into all turbo P-states up to and including the maximum
 supported one as it sees fit.
 One important property of turbo P-states is that they are not sustainable.  More
 precisely, there is no guarantee that any CPUs will be able to stay in any of
 those states indefinitely, because the power distribution within the processor
 package may change over time or the thermal envelope it was designed for might
 be exceeded if a turbo P-state was used for too long.
 In turn, the P-states below the turbo threshold generally are sustainable.  In
 fact, if one of them is set by software, the processor is not expected to change
 it to a lower one unless in a thermal stress or a power limit violation
 situation (a higher P-state may still be used if it is set for another CPU in
 the same package at the same time, for example).
 Some processors allow multiple cores to be in turbo P-states at the same time,
 but the maximum P-state that can be set for them generally depends on the number
 of cores running concurrently.  The maximum turbo P-state that can be set for 3
 cores at the same time usually is lower than the analogous maximum P-state for
 2 cores, which in turn usually is lower than the maximum turbo P-state that can
 be set for 1 core.  The one-core maximum turbo P-state is thus the maximum
 supported one overall.
 The maximum supported turbo P-state, the turbo threshold (the maximum supported
 non-turbo P-state) and the minimum supported P-state are specific to the
 processor model and can be determined by reading the processor's model-specific
 registers (MSRs).  Moreover, some processors support the Configurable TDP
 (Thermal Design Power) feature and, when that feature is enabled, the turbo
 threshold effectively becomes a configurable value that can be set by the
 platform firmware.
 Unlike ``_PSS`` objects in the ACPI tables, ``intel_pstate`` always exposes
 the entire range of available P-states, including the whole turbo range, to the
 ``CPUFreq`` core and (in the passive mode) to generic scaling governors.  This
 generally causes turbo P-states to be set more often when ``intel_pstate`` is
 used relative to ACPI-based CPU performance scaling (see `below <acpi-cpufreq_>`_
 for more information).
 Moreover, since ``intel_pstate`` always knows what the real turbo threshold is
 (even if the Configurable TDP feature is enabled in the processor), its
 ``no_turbo`` attribute in ``sysfs`` (described `below <no_turbo_attr_>`_) should
 work as expected in all cases (that is, if set to disable turbo P-states, it
 always should prevent ``intel_pstate`` from using them).
 Processor Support
 =================
 To handle a given processor ``intel_pstate`` requires a number of different
 pieces of information on it to be known, including:
 * The minimum supported P-state.
 * The maximum supported `non-turbo P-state <turbo_>`_.
 * Whether or not turbo P-states are supported at all.
 * The maximum supported `one-core turbo P-state <turbo_>`_ (if turbo P-states
   are supported).
 * The scaling formula to translate the driver's internal representation
   of P-states into frequencies and the other way around.
 Generally, ways to obtain that information are specific to the processor model
 or family.  Although it often is possible to obtain all of it from the processor
 itself (using model-specific registers), there are cases in which hardware
 manuals need to be consulted to get to it too.
 For this reason, there is a list of supported processors in ``intel_pstate`` and
 the driver initialization will fail if the detected processor is not in that
 list, unless it supports the `HWP feature <Active Mode_>`_.  [The interface to
 obtain all of the information listed above is the same for all of the processors
 supporting the HWP feature, which is why they all are supported by
 ``intel_pstate``.]
 User Space Interface in ``sysfs``
 =================================
 Global Attributes
 -----------------
 ``intel_pstate`` exposes several global attributes (files) in ``sysfs`` to
 control its functionality at the system level.  They are located in the
 ``/sys/devices/system/cpu/cpufreq/intel_pstate/`` directory and affect all
 CPUs.
 Some of them are not present if the ``intel_pstate=per_cpu_perf_limits``
 argument is passed to the kernel in the command line.
 ``max_perf_pct``
 	Maximum P-state the driver is allowed to set in percent of the
 	maximum supported performance level (the highest supported `turbo
 	P-state <turbo_>`_).
 	This attribute will not be exposed if the
 	``intel_pstate=per_cpu_perf_limits`` argument is present in the kernel
 	command line.
 ``min_perf_pct``
 	Minimum P-state the driver is allowed to set in percent of the
 	maximum supported performance level (the highest supported `turbo
 	P-state <turbo_>`_).
 	This attribute will not be exposed if the
 	``intel_pstate=per_cpu_perf_limits`` argument is present in the kernel
 	command line.
 ``num_pstates``
 	Number of P-states supported by the processor (between 0 and 255
 	inclusive) including both turbo and non-turbo P-states (see
 	`Turbo P-states Support`_).
 	The value of this attribute is not affected by the ``no_turbo``
 	setting described `below <no_turbo_attr_>`_.
 	This attribute is read-only.
 ``turbo_pct``
 	Ratio of the `turbo range <turbo_>`_ size to the size of the entire
 	range of supported P-states, in percent.
 	This attribute is read-only.
 .. _no_turbo_attr:
 ``no_turbo``
 	If set (equal to 1), the driver is not allowed to set any turbo P-states
 	(see `Turbo P-states Support`_).  If unset (equal to 0, which is the
 	default), turbo P-states can be set by the driver.
 	[Note that ``intel_pstate`` does not support the general ``boost``
 	attribute (supported by some other scaling drivers) which is replaced
 	by this one.]
 	This attribute does not affect the maximum supported frequency value
 	supplied to the ``CPUFreq`` core and exposed via the policy interface,
 	but it affects the maximum possible value of per-policy P-state limits
 	(see `Interpretation of Policy Attributes`_ below for details).
 .. _status_attr:
 ``status``
 	Operation mode of the driver: "active", "passive" or "off".
 	"active"
 		The driver is functional and in the `active mode
 		<Active Mode_>`_.
 	"passive"
 		The driver is functional and in the `passive mode
 		<Passive Mode_>`_.
 	"off"
 		The driver is not functional (it is not registered as a scaling
 		driver with the ``CPUFreq`` core).
 	This attribute can be written to in order to change the driver's
 	operation mode or to unregister it.  The string written to it must be
 	one of the possible values of it and, if successful, the write will
 	cause the driver to switch over to the operation mode represented by
 	that string - or to be unregistered in the "off" case.  [Actually,
 	switching over from the active mode to the passive mode or the other
 	way around causes the driver to be unregistered and registered again
 	with a different set of callbacks, so all of its settings (the global
 	as well as the per-policy ones) are then reset to their default
 	values, possibly depending on the target operation mode.]
 	That only is supported in some configurations, though (for example, if
 	the `HWP feature is enabled in the processor <Active Mode With HWP_>`_,
 	the operation mode of the driver cannot be changed), and if it is not
 	supported in the current configuration, writes to this attribute will
 	fail with an appropriate error.
 Interpretation of Policy Attributes
 -----------------------------------
 The interpretation of some ``CPUFreq`` policy attributes described in
 :doc:`cpufreq` is special with ``intel_pstate`` as the current scaling driver
 and it generally depends on the driver's `operation mode <Operation Modes_>`_.
 First of all, the values of the ``cpuinfo_max_freq``, ``cpuinfo_min_freq`` and
 ``scaling_cur_freq`` attributes are produced by applying a processor-specific
 multiplier to the internal P-state representation used by ``intel_pstate``.
 Also, the values of the ``scaling_max_freq`` and ``scaling_min_freq``
 attributes are capped by the frequency corresponding to the maximum P-state that
 the driver is allowed to set.
 If the ``no_turbo`` `global attribute <no_turbo_attr_>`_ is set, the driver is
 not allowed to use turbo P-states, so the maximum value of ``scaling_max_freq``
 and ``scaling_min_freq`` is limited to the maximum non-turbo P-state frequency.
 Accordingly, setting ``no_turbo`` causes ``scaling_max_freq`` and
 ``scaling_min_freq`` to go down to that value if they were above it before.
 However, the old values of ``scaling_max_freq`` and ``scaling_min_freq`` will be
 restored after unsetting ``no_turbo``, unless these attributes have been written
 to after ``no_turbo`` was set.
 If ``no_turbo`` is not set, the maximum possible value of ``scaling_max_freq``
 and ``scaling_min_freq`` corresponds to the maximum supported turbo P-state,
 which also is the value of ``cpuinfo_max_freq`` in either case.
 Next, the following policy attributes have special meaning if
 ``intel_pstate`` works in the `active mode <Active Mode_>`_:
 ``scaling_available_governors``
 	List of P-state selection algorithms provided by ``intel_pstate``.
 ``scaling_governor``
 	P-state selection algorithm provided by ``intel_pstate`` currently in
 	use with the given policy.
 ``scaling_cur_freq``
 	Frequency of the average P-state of the CPU represented by the given
 	policy for the time interval between the last two invocations of the
 	driver's utilization update callback by the CPU scheduler for that CPU.
 The meaning of these attributes in the `passive mode <Passive Mode_>`_ is the
 same as for other scaling drivers.
 Additionally, the value of the ``scaling_driver`` attribute for ``intel_pstate``
 depends on the operation mode of the driver.  Namely, it is either
 "intel_pstate" (in the `active mode <Active Mode_>`_) or "intel_cpufreq" (in the
 `passive mode <Passive Mode_>`_).
 Coordination of P-State Limits
 ------------------------------
 ``intel_pstate`` allows P-state limits to be set in two ways: with the help of
 the ``max_perf_pct`` and ``min_perf_pct`` `global attributes
 <Global Attributes_>`_ or via the ``scaling_max_freq`` and ``scaling_min_freq``
 ``CPUFreq`` policy attributes.  The coordination between those limits is based
 on the following rules, regardless of the current operation mode of the driver:
 1. All CPUs are affected by the global limits (that is, none of them can be
    requested to run faster than the global maximum and none of them can be
    requested to run slower than the global minimum).
 2. Each individual CPU is affected by its own per-policy limits (that is, it
    cannot be requested to run faster than its own per-policy maximum and it
    cannot be requested to run slower than its own per-policy minimum).
 3. The global and per-policy limits can be set independently.
 If the `HWP feature is enabled in the processor <Active Mode With HWP_>`_, the
 resulting effective values are written into its registers whenever the limits
 change in order to request its internal P-state selection logic to always set
 P-states within these limits.  Otherwise, the limits are taken into account by
 scaling governors (in the `passive mode <Passive Mode_>`_) and by the driver
 every time before setting a new P-state for a CPU.
 Additionally, if the ``intel_pstate=per_cpu_perf_limits`` command line argument
 is passed to the kernel, ``max_perf_pct`` and ``min_perf_pct`` are not exposed
 at all and the only way to set the limits is by using the policy attributes.
 Energy vs Performance Hints
 ---------------------------
 If ``intel_pstate`` works in the `active mode with the HWP feature enabled
 <Active Mode With HWP_>`_ in the processor, additional attributes are present
 in every ``CPUFreq`` policy directory in ``sysfs``.  They are intended to allow
 user space to help ``intel_pstate`` to adjust the processor's internal P-state
 selection logic by focusing it on performance or on energy-efficiency, or
 somewhere between the two extremes:
 ``energy_performance_preference``
 	Current value of the energy vs performance hint for the given policy
 	(or the CPU represented by it).
 	The hint can be changed by writing to this attribute.
 ``energy_performance_available_preferences``
 	List of strings that can be written to the
 	``energy_performance_preference`` attribute.
 	They represent different energy vs performance hints and should be
 	self-explanatory, except that ``default`` represents whatever hint
 	value was set by the platform firmware.
 Strings written to the ``energy_performance_preference`` attribute are
 internally translated to integer values written to the processor's
 Energy-Performance Preference (EPP) knob (if supported) or its
 Energy-Performance Bias (EPB) knob.
 [Note that tasks may be migrated from one CPU to another by the scheduler's
 load-balancing algorithm and if different energy vs performance hints are
 set for those CPUs, that may lead to undesirable outcomes.  To avoid such
 issues it is better to set the same energy vs performance hint for all CPUs
 or to pin every task potentially sensitive to them to a specific CPU.]
 .. _acpi-cpufreq:
 ``intel_pstate`` vs ``acpi-cpufreq``
 ====================================
 On the majority of systems supported by ``intel_pstate``, the ACPI tables
 provided by the platform firmware contain ``_PSS`` objects returning information
 that can be used for CPU performance scaling (refer to the `ACPI specification`_
 for details on the ``_PSS`` objects and the format of the information returned
 by them).
 The information returned by the ACPI ``_PSS`` objects is used by the
 ``acpi-cpufreq`` scaling driver.  On systems supported by ``intel_pstate``
 the ``acpi-cpufreq`` driver uses the same hardware CPU performance scaling
 interface, but the set of P-states it can use is limited by the ``_PSS``
 output.
 On those systems each ``_PSS`` object returns a list of P-states supported by
 the corresponding CPU which basically is a subset of the P-states range that can
 be used by ``intel_pstate`` on the same system, with one exception: the whole
 `turbo range <turbo_>`_ is represented by one item in it (the topmost one).  By
 convention, the frequency returned by ``_PSS`` for that item is greater by 1 MHz
 than the frequency of the highest non-turbo P-state listed by it, but the
 corresponding P-state representation (following the hardware specification)
 returned for it matches the maximum supported turbo P-state (or is the
 special value 255 meaning essentially "go as high as you can get").
 The list of P-states returned by ``_PSS`` is reflected by the table of
 available frequencies supplied by ``acpi-cpufreq`` to the ``CPUFreq`` core and
 scaling governors and the minimum and maximum supported frequencies reported by
 it come from that list as well.  In particular, given the special representation
 of the turbo range described above, this means that the maximum supported
 frequency reported by ``acpi-cpufreq`` is higher by 1 MHz than the frequency
 of the highest supported non-turbo P-state listed by ``_PSS`` which, of course,
 affects decisions made by the scaling governors, except for ``powersave`` and
 ``performance``.
 For example, if a given governor attempts to select a frequency proportional to
 estimated CPU load and maps the load of 100% to the maximum supported frequency
 (possibly multiplied by a constant), then it will tend to choose P-states below
 the turbo threshold if ``acpi-cpufreq`` is used as the scaling driver, because
 in that case the turbo range corresponds to a small fraction of the frequency
 band it can use (1 MHz vs 1 GHz or more).  In consequence, it will only go to
 the turbo range for the highest loads and the other loads above 50% that might
 benefit from running at turbo frequencies will be given non-turbo P-states
 instead.
 One more issue related to that may appear on systems supporting the
 `Configurable TDP feature <turbo_>`_ allowing the platform firmware to set the
 turbo threshold.  Namely, if that is not coordinated with the lists of P-states
 returned by ``_PSS`` properly, there may be more than one item corresponding to
 a turbo P-state in those lists and there may be a problem with avoiding the
 turbo range (if desirable or necessary).  Usually, to avoid using turbo
 P-states overall, ``acpi-cpufreq`` simply avoids using the topmost state listed
 by ``_PSS``, but that is not sufficient when there are other turbo P-states in
 the list returned by it.
 Apart from the above, ``acpi-cpufreq`` works like ``intel_pstate`` in the
 `passive mode <Passive Mode_>`_, except that the number of P-states it can set
 is limited to the ones listed by the ACPI ``_PSS`` objects.
 Kernel Command Line Options for ``intel_pstate``
 ================================================
 Several kernel command line options can be used to pass early-configuration-time
 parameters to ``intel_pstate`` in order to enforce specific behavior of it.  All
 of them have to be prepended with the ``intel_pstate=`` prefix.
 ``disable``
 	Do not register ``intel_pstate`` as the scaling driver even if the
 	processor is supported by it.
 ``passive``
 	Register ``intel_pstate`` in the `passive mode <Passive Mode_>`_ to
 	start with.
 	This option implies the ``no_hwp`` one described below.
 ``force``
 	Register ``intel_pstate`` as the scaling driver instead of
 	``acpi-cpufreq`` even if the latter is preferred on the given system.
 	This may prevent some platform features (such as thermal controls and
 	power capping) that rely on the availability of ACPI P-states
 	information from functioning as expected, so it should be used with
 	caution.
 	This option does not work with processors that are not supported by
 	``intel_pstate`` and on platforms where the ``pcc-cpufreq`` scaling
 	driver is used instead of ``acpi-cpufreq``.
 ``no_hwp``
 	Do not enable the `hardware-managed P-states (HWP) feature
 	<Active Mode With HWP_>`_ even if it is supported by the processor.
 ``hwp_only``
 	Register ``intel_pstate`` as the scaling driver only if the
 	`hardware-managed P-states (HWP) feature <Active Mode With HWP_>`_ is
 	supported by the processor.
 ``support_acpi_ppc``
 	Take ACPI ``_PPC`` performance limits into account.
 	If the preferred power management profile in the FADT (Fixed ACPI
 	Description Table) is set to "Enterprise Server" or "Performance
 	Server", the ACPI ``_PPC`` limits are taken into account by default
 	and this option has no effect.
 ``per_cpu_perf_limits``
 	Use per-logical-CPU P-State limits (see `Coordination of P-state
 	Limits`_ for details).
 Diagnostics and Tuning
 ======================
 Trace Events
 ------------
 There are two static trace events that can be used for ``intel_pstate``
 diagnostics.  One of them is the ``cpu_frequency`` trace event generally used
 by ``CPUFreq``, and the other one is the ``pstate_sample`` trace event specific
 to ``intel_pstate``.  Both of them are triggered by ``intel_pstate`` only if
 it works in the `active mode <Active Mode_>`_.
 The following sequence of shell commands can be used to enable them and see
 their output (if the kernel is generally configured to support event tracing)::
 # cd /sys/kernel/debug/tracing/
 # echo 1 > events/power/pstate_sample/enable
 # echo 1 > events/power/cpu_frequency/enable
 # cat trace
 gnome-terminal--4510  [001] ..s.  1177.680733: pstate_sample: core_busy=107 scaled=94 from=26 to=26 mperf=1143818 aperf=1230607 tsc=29838618 freq=2474476
 cat-5235  [002] ..s.  1177.681723: cpu_frequency: state=2900000 cpu_id=2
 If ``intel_pstate`` works in the `passive mode <Passive Mode_>`_, the
 ``cpu_frequency`` trace event will be triggered either by the ``schedutil``
 scaling governor (for the policies it is attached to), or by the ``CPUFreq``
 core (for the policies with other scaling governors).
 ``ftrace``
 ----------
 The ``ftrace`` interface can be used for low-level diagnostics of
 ``intel_pstate``.  For example, to check how often the function to set a
 P-state is called, the ``ftrace`` filter can be set to
 :c:func:`intel_pstate_set_pstate`::
 # cd /sys/kernel/debug/tracing/
 # cat available_filter_functions | grep -i pstate
 intel_pstate_set_pstate
 intel_pstate_cpu_init
 ...
 # echo intel_pstate_set_pstate > set_ftrace_filter
 # echo function > current_tracer
 # cat trace | head -15
 # tracer: function
 #
 # entries-in-buffer/entries-written: 80/80   #P:4
 #
 #                              _-----=> irqs-off
 #                             / _----=> need-resched
 #                            | / _---=> hardirq/softirq
 #                            || / _--=> preempt-depth
 #                            ||| /     delay
 #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
 #              | |       |   ||||       |         |
             Xorg-3129  [000] ..s.  2537.644844: intel_pstate_set_pstate <-intel_pstate_timer_func
  gnome-terminal--4510  [002] ..s.  2537.649844: intel_pstate_set_pstate <-intel_pstate_timer_func
      gnome-shell-3409  [001] ..s.  2537.650850: intel_pstate_set_pstate <-intel_pstate_timer_func
           <idle>-0     [000] ..s.  2537.654843: intel_pstate_set_pstate <-intel_pstate_timer_func
 Tuning Interface in ``debugfs``
 -------------------------------
 The ``powersave`` algorithm provided by ``intel_pstate`` for `the Core line of
 processors in the active mode <powersave_>`_ is based on a `PID controller`_
 whose parameters were chosen to address a number of different use cases at the
 same time.  However, it still is possible to fine-tune it to a specific workload
 and the ``debugfs`` interface under ``/sys/kernel/debug/pstate_snb/`` is
 provided for this purpose.  [Note that the ``pstate_snb`` directory will be
 present only if the specific P-state selection algorithm matching the interface
 in it actually is in use.]
 The following files present in that directory can be used to modify the PID
 controller parameters at run time:
 | ``deadband``
 | ``d_gain_pct``
 | ``i_gain_pct``
 | ``p_gain_pct``
 | ``sample_rate_ms``
 | ``setpoint``
 Note, however, that achieving desirable results this way generally requires
 expert-level understanding of the power vs performance tradeoff, so extra care
 is recommended when attempting to do that.
 .. _LCEU2015: http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf
 .. _SDM: http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-system-programming-manual-325384.html
 .. _ACPI specification: http://www.uefi.org/sites/default/files/resources/ACPI_6_1.pdf
 .. _PID controller: https://en.wikipedia.org/wiki/PID_controller
--- a/Documentation/admin-guide/ras.rst
+++ b/Documentation/admin-guide/ras.rst
@@ -8,7 +8,7 @@ RAS concepts
 ************
 Reliability, Availability and Serviceability (RAS) is a concept used on
-servers meant to measure their robusteness.
+servers meant to measure their robustness.
 Reliability
  is the probability that a system will produce correct outputs.
@@ -42,13 +42,13 @@ Among the monitoring measures, the most usual ones include:
 * CPU – detect errors at instruction execution and at L1/L2/L3 caches;
 * Memory – add error correction logic (ECC) to detect and correct errors;
-* I/O – add CRC checksums for tranfered data;
+* I/O – add CRC checksums for transferred data;
 * Storage – RAID, journal file systems, checksums,
  Self-Monitoring, Analysis and Reporting Technology (SMART).
 By monitoring the number of occurrences of error detections, it is possible
 to identify if the probability of hardware errors is increasing, and, on such
-case, do a preventive maintainance to replace a degrated component while
+case, do a preventive maintenance to replace a degraded component while
 those errors are correctable.
 Types of errors
@@ -121,7 +121,7 @@ using the ``dmidecode`` tool. For example, on a desktop machine, it shows::
 On the above example, a DDR4 SO-DIMM memory module is located at the
 system's memory labeled as "BANK 0", as given by the *bank locator* field.
 Please notice that, on such system, the *total width* is equal to the
-*data witdh*. It means that such memory module doesn't have error
+*data width*. It means that such memory module doesn't have error
 detection/correction mechanisms.
 Unfortunately, not all systems use the same field to specify the memory
@@ -145,7 +145,7 @@ bank. On this example, from an older server, ``dmidecode`` shows::
 There, the DDR3 RDIMM memory module is located at the system's memory labeled
 as "DIMM_A1", as given by the *locator* field. Please notice that this
-memory module has 64 bits of *data witdh* and 72 bits of *total width*. So,
+memory module has 64 bits of *data width* and 72 bits of *total width*. So,
 it has 8 extra bits to be used by error detection and correction mechanisms.
 Such kind of memory is called Error-correcting code memory (ECC memory).
@@ -186,7 +186,7 @@ Architecture (MCA)\ [#f3]_.
 .. [#f1] Please notice that several memory controllers allow operation on a
  mode called "Lock-Step", where it groups two memory modules together,
  doing 128-bit reads/writes. That gives 16 bits for error correction, with
-  significatively improves the error correction mechanism, at the expense
+  significantly improves the error correction mechanism, at the expense
  that, when an error happens, there's no way to know what memory module is
  to blame. So, it has to blame both memory modules.
--- a/Documentation/admin-guide/security-bugs.rst
+++ b/Documentation/admin-guide/security-bugs.rst
@@ -14,14 +14,17 @@ Contact
 The Linux kernel security team can be contacted by email at
 <security@kernel.org>.  This is a private list of security officers
 who will help verify the bug report and develop and release a fix.
-It is possible that the security team will bring in extra help from
+If you already have a fix, please include it with your report, as
-area maintainers to understand and fix the security vulnerability.
+that can speed up the process considerably.  It is possible that the
 security team will bring in extra help from area maintainers to
 understand and fix the security vulnerability.
 As it is with any bug, the more information provided the easier it
 will be to diagnose and fix.  Please review the procedure outlined in
-admin-guide/reporting-bugs.rst if you are unclear about what information is helpful.
+admin-guide/reporting-bugs.rst if you are unclear about what
-Any exploit code is very helpful and will not be released without
+information is helpful.  Any exploit code is very helpful and will not
-consent from the reporter unless it has already been made public.
+be released without consent from the reporter unless it has already been
 made public.
 Disclosure
 ----------
@@ -39,6 +42,32 @@ disclosure is from immediate (esp. if it's already publicly known)
 to a few weeks.  As a basic default policy, we expect report date to
 disclosure date to be on the order of 7 days.
 Coordination
 ------------
 Fixes for sensitive bugs, such as those that might lead to privilege
 escalations, may need to be coordinated with the private
 <linux-distros@vs.openwall.org> mailing list so that distribution vendors
 are well prepared to issue a fixed kernel upon public disclosure of the
 upstream fix. Distros will need some time to test the proposed patch and
 will generally request at least a few days of embargo, and vendor update
 publication prefers to happen Tuesday through Thursday. When appropriate,
 the security team can assist with this coordination, or the reporter can
 include linux-distros from the start. In this case, remember to prefix
 the email Subject line with "[vs]" as described in the linux-distros wiki:
 <http://oss-security.openwall.org/wiki/mailing-lists/distros#how-to-use-the-lists>
 CVE assignment
 --------------
 The security team does not normally assign CVEs, nor do we require them
 for reports or fixes, as this can needlessly complicate the process and
 may delay the bug handling. If a reporter wishes to have a CVE identifier
 assigned ahead of public disclosure, they will need to contact the private
 linux-distros list, described above. When such a CVE identifier is known
 before a patch is provided, it is desirable to mention it in the commit
 message, though.
 Non-disclosure agreements
 -------------------------
--- a/Documentation/admin-guide/sysrq.rst
+++ b/Documentation/admin-guide/sysrq.rst
@@ -212,7 +212,8 @@ I hit SysRq, but nothing seems to happen, what's wrong?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 There are some keyboards that produce a different keycode for SysRq than the
-pre-defined value of 99 (see ``KEY_SYSRQ`` in ``include/linux/input.h``), or
+pre-defined value of 99
 (see ``KEY_SYSRQ`` in ``include/uapi/linux/input-event-codes.h``), or
 which don't have a SysRq key at all. In these cases, run ``showkey -s`` to find
 an appropriate scancode sequence, and use ``setkeycodes <sequence> 99`` to map
 this sequence to the usual SysRq code (e.g., ``setkeycodes e05b 99``). It's
--- a/Documentation/arm/mem_alignment
+++ b/Documentation/arm/mem_alignment
@@ -48,7 +48,7 @@ Note that not all combinations are supported - only values 0 through 5.
 For example, the following will turn on the warnings, but without
 fixing up or sending SIGBUS signals:
-	echo 1 > /proc/sys/debug/alignment
+	echo 1 > /proc/cpu/alignment
 You can also read the content of the same file to get statistical
 information on unaligned access occurrences plus the current mode of
--- a/Documentation/arm/stm32/stm32h743-overview.txt
+++ b/Documentation/arm/stm32/stm32h743-overview.txt
@@ -0,0 +1,30 @@
 			STM32H743 Overview
 			==================
  Introduction
  ------------
 	The STM32H743 is a Cortex-M7 MCU aimed at various applications.
 	It features:
 	- Cortex-M7 core running at up to 400MHz
 	- 2MB internal flash, 1MBytes internal RAM
 	- FMC controller to connect SDRAM, NOR and NAND memories
 	- Dual mode QSPI
 	- SD/MMC/SDIO support
 	- Ethernet controller
 	- USB OTG FS & HS controllers
 	- I2C, SPI, CAN busses support
 	- Several 16 & 32 bits general purpose timers
 	- Serial Audio interface
 	- LCD controller
 	- HDMI-CEC
 	- SPDIFRX
 	- DFSDM
  Resources
  ---------
 	Datasheet and reference manual are publicly available on ST website:
 	- http://www.st.com/en/microcontrollers/stm32h7x3.html?querycriteria=productId=LN2033
  Document Author
  ---------------
 	Alexandre Torgue <alexandre.torgue@st.com>
--- a/Documentation/arm64/cpu-feature-registers.txt
+++ b/Documentation/arm64/cpu-feature-registers.txt
@@ -169,6 +169,18 @@ infrastructure:
   as available on the CPU where it is fetched and is not a system
   wide safe value.
  4) ID_AA64ISAR1_EL1 - Instruction set attribute register 1
     x--------------------------------------------------x
     | Name                         |  bits   | visible |
     |--------------------------------------------------|
     | LRCPC                        | [23-20] |    y    |
     |--------------------------------------------------|
     | FCMA                         | [19-16] |    y    |
     |--------------------------------------------------|
     | JSCVT                        | [15-12] |    y    |
     x--------------------------------------------------x
 Appendix I: Example
 ---------------------------
--- a/Documentation/arm64/silicon-errata.txt
+++ b/Documentation/arm64/silicon-errata.txt
@@ -54,6 +54,7 @@ stable kernels.
 | ARM            | Cortex-A57      | #852523         | N/A                         |
 | ARM            | Cortex-A57      | #834220         | ARM64_ERRATUM_834220        |
 | ARM            | Cortex-A72      | #853709         | N/A                         |
 | ARM            | Cortex-A73      | #858921         | ARM64_ERRATUM_858921        |
 | ARM            | MMU-500         | #841119,#826419 | N/A                         |
 |                |                 |                 |                             |
 | Cavium         | ThunderX ITS    | #22375, #24313  | CAVIUM_ERRATUM_22375        |
--- a/Documentation/arm64/tagged-pointers.txt
+++ b/Documentation/arm64/tagged-pointers.txt
@@ -11,24 +11,56 @@ in AArch64 Linux.
 The kernel configures the translation tables so that translations made
 via TTBR0 (i.e. userspace mappings) have the top byte (bits 63:56) of
 the virtual address ignored by the translation hardware. This frees up
-this byte for application use, with the following caveats:
+this byte for application use.
 	(1) The kernel requires that all user addresses passed to EL1
 	    are tagged with tag 0x00. This means that any syscall
 	    parameters containing user virtual addresses *must* have
 	    their top byte cleared before trapping to the kernel.
-	(2) Non-zero tags are not preserved when delivering signals.
+Passing tagged addresses to the kernel
-	    This means that signal handlers in applications making use
+--------------------------------------
 	    of tags cannot rely on the tag information for user virtual
 	    addresses being maintained for fields inside siginfo_t.
 	    One exception to this rule is for signals raised in response
 	    to watchpoint debug exceptions, where the tag information
 	    will be preserved.
-	(3) Special care should be taken when using tagged pointers,
+All interpretation of userspace memory addresses by the kernel assumes
-	    since it is likely that C compilers will not hazard two
+an address tag of 0x00.
-	    virtual addresses differing only in the upper byte.
+
 This includes, but is not limited to, addresses found in:
 - pointer arguments to system calls, including pointers in structures
   passed to system calls,
 - the stack pointer (sp), e.g. when interpreting it to deliver a
   signal,
 - the frame pointer (x29) and frame records, e.g. when interpreting
   them to generate a backtrace or call graph.
 Using non-zero address tags in any of these locations may result in an
 error code being returned, a (fatal) signal being raised, or other modes
 of failure.
 For these reasons, passing non-zero address tags to the kernel via
 system calls is forbidden, and using a non-zero address tag for sp is
 strongly discouraged.
 Programs maintaining a frame pointer and frame records that use non-zero
 address tags may suffer impaired or inaccurate debug and profiling
 visibility.
 Preserving tags
 ---------------
 Non-zero tags are not preserved when delivering signals. This means that
 signal handlers in applications making use of tags cannot rely on the
 tag information for user virtual addresses being maintained for fields
 inside siginfo_t. One exception to this rule is for signals raised in
 response to watchpoint debug exceptions, where the tag information will
 be preserved.
 The architecture prevents the use of a tagged PC, so the upper byte will
 be set to a sign-extension of bit 55 on exception return.
 Other considerations
 --------------------
 Special care should be taken when using tagged pointers, since it is
 likely that C compilers will not hazard two virtual addresses differing
 only in the upper byte.
--- a/Documentation/block/00-INDEX
+++ b/Documentation/block/00-INDEX
@@ -1,5 +1,7 @@
 00-INDEX
 	- This file
 bfq-iosched.txt
 	- BFQ IO scheduler and its tunables
 biodoc.txt
 	- Notes on the Generic Block Layer Rewrite in Linux 2.5
 biovecs.txt
--- a/Documentation/block/bfq-iosched.txt
+++ b/Documentation/block/bfq-iosched.txt
@@ -0,0 +1,546 @@
 BFQ (Budget Fair Queueing)
 ==========================
 BFQ is a proportional-share I/O scheduler, with some extra
 low-latency capabilities. In addition to cgroups support (blkio or io
 controllers), BFQ's main features are:
 - BFQ guarantees a high system and application responsiveness, and a
  low latency for time-sensitive applications, such as audio or video
  players;
 - BFQ distributes bandwidth, and not just time, among processes or
  groups (switching back to time distribution when needed to keep
  throughput high).
 In its default configuration, BFQ privileges latency over
 throughput. So, when needed for achieving a lower latency, BFQ builds
 schedules that may lead to a lower throughput. If your main or only
 goal, for a given device, is to achieve the maximum-possible
 throughput at all times, then do switch off all low-latency heuristics
 for that device, by setting low_latency to 0. Full details in Section 3.
 On average CPUs, the current version of BFQ can handle devices
 performing at most ~30 KIOPS; at most ~50 KIOPS on faster CPUs. As a
 reference, 30-50 KIOPS correspond to very high bandwidths with
 sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and
 to 120-200 MB/s with 4KB random I/O. BFQ has not yet been tested on
 multi-queue devices.
 The table of contents follows. Impatient readers can just jump to Section 3.
 CONTENTS
 1. When may BFQ be useful?
 1-1 Personal systems
 1-2 Server systems
 2. How does BFQ work?
 3. What are BFQ's tunables?
 4. BFQ group scheduling
 4-1 Service guarantees provided
 4-2 Interface
 1. When may BFQ be useful?
 ==========================
 BFQ provides the following benefits on personal and server systems.
 1-1 Personal systems
 --------------------
 Low latency for interactive applications
 Regardless of the actual background workload, BFQ guarantees that, for
 interactive tasks, the storage device is virtually as responsive as if
 it was idle. For example, even if one or more of the following
 background workloads are being executed:
 - one or more large files are being read, written or copied,
 - a tree of source files is being compiled,
 - one or more virtual machines are performing I/O,
 - a software update is in progress,
 - indexing daemons are scanning filesystems and updating their
  databases,
 starting an application or loading a file from within an application
 takes about the same time as if the storage device was idle. As a
 comparison, with CFQ, NOOP or DEADLINE, and in the same conditions,
 applications experience high latencies, or even become unresponsive
 until the background workload terminates (also on SSDs).
 Low latency for soft real-time applications
 Also soft real-time applications, such as audio and video
 players/streamers, enjoy a low latency and a low drop rate, regardless
 of the background I/O workload. As a consequence, these applications
 do not suffer from almost any glitch due to the background workload.
 Higher speed for code-development tasks
 If some additional workload happens to be executed in parallel, then
 BFQ executes the I/O-related components of typical code-development
 tasks (compilation, checkout, merge, ...) much more quickly than CFQ,
 NOOP or DEADLINE.
 High throughput
 On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and
 up to 150% higher throughput than DEADLINE and NOOP, with all the
 sequential workloads considered in our tests. With random workloads,
 and with all the workloads on flash-based devices, BFQ achieves,
 instead, about the same throughput as the other schedulers.
 Strong fairness, bandwidth and delay guarantees
 BFQ distributes the device throughput, and not just the device time,
 among I/O-bound applications in proportion to their weights, with any
 workload and regardless of the device parameters. From these bandwidth
 guarantees, it is possible to compute tight per-I/O-request delay
 guarantees by a simple formula. If not configured for strict service
 guarantees, BFQ switches to time-based resource sharing (only) for
 applications that would otherwise cause a throughput loss.
 1-2 Server systems
 ------------------
 Most benefits for server systems follow from the same service
 properties as above. In particular, regardless of whether additional,
 possibly heavy workloads are being served, BFQ guarantees:
 . audio and video-streaming with zero or very low jitter and drop
  rate;
 . fast retrieval of WEB pages and embedded objects;
 . real-time recording of data in live-dumping applications (e.g.,
  packet logging);
 . responsiveness in local and remote access to a server.
 2. How does BFQ work?
 =====================
 BFQ is a proportional-share I/O scheduler, whose general structure,
 plus a lot of code, are borrowed from CFQ.
 - Each process doing I/O on a device is associated with a weight and a
  (bfq_)queue.
 - BFQ grants exclusive access to the device, for a while, to one queue
  (process) at a time, and implements this service model by
  associating every queue with a budget, measured in number of
  sectors.
  - After a queue is granted access to the device, the budget of the
    queue is decremented, on each request dispatch, by the size of the
    request.
  - The in-service queue is expired, i.e., its service is suspended,
    only if one of the following events occurs: 1) the queue finishes
    its budget, 2) the queue empties, 3) a "budget timeout" fires.
    - The budget timeout prevents processes doing random I/O from
      holding the device for too long and dramatically reducing
      throughput.
    - Actually, as in CFQ, a queue associated with a process issuing
      sync requests may not be expired immediately when it empties. In
      contrast, BFQ may idle the device for a short time interval,
      giving the process the chance to go on being served if it issues
      a new request in time. Device idling typically boosts the
      throughput on rotational devices, if processes do synchronous
      and sequential I/O. In addition, under BFQ, device idling is
      also instrumental in guaranteeing the desired throughput
      fraction to processes issuing sync requests (see the description
      of the slice_idle tunable in this document, or [1, 2], for more
      details).
      - With respect to idling for service guarantees, if several
 	processes are competing for the device at the same time, but
 	all processes (and groups, after the following commit) have
 	the same weight, then BFQ guarantees the expected throughput
 	distribution without ever idling the device. Throughput is
 	thus as high as possible in this common scenario.
  - If low-latency mode is enabled (default configuration), BFQ
    executes some special heuristics to detect interactive and soft
    real-time applications (e.g., video or audio players/streamers),
    and to reduce their latency. The most important action taken to
    achieve this goal is to give to the queues associated with these
    applications more than their fair share of the device
    throughput. For brevity, we call just "weight-raising" the whole
    sets of actions taken by BFQ to privilege these queues. In
    particular, BFQ provides a milder form of weight-raising for
    interactive applications, and a stronger form for soft real-time
    applications.
  - BFQ automatically deactivates idling for queues born in a burst of
    queue creations. In fact, these queues are usually associated with
    the processes of applications and services that benefit mostly
    from a high throughput. Examples are systemd during boot, or git
    grep.
  - As CFQ, BFQ merges queues performing interleaved I/O, i.e.,
    performing random I/O that becomes mostly sequential if
    merged. Differently from CFQ, BFQ achieves this goal with a more
    reactive mechanism, called Early Queue Merge (EQM). EQM is so
    responsive in detecting interleaved I/O (cooperating processes),
    that it enables BFQ to achieve a high throughput, by queue
    merging, even for queues for which CFQ needs a different
    mechanism, preemption, to get a high throughput. As such EQM is a
    unified mechanism to achieve a high throughput with interleaved
    I/O.
  - Queues are scheduled according to a variant of WF2Q+, named
    B-WF2Q+, and implemented using an augmented rb-tree to preserve an
    O(log N) overall complexity.  See [2] for more details. B-WF2Q+ is
    also ready for hierarchical scheduling. However, for a cleaner
    logical breakdown, the code that enables and completes
    hierarchical support is provided in the next commit, which focuses
    exactly on this feature.
  - B-WF2Q+ guarantees a tight deviation with respect to an ideal,
    perfectly fair, and smooth service. In particular, B-WF2Q+
    guarantees that each queue receives a fraction of the device
    throughput proportional to its weight, even if the throughput
    fluctuates, and regardless of: the device parameters, the current
    workload and the budgets assigned to the queue.
  - The last, budget-independence, property (although probably
    counterintuitive in the first place) is definitely beneficial, for
    the following reasons:
    - First, with any proportional-share scheduler, the maximum
      deviation with respect to an ideal service is proportional to
      the maximum budget (slice) assigned to queues. As a consequence,
      BFQ can keep this deviation tight not only because of the
      accurate service of B-WF2Q+, but also because BFQ *does not*
      need to assign a larger budget to a queue to let the queue
      receive a higher fraction of the device throughput.
    - Second, BFQ is free to choose, for every process (queue), the
      budget that best fits the needs of the process, or best
      leverages the I/O pattern of the process. In particular, BFQ
      updates queue budgets with a simple feedback-loop algorithm that
      allows a high throughput to be achieved, while still providing
      tight latency guarantees to time-sensitive applications. When
      the in-service queue expires, this algorithm computes the next
      budget of the queue so as to:
      - Let large budgets be eventually assigned to the queues
 	associated with I/O-bound applications performing sequential
 	I/O: in fact, the longer these applications are served once
 	got access to the device, the higher the throughput is.
      - Let small budgets be eventually assigned to the queues
 	associated with time-sensitive applications (which typically
 	perform sporadic and short I/O), because, the smaller the
 	budget assigned to a queue waiting for service is, the sooner
 	B-WF2Q+ will serve that queue (Subsec 3.3 in [2]).
 - If several processes are competing for the device at the same time,
  but all processes and groups have the same weight, then BFQ
  guarantees the expected throughput distribution without ever idling
  the device. It uses preemption instead. Throughput is then much
  higher in this common scenario.
 - ioprio classes are served in strict priority order, i.e.,
  lower-priority queues are not served as long as there are
  higher-priority queues.  Among queues in the same class, the
  bandwidth is distributed in proportion to the weight of each
  queue. A very thin extra bandwidth is however guaranteed to
  the Idle class, to prevent it from starving.
 3. What are BFQ's tunables?
 ===========================
 The tunables back_seek_max, back_seek_penalty, fifo_expire_async and
 fifo_expire_sync below are the same as in CFQ. Their description is
 just copied from that for CFQ. Some considerations in the description
 of slice_idle are copied from CFQ too.
 per-process ioprio and weight
 -----------------------------
 Unless the cgroups interface is used (see "4. BFQ group scheduling"),
 weights can be assigned to processes only indirectly, through I/O
 priorities, and according to the relation:
 weight = (IOPRIO_BE_NR - ioprio) * 10.
 Beware that, if low-latency is set, then BFQ automatically raises the
 weight of the queues associated with interactive and soft real-time
 applications. Unset this tunable if you need/want to control weights.
 slice_idle
 ----------
 This parameter specifies how long BFQ should idle for next I/O
 request, when certain sync BFQ queues become empty. By default
 slice_idle is a non-zero value. Idling has a double purpose: boosting
 throughput and making sure that the desired throughput distribution is
 respected (see the description of how BFQ works, and, if needed, the
 papers referred there).
 As for throughput, idling can be very helpful on highly seeky media
 like single spindle SATA/SAS disks where we can cut down on overall
 number of seeks and see improved throughput.
 Setting slice_idle to 0 will remove all the idling on queues and one
 should see an overall improved throughput on faster storage devices
 like multiple SATA/SAS disks in hardware RAID configuration.
 So depending on storage and workload, it might be useful to set
 slice_idle=0.  In general for SATA/SAS disks and software RAID of
 SATA/SAS disks keeping slice_idle enabled should be useful. For any
 configurations where there are multiple spindles behind single LUN
 (Host based hardware RAID controller or for storage arrays), setting
 slice_idle=0 might end up in better throughput and acceptable
 latencies.
 Idling is however necessary to have service guarantees enforced in
 case of differentiated weights or differentiated I/O-request lengths.
 To see why, suppose that a given BFQ queue A must get several I/O
 requests served for each request served for another queue B. Idling
 ensures that, if A makes a new I/O request slightly after becoming
 empty, then no request of B is dispatched in the middle, and thus A
 does not lose the possibility to get more than one request dispatched
 before the next request of B is dispatched. Note that idling
 guarantees the desired differentiated treatment of queues only in
 terms of I/O-request dispatches. To guarantee that the actual service
 order then corresponds to the dispatch order, the strict_guarantees
 tunable must be set too.
 There is an important flipside for idling: apart from the above cases
 where it is beneficial also for throughput, idling can severely impact
 throughput. One important case is random workload. Because of this
 issue, BFQ tends to avoid idling as much as possible, when it is not
 beneficial also for throughput. As a consequence of this behavior, and
 of further issues described for the strict_guarantees tunable,
 short-term service guarantees may be occasionally violated. And, in
 some cases, these guarantees may be more important than guaranteeing
 maximum throughput. For example, in video playing/streaming, a very
 low drop rate may be more important than maximum throughput. In these
 cases, consider setting the strict_guarantees parameter.
 strict_guarantees
 -----------------
 If this parameter is set (default: unset), then BFQ
 - always performs idling when the in-service queue becomes empty;
 - forces the device to serve one I/O request at a time, by dispatching a
  new request only if there is no outstanding request.
 In the presence of differentiated weights or I/O-request sizes, both
 the above conditions are needed to guarantee that every BFQ queue
 receives its allotted share of the bandwidth. The first condition is
 needed for the reasons explained in the description of the slice_idle
 tunable.  The second condition is needed because all modern storage
 devices reorder internally-queued requests, which may trivially break
 the service guarantees enforced by the I/O scheduler.
 Setting strict_guarantees may evidently affect throughput.
 back_seek_max
 -------------
 This specifies, given in Kbytes, the maximum "distance" for backward seeking.
 The distance is the amount of space from the current head location to the
 sectors that are backward in terms of distance.
 This parameter allows the scheduler to anticipate requests in the "backward"
 direction and consider them as being the "next" if they are within this
 distance from the current head location.
 back_seek_penalty
 -----------------
 This parameter is used to compute the cost of backward seeking. If the
 backward distance of request is just 1/back_seek_penalty from a "front"
 request, then the seeking cost of two requests is considered equivalent.
 So scheduler will not bias toward one or the other request (otherwise scheduler
 will bias toward front request). Default value of back_seek_penalty is 2.
 fifo_expire_async
 -----------------
 This parameter is used to set the timeout of asynchronous requests. Default
 value of this is 248ms.
 fifo_expire_sync
 ----------------
 This parameter is used to set the timeout of synchronous requests. Default
 value of this is 124ms. To favor synchronous requests over asynchronous
 ones, this value should be decreased relative to fifo_expire_async.
 low_latency
 -----------
 This parameter is used to enable/disable BFQ's low latency mode. By
 default, low latency mode is enabled. If enabled, interactive and soft
 real-time applications are privileged and experience a lower latency,
 as explained in more detail in the description of how BFQ works.
 DISABLE this mode if you need full control on bandwidth
 distribution. In fact, if it is enabled, then BFQ automatically
 increases the bandwidth share of privileged applications, as the main
 means to guarantee a lower latency to them.
 In addition, as already highlighted at the beginning of this document,
 DISABLE this mode if your only goal is to achieve a high throughput.
 In fact, privileging the I/O of some application over the rest may
 entail a lower throughput. To achieve the highest-possible throughput
 on a non-rotational device, setting slice_idle to 0 may be needed too
 (at the cost of giving up any strong guarantee on fairness and low
 latency).
 timeout_sync
 ------------
 Maximum amount of device time that can be given to a task (queue) once
 it has been selected for service. On devices with costly seeks,
 increasing this time usually increases maximum throughput. On the
 opposite end, increasing this time coarsens the granularity of the
 short-term bandwidth and latency guarantees, especially if the
 following parameter is set to zero.
 max_budget
 ----------
 Maximum amount of service, measured in sectors, that can be provided
 to a BFQ queue once it is set in service (of course within the limits
 of the above timeout). According to what is said in the description of
 the algorithm, larger values increase the throughput in proportion to
 the percentage of sequential I/O requests issued. The price of larger
 values is that they coarsen the granularity of short-term bandwidth
 and latency guarantees.
 The default value is 0, which enables auto-tuning: BFQ sets max_budget
 to the maximum number of sectors that can be served during
 timeout_sync, according to the estimated peak rate.
 weights
 -------
 Read-only parameter, used to show the weights of the currently active
 BFQ queues.
 wr_ tunables
 ------------
 BFQ exports a few parameters to control/tune the behavior of
 low-latency heuristics.
 wr_coeff
 Factor by which the weight of a weight-raised queue is multiplied. If
 the queue is deemed soft real-time, then the weight is further
 multiplied by an additional, constant factor.
 wr_max_time
 Maximum duration of a weight-raising period for an interactive task
 (ms). If set to zero (default value), then this value is computed
 automatically, as a function of the peak rate of the device. In any
 case, when the value of this parameter is read, it always reports the
 current duration, regardless of whether it has been set manually or
 computed automatically.
 wr_max_softrt_rate
 Maximum service rate below which a queue is deemed to be associated
 with a soft real-time application, and is then weight-raised
 accordingly (sectors/sec).
 wr_min_idle_time
 Minimum idle period after which interactive weight-raising may be
 reactivated for a queue (in ms).
 wr_rt_max_time
 Maximum weight-raising duration for soft real-time queues (in ms). The
 start time from which this duration is considered is automatically
 moved forward if the queue is detected to be still soft real-time
 before the current soft real-time weight-raising period finishes.
 wr_min_inter_arr_async
 Minimum period between I/O request arrivals after which weight-raising
 may be reactivated for an already busy async queue (in ms).
 4. BFQ group scheduling
 =======================
 BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely
 blkio and io. In particular, BFQ supports weight-based proportional
 share. To activate cgroups support, set BFQ_GROUP_IOSCHED.
 4-1 Service guarantees provided
 -------------------------------
 With BFQ, proportional share means true proportional share of the
 device bandwidth, according to group weights. For example, a group
 with weight 200 gets twice the bandwidth, and not just twice the time,
 of a group with weight 100.
 BFQ supports hierarchies (group trees) of any depth. Bandwidth is
 distributed among groups and processes in the expected way: for each
 group, the children of the group share the whole bandwidth of the
 group in proportion to their weights. In particular, this implies
 that, for each leaf group, every process of the group receives the
 same share of the whole group bandwidth, unless the ioprio of the
 process is modified.
 The resource-sharing guarantee for a group may partially or totally
 switch from bandwidth to time, if providing bandwidth guarantees to
 the group lowers the throughput too much. This switch occurs on a
 per-process basis: if a process of a leaf group causes throughput loss
 if served in such a way to receive its share of the bandwidth, then
 BFQ switches back to just time-based proportional share for that
 process.
 4-2 Interface
 -------------
 To get proportional sharing of bandwidth with BFQ for a given device,
 BFQ must of course be the active scheduler for that device.
 Within each group directory, the names of the files associated with
 BFQ-specific cgroup parameters and stats begin with the "bfq."
 prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for
 BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group
 parameter to set the weight of a group with BFQ is blkio.bfq.weight
 or io.bfq.weight.
 Parameters to set
 -----------------
 For each group, there is only the following parameter to set.
 weight (namely blkio.bfq.weight or io.bfq.weight): the weight of the
 group inside its parent. Available values: 1..10000 (default 100). The
 linear mapping between ioprio and weights, described at the beginning
 of the tunable section, is still valid, but all weights higher than
 IOPRIO_BE_NR*10 are mapped to ioprio 0.
 Recall that, if low-latency is set, then BFQ automatically raises the
 weight of the queues associated with interactive and soft real-time
 applications. Unset this tunable if you need/want to control weights.
 [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
    Scheduler", Proceedings of the First Workshop on Mobile System
    Technologies (MST-2015), May 2015.
    http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
 [2] P. Valente and M. Andreolini, "Improving Application
    Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of
    the 5th Annual International Systems and Storage Conference
    (SYSTOR '12), June 2012.
    Slightly extended version:
    http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-
 							results.pdf
--- a/Documentation/block/kyber-iosched.txt
+++ b/Documentation/block/kyber-iosched.txt
@@ -0,0 +1,14 @@
 Kyber I/O scheduler tunables
 ============================
 The only two tunables for the Kyber scheduler are the target latencies for
 reads and synchronous writes. Kyber will throttle requests in order to meet
 these target latencies.
 read_lat_nsec
 -------------
 Target latency for reads (in nanoseconds).
 write_lat_nsec
 --------------
 Target latency for synchronous writes (in nanoseconds).
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -43,11 +43,6 @@ large discards are issued, setting this value lower will make Linux issue
 smaller discards and potentially help reduce latencies induced by large
 discard operations.
 discard_zeroes_data (RO)
 ------------------------
 When read, this file will show if the discarded block are zeroed by the
 device or not. If its value is '1' the blocks are zeroed otherwise not.
 hw_sector_size (RO)
 -------------------
 This is the hardware sector size of the device, in bytes.
@@ -192,5 +187,11 @@ scaling back writes. Writing a value of '0' to this file disables the
 feature. Writing a value of '-1' to this file resets the value to the
 default setting.
 throttle_sample_time (RW)
 -------------------------
 This is the time window over which blk-throttle samples data, in milliseconds.
 blk-throttle makes decisions based on the samples. A lower time means cgroups
 have smoother throughput, but higher CPU overhead. This exists only when
 CONFIG_BLK_DEV_THROTTLING_LOW is enabled.
 Jens Axboe <jens.axboe@oracle.com>, February 2009
--- a/Documentation/blockdev/mflash.txt
+++ b/Documentation/blockdev/mflash.txt
@@ -1,84 +0,0 @@
 This document describes m[g]flash support in linux.
 Contents
  1. Overview
  2. Reserved area configuration
  3. Example of mflash platform driver registration
 1. Overview
 Mflash and gflash are embedded flash drive. The only difference is mflash is
 MCP(Multi Chip Package) device. These two device operate exactly same way.
 So the rest mflash repersents mflash and gflash altogether.
 Internally, mflash has nand flash and other hardware logics and supports
 2 different operation (ATA, IO) modes. ATA mode doesn't need any new
 driver and currently works well under standard IDE subsystem. Actually it's
 one chip SSD. IO mode is ATA-like custom mode for the host that doesn't have
 IDE interface.
 Following are brief descriptions about IO mode.
 A. IO mode based on ATA protocol and uses some custom command. (read confirm,
 write confirm)
 B. IO mode uses SRAM bus interface.
 C. IO mode supports 4kB boot area, so host can boot from mflash.
 2. Reserved area configuration
 If host boot from mflash, usually needs raw area for boot loader image. All of
 the mflash's block device operation will be taken this value as start offset.
 Note that boot loader's size of reserved area and kernel configuration value
 must be same.
 3. Example of mflash platform driver registration
 Working mflash is very straight forward. Adding platform device stuff to board
 configuration file is all. Here is some pseudo example.
 static struct mg_drv_data mflash_drv_data = {
 	/* If you want to polling driver set to 1 */
 	.use_polling = 0,
 	/* device attribution */
 	.dev_attr = MG_BOOT_DEV
 };
 static struct resource mg_mflash_rsc[] = {
 	/* Base address of mflash */
 	[0] = {
 		.start = 0x08000000,
 		.end = 0x08000000 + SZ_64K - 1,
 		.flags = IORESOURCE_MEM
 	},
 	/* mflash interrupt pin */
 	[1] = {
 		.start = IRQ_GPIO(84),
 		.end = IRQ_GPIO(84),
 		.flags = IORESOURCE_IRQ
 	},
 	/* mflash reset pin */
 	[2] = {
 		.start = 43,
 		.end = 43,
 		.name = MG_RST_PIN,
 		.flags = IORESOURCE_IO
 	},
 	/* mflash reset-out pin
 	 * If you use mflash as storage device (i.e. other than MG_BOOT_DEV),
 	 * should assign this */
 	[3] = {
 		.start = 51,
 		.end = 51,
 		.name = MG_RSTOUT_PIN,
 		.flags = IORESOURCE_IO
 	}
 };
 static struct platform_device mflash_dev = {
 	.name = MG_DEV_NAME,
 	.id = -1,
 	.dev = {
 		.platform_data = &mflash_drv_data,
 	},
 	.num_resources = ARRAY_SIZE(mg_mflash_rsc),
 	.resource = mg_mflash_rsc
 };
 platform_device_register(&mflash_dev);
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -871,6 +871,11 @@ PAGE_SIZE multiple when read back.
 		Amount of memory used in network transmission buffers
 	  shmem
 		Amount of cached filesystem data that is swap-backed,
 		such as tmpfs, shm segments, shared anonymous mmap()s
 	  file_mapped
 		Amount of cached filesystem data mapped with mmap()
@@ -913,6 +918,18 @@ PAGE_SIZE multiple when read back.
 		Number of major page faults incurred
 	  workingset_refault
 		Number of refaults of previously evicted pages
 	  workingset_activate
 		Number of refaulted pages that were immediately activated
 	  workingset_nodereclaim
 		Number of times a shadow node has been reclaimed
  memory.swap.current
 	A read-only single value file which exists on non-root
--- a/Documentation/conf.py
+++ b/Documentation/conf.py
@@ -17,7 +17,7 @@ import os
 import sphinx
 # Get Sphinx version
-major, minor, patch = map(int, sphinx.__version__.split("."))
+major, minor, patch = sphinx.version_info[:3]
 # If extensions (or modules to document with autodoc) are in another directory,
@@ -29,12 +29,12 @@ from load_config import loadConfig
 # -- General configuration ------------------------------------------------
 # If your documentation needs a minimal Sphinx version, state it here.
-#needs_sphinx = '1.0'
+needs_sphinx = '1.2'
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
-extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain']
+extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain', 'kfigure']
 # The name of the math extension changed on Sphinx 1.4
 if major == 1 and minor > 3:
@@ -348,6 +348,8 @@ latex_documents = [
     'The kernel development community', 'manual'),
    ('driver-api/index', 'driver-api.tex', 'The kernel driver API manual',
     'The kernel development community', 'manual'),
    ('input/index', 'linux-input.tex', 'The Linux input driver subsystem',
     'The kernel development community', 'manual'),
    ('kernel-documentation', 'kernel-documentation.tex', 'The Linux Kernel Documentation',
     'The kernel development community', 'manual'),
    ('process/index', 'development-process.tex', 'Linux Kernel Development Documentation',
--- a/Documentation/core-api/flexible-arrays.rst
+++ b/Documentation/core-api/flexible-arrays.rst
@@ -0,0 +1,130 @@
 ===================================
 Using flexible arrays in the kernel
 ===================================
 Large contiguous memory allocations can be unreliable in the Linux kernel.
 Kernel programmers will sometimes respond to this problem by allocating
 pages with :c:func:`vmalloc()`.  This solution is not ideal, though.  On 32-bit
 systems, memory from vmalloc() must be mapped into a relatively small address
 space; it's easy to run out.  On SMP systems, the page table changes required
 by vmalloc() allocations can require expensive cross-processor interrupts on
 all CPUs.  And, on all systems, use of space in the vmalloc() range increases
 pressure on the translation lookaside buffer (TLB), reducing the performance
 of the system.
 In many cases, the need for memory from vmalloc() can be eliminated by piecing
 together an array from smaller parts; the flexible array library exists to make
 this task easier.
 A flexible array holds an arbitrary (within limits) number of fixed-sized
 objects, accessed via an integer index.  Sparse arrays are handled
 reasonably well.  Only single-page allocations are made, so memory
 allocation failures should be relatively rare.  The down sides are that the
 arrays cannot be indexed directly, individual object size cannot exceed the
 system page size, and putting data into a flexible array requires a copy
 operation.  It's also worth noting that flexible arrays do no internal
 locking at all; if concurrent access to an array is possible, then the
 caller must arrange for appropriate mutual exclusion.
 The creation of a flexible array is done with :c:func:`flex_array_alloc()`::
    #include <linux/flex_array.h>
    struct flex_array *flex_array_alloc(int element_size,
 					unsigned int total,
 					gfp_t flags);
 The individual object size is provided by ``element_size``, while total is the
 maximum number of objects which can be stored in the array.  The flags
 argument is passed directly to the internal memory allocation calls.  With
 the current code, using flags to ask for high memory is likely to lead to
 notably unpleasant side effects.
 It is also possible to define flexible arrays at compile time with::
    DEFINE_FLEX_ARRAY(name, element_size, total);
 This macro will result in a definition of an array with the given name; the
 element size and total will be checked for validity at compile time.
 Storing data into a flexible array is accomplished with a call to
 :c:func:`flex_array_put()`::
    int flex_array_put(struct flex_array *array, unsigned int element_nr,
    		       void *src, gfp_t flags);
 This call will copy the data from src into the array, in the position
 indicated by ``element_nr`` (which must be less than the maximum specified when
 the array was created).  If any memory allocations must be performed, flags
 will be used.  The return value is zero on success, a negative error code
 otherwise.
 There might possibly be a need to store data into a flexible array while
 running in some sort of atomic context; in this situation, sleeping in the
 memory allocator would be a bad thing.  That can be avoided by using
 ``GFP_ATOMIC`` for the flags value, but, often, there is a better way.  The
 trick is to ensure that any needed memory allocations are done before
 entering atomic context, using :c:func:`flex_array_prealloc()`::
    int flex_array_prealloc(struct flex_array *array, unsigned int start,
 			    unsigned int nr_elements, gfp_t flags);
 This function will ensure that memory for the elements indexed in the range
 defined by ``start`` and ``nr_elements`` has been allocated.  Thereafter, a
 ``flex_array_put()`` call on an element in that range is guaranteed not to
 block.
 Getting data back out of the array is done with :c:func:`flex_array_get()`::
    void *flex_array_get(struct flex_array *fa, unsigned int element_nr);
 The return value is a pointer to the data element, or NULL if that
 particular element has never been allocated.
 Note that it is possible to get back a valid pointer for an element which
 has never been stored in the array.  Memory for array elements is allocated
 one page at a time; a single allocation could provide memory for several
 adjacent elements.  Flexible array elements are normally initialized to the
 value ``FLEX_ARRAY_FREE`` (defined as 0x6c in <linux/poison.h>), so errors
 involving that number probably result from use of unstored array entries.
 Note that, if array elements are allocated with ``__GFP_ZERO``, they will be
 initialized to zero and this poisoning will not happen.
 Individual elements in the array can be cleared with
 :c:func:`flex_array_clear()`::
    int flex_array_clear(struct flex_array *array, unsigned int element_nr);
 This function will set the given element to ``FLEX_ARRAY_FREE`` and return
 zero.  If storage for the indicated element is not allocated for the array,
 ``flex_array_clear()`` will return ``-EINVAL`` instead.  Note that clearing an
 element does not release the storage associated with it; to reduce the
 allocated size of an array, call :c:func:`flex_array_shrink()`::
    int flex_array_shrink(struct flex_array *array);
 The return value will be the number of pages of memory actually freed.
 This function works by scanning the array for pages containing nothing but
 ``FLEX_ARRAY_FREE`` bytes, so (1) it can be expensive, and (2) it will not work
 if the array's pages are allocated with ``__GFP_ZERO``.
 It is possible to remove all elements of an array with a call to
 :c:func:`flex_array_free_parts()`::
    void flex_array_free_parts(struct flex_array *array);
 This call frees all elements, but leaves the array itself in place.
 Freeing the entire array is done with :c:func:`flex_array_free()`::
    void flex_array_free(struct flex_array *array);
 As of this writing, there are no users of flexible arrays in the mainline
 kernel.  The functions described here are also not exported to modules;
 that will probably be fixed when somebody comes up with a need for it.
 Flexible array functions
 ------------------------
 .. kernel-doc:: include/linux/flex_array.h
--- a/Documentation/core-api/genericirq.rst
+++ b/Documentation/core-api/genericirq.rst
@@ -0,0 +1,440 @@
 .. include:: <isonum.txt>
 ==========================
 Linux generic IRQ handling
 ==========================
 :Copyright: |copy| 2005-2010: Thomas Gleixner
 :Copyright: |copy| 2005-2006:  Ingo Molnar
 Introduction
 ============
 The generic interrupt handling layer is designed to provide a complete
 abstraction of interrupt handling for device drivers. It is able to
 handle all the different types of interrupt controller hardware. Device
 drivers use generic API functions to request, enable, disable and free
 interrupts. The drivers do not have to know anything about interrupt
 hardware details, so they can be used on different platforms without
 code changes.
 This documentation is provided to developers who want to implement an
 interrupt subsystem for their architecture, with the help of the
 generic IRQ handling layer.
 Rationale
 =========
 The original implementation of interrupt handling in Linux uses the
 :c:func:`__do_IRQ` super-handler, which is able to deal with every type of
 interrupt logic.
 Originally, Russell King identified different types of handlers to build
 a quite universal set for the ARM interrupt handler implementation in
 Linux 2.5/2.6. He distinguished between:
 -  Level type
 -  Edge type
 -  Simple type
 During the implementation we identified another type:
 -  Fast EOI type
 In the SMP world of the :c:func:`__do_IRQ` super-handler another type was
 identified:
 -  Per CPU type
 This split implementation of high-level IRQ handlers allows us to
 optimize the flow of the interrupt handling for each specific interrupt
 type. This reduces complexity in that particular code path and allows
 the optimized handling of a given type.
 The original general IRQ implementation used hw_interrupt_type
 structures and their ``->ack``, ``->end`` [etc.] callbacks to differentiate
 the flow control in the super-handler. This leads to a mix of flow logic
 and low-level hardware logic, and it also leads to unnecessary code
 duplication: for example in i386, there is an ``ioapic_level_irq`` and an
 ``ioapic_edge_irq`` IRQ-type which share many of the low-level details but
 have different flow handling.
 A more natural abstraction is the clean separation of the 'irq flow' and
 the 'chip details'.
 Analysing a couple of architecture's IRQ subsystem implementations
 reveals that most of them can use a generic set of 'irq flow' methods
 and only need to add the chip-level specific code. The separation is
 also valuable for (sub)architectures which need specific quirks in the
 IRQ flow itself but not in the chip details - and thus provides a more
 transparent IRQ subsystem design.
 Each interrupt descriptor is assigned its own high-level flow handler,
 which is normally one of the generic implementations. (This high-level
 flow handler implementation also makes it simple to provide
 demultiplexing handlers which can be found in embedded platforms on
 various architectures.)
 The separation makes the generic interrupt handling layer more flexible
 and extensible. For example, a (sub)architecture can use a generic
 IRQ-flow implementation for 'level type' interrupts and add a
 (sub)architecture specific 'edge type' implementation.
 To make the transition to the new model easier and prevent the breakage
 of existing implementations, the :c:func:`__do_IRQ` super-handler is still
 available. This leads to a kind of duality for the time being. Over time
 the new model should be used in more and more architectures, as it
 enables smaller and cleaner IRQ subsystems. It has been deprecated for
 three years now and is about to be removed.
 Known Bugs And Assumptions
 ==========================
 None (knock on wood).
 Abstraction layers
 ==================
 There are three main levels of abstraction in the interrupt code:
 1. High-level driver API
 2. High-level IRQ flow handlers
 3. Chip-level hardware encapsulation
 Interrupt control flow
 ----------------------
 Each interrupt is described by an interrupt descriptor structure
 irq_desc. The interrupt is referenced by an 'unsigned int' numeric
 value which selects the corresponding interrupt description structure in
 the descriptor structures array. The descriptor structure contains
 status information and pointers to the interrupt flow method and the
 interrupt chip structure which are assigned to this interrupt.
 Whenever an interrupt triggers, the low-level architecture code calls
 into the generic interrupt code by calling :c:func:`desc->handle_irq`. This
 high-level IRQ handling function only uses desc->irq_data.chip
 primitives referenced by the assigned chip descriptor structure.
 High-level Driver API
 ---------------------
 The high-level Driver API consists of the following functions:
 -  :c:func:`request_irq`
 -  :c:func:`free_irq`
 -  :c:func:`disable_irq`
 -  :c:func:`enable_irq`
 -  :c:func:`disable_irq_nosync` (SMP only)
 -  :c:func:`synchronize_irq` (SMP only)
 -  :c:func:`irq_set_irq_type`
 -  :c:func:`irq_set_irq_wake`
 -  :c:func:`irq_set_handler_data`
 -  :c:func:`irq_set_chip`
 -  :c:func:`irq_set_chip_data`
 See the autogenerated function documentation for details.
 High-level IRQ flow handlers
 ----------------------------
 The generic layer provides a set of pre-defined irq-flow methods:
 -  :c:func:`handle_level_irq`
 -  :c:func:`handle_edge_irq`
 -  :c:func:`handle_fasteoi_irq`
 -  :c:func:`handle_simple_irq`
 -  :c:func:`handle_percpu_irq`
 -  :c:func:`handle_edge_eoi_irq`
 -  :c:func:`handle_bad_irq`
 The interrupt flow handlers (either pre-defined or architecture
 specific) are assigned to specific interrupts by the architecture either
 during bootup or during device initialization.
 Default flow implementations
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Helper functions
 ^^^^^^^^^^^^^^^^
 The helper functions call the chip primitives and are used by the
 default flow implementations. The following helper functions are
 implemented (simplified excerpt)::
    default_enable(struct irq_data *data)
    {
        desc->irq_data.chip->irq_unmask(data);
    }
    default_disable(struct irq_data *data)
    {
        if (!delay_disable(data))
            desc->irq_data.chip->irq_mask(data);
    }
    default_ack(struct irq_data *data)
    {
        chip->irq_ack(data);
    }
    default_mask_ack(struct irq_data *data)
    {
        if (chip->irq_mask_ack) {
            chip->irq_mask_ack(data);
        } else {
            chip->irq_mask(data);
            chip->irq_ack(data);
        }
    }
    noop(struct irq_data *data)
    {
    }
 Default flow handler implementations
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Default Level IRQ flow handler
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 handle_level_irq provides a generic implementation for level-triggered
 interrupts.
 The following control flow is implemented (simplified excerpt)::
    desc->irq_data.chip->irq_mask_ack();
    handle_irq_event(desc->action);
    desc->irq_data.chip->irq_unmask();
 Default Fast EOI IRQ flow handler
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 handle_fasteoi_irq provides a generic implementation for interrupts,
 which only need an EOI at the end of the handler.
 The following control flow is implemented (simplified excerpt)::
    handle_irq_event(desc->action);
    desc->irq_data.chip->irq_eoi();
 Default Edge IRQ flow handler
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 handle_edge_irq provides a generic implementation for edge-triggered
 interrupts.
 The following control flow is implemented (simplified excerpt)::
    if (desc->status & running) {
        desc->irq_data.chip->irq_mask_ack();
        desc->status |= pending | masked;
        return;
    }
    desc->irq_data.chip->irq_ack();
    desc->status |= running;
    do {
        if (desc->status & masked)
            desc->irq_data.chip->irq_unmask();
        desc->status &= ~pending;
        handle_irq_event(desc->action);
    } while (status & pending);
    desc->status &= ~running;
 Default simple IRQ flow handler
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 handle_simple_irq provides a generic implementation for simple
 interrupts.
 .. note::
   The simple flow handler does not call any handler/chip primitives.
 The following control flow is implemented (simplified excerpt)::
    handle_irq_event(desc->action);
 Default per CPU flow handler
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 handle_percpu_irq provides a generic implementation for per CPU
 interrupts.
 Per CPU interrupts are only available on SMP and the handler provides a
 simplified version without locking.
 The following control flow is implemented (simplified excerpt)::
    if (desc->irq_data.chip->irq_ack)
        desc->irq_data.chip->irq_ack();
    handle_irq_event(desc->action);
    if (desc->irq_data.chip->irq_eoi)
            desc->irq_data.chip->irq_eoi();
 EOI Edge IRQ flow handler
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 handle_edge_eoi_irq provides an abomination of the edge handler
 which is solely used to tame a badly wrecked irq controller on
 powerpc/cell.
 Bad IRQ flow handler
 ^^^^^^^^^^^^^^^^^^^^
 handle_bad_irq is used for spurious interrupts which have no real
 handler assigned.
 Quirks and optimizations
 ~~~~~~~~~~~~~~~~~~~~~~~~
 The generic functions are intended for 'clean' architectures and chips,
 which have no platform-specific IRQ handling quirks. If an architecture
 needs to implement quirks on the 'flow' level then it can do so by
 overriding the high-level irq-flow handler.
 Delayed interrupt disable
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 This per interrupt selectable feature, which was introduced by Russell
 King in the ARM interrupt implementation, does not mask an interrupt at
 the hardware level when :c:func:`disable_irq` is called. The interrupt is kept
 enabled and is masked in the flow handler when an interrupt event
 happens. This prevents losing edge interrupts on hardware which does not
 store an edge interrupt event while the interrupt is disabled at the
 hardware level. When an interrupt arrives while the IRQ_DISABLED flag
 is set, then the interrupt is masked at the hardware level and the
 IRQ_PENDING bit is set. When the interrupt is re-enabled by
 :c:func:`enable_irq` the pending bit is checked and if it is set, the interrupt
 is resent either via hardware or by a software resend mechanism. (It's
 necessary to enable CONFIG_HARDIRQS_SW_RESEND when you want to use
 the delayed interrupt disable feature and your hardware is not capable
 of retriggering an interrupt.) The delayed interrupt disable is not
 configurable.
 Chip-level hardware encapsulation
 ---------------------------------
 The chip-level hardware descriptor structure :c:type:`irq_chip` contains all
 the direct chip relevant functions, which can be utilized by the irq flow
 implementations.
 -  ``irq_ack``
 -  ``irq_mask_ack`` - Optional, recommended for performance
 -  ``irq_mask``
 -  ``irq_unmask``
 -  ``irq_eoi`` - Optional, required for EOI flow handlers
 -  ``irq_retrigger`` - Optional
 -  ``irq_set_type`` - Optional
 -  ``irq_set_wake`` - Optional
 These primitives are strictly intended to mean what they say: ack means
 ACK, masking means masking of an IRQ line, etc. It is up to the flow
 handler(s) to use these basic units of low-level functionality.
 __do_IRQ entry point
 ====================
 The original implementation :c:func:`__do_IRQ` was an alternative entry point
 for all types of interrupts. It no longer exists.
 This handler turned out to be not suitable for all interrupt hardware
 and was therefore reimplemented with split functionality for
 edge/level/simple/percpu interrupts. This is not only a functional
 optimization. It also shortens code paths for interrupts.
 Locking on SMP
 ==============
 The locking of chip registers is up to the architecture that defines the
 chip primitives. The per-irq structure is protected via desc->lock, by
 the generic layer.
 Generic interrupt chip
 ======================
 To avoid copies of identical implementations of IRQ chips the core
 provides a configurable generic interrupt chip implementation.
 Developers should check carefully whether the generic chip fits their
 needs before implementing the same functionality slightly differently
 themselves.
 .. kernel-doc:: kernel/irq/generic-chip.c
   :export:
 Structures
 ==========
 This chapter contains the autogenerated documentation of the structures
 which are used in the generic IRQ layer.
 .. kernel-doc:: include/linux/irq.h
   :internal:
 .. kernel-doc:: include/linux/interrupt.h
   :internal:
 Public Functions Provided
 =========================
 This chapter contains the autogenerated documentation of the kernel API
 functions which are exported.
 .. kernel-doc:: kernel/irq/manage.c
 .. kernel-doc:: kernel/irq/chip.c
 Internal Functions Provided
 ===========================
 This chapter contains the autogenerated documentation of the internal
 functions.
 .. kernel-doc:: kernel/irq/irqdesc.c
 .. kernel-doc:: kernel/irq/handle.c
 .. kernel-doc:: kernel/irq/chip.c
 Credits
 =======
 The following people have contributed to this document:
 1. Thomas Gleixner tglx@linutronix.de
 2. Ingo Molnar mingo@elte.hu
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -11,11 +11,14 @@ Core utilities
 .. toctree::
   :maxdepth: 1
   kernel-api
   assoc_array
   atomic_ops
   cpu_hotplug
   local_ops
   workqueue
   genericirq
   flexible-arrays
 Interfaces for kernel debugging
 ===============================
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -0,0 +1,346 @@
 ====================
 The Linux Kernel API
 ====================
 Data Types
 ==========
 Doubly Linked Lists
 -------------------
 .. kernel-doc:: include/linux/list.h
   :internal:
 Basic C Library Functions
 =========================
 When writing drivers, you cannot in general use routines which are from
 the C Library. Some of the functions have been found generally useful
 and they are listed below. The behaviour of these functions may vary
 slightly from those defined by ANSI, and these deviations are noted in
 the text.
 String Conversions
 ------------------
 .. kernel-doc:: lib/vsprintf.c
   :export:
 .. kernel-doc:: include/linux/kernel.h
   :functions: kstrtol
 .. kernel-doc:: include/linux/kernel.h
   :functions: kstrtoul
 .. kernel-doc:: lib/kstrtox.c
   :export:
 String Manipulation
 -------------------
 .. kernel-doc:: lib/string.c
   :export:
 Bit Operations
 --------------
 .. kernel-doc:: arch/x86/include/asm/bitops.h
   :internal:
 Basic Kernel Library Functions
 ==============================
 The Linux kernel provides more basic utility functions.
 Bitmap Operations
 -----------------
 .. kernel-doc:: lib/bitmap.c
   :export:
 .. kernel-doc:: lib/bitmap.c
   :internal:
 Command-line Parsing
 --------------------
 .. kernel-doc:: lib/cmdline.c
   :export:
 CRC Functions
 -------------
 .. kernel-doc:: lib/crc7.c
   :export:
 .. kernel-doc:: lib/crc16.c
   :export:
 .. kernel-doc:: lib/crc-itu-t.c
   :export:
 .. kernel-doc:: lib/crc32.c
 .. kernel-doc:: lib/crc-ccitt.c
   :export:
 idr/ida Functions
 -----------------
 .. kernel-doc:: include/linux/idr.h
   :doc: idr sync
 .. kernel-doc:: lib/idr.c
   :doc: IDA description
 .. kernel-doc:: lib/idr.c
   :export:
 Memory Management in Linux
 ==========================
 The Slab Cache
 --------------
 .. kernel-doc:: include/linux/slab.h
   :internal:
 .. kernel-doc:: mm/slab.c
   :export:
 .. kernel-doc:: mm/util.c
   :export:
 User Space Memory Access
 ------------------------
 .. kernel-doc:: arch/x86/include/asm/uaccess_32.h
   :internal:
 .. kernel-doc:: arch/x86/lib/usercopy_32.c
   :export:
 More Memory Management Functions
 --------------------------------
 .. kernel-doc:: mm/readahead.c
   :export:
 .. kernel-doc:: mm/filemap.c
   :export:
 .. kernel-doc:: mm/memory.c
   :export:
 .. kernel-doc:: mm/vmalloc.c
   :export:
 .. kernel-doc:: mm/page_alloc.c
   :internal:
 .. kernel-doc:: mm/mempool.c
   :export:
 .. kernel-doc:: mm/dmapool.c
   :export:
 .. kernel-doc:: mm/page-writeback.c
   :export:
 .. kernel-doc:: mm/truncate.c
   :export:
 Kernel IPC facilities
 =====================
 IPC utilities
 -------------
 .. kernel-doc:: ipc/util.c
   :internal:
 FIFO Buffer
 ===========
 kfifo interface
 ---------------
 .. kernel-doc:: include/linux/kfifo.h
   :internal:
 relay interface support
 =======================
 Relay interface support is designed to provide an efficient mechanism
 for tools and facilities to relay large amounts of data from kernel
 space to user space.
 relay interface
 ---------------
 .. kernel-doc:: kernel/relay.c
   :export:
 .. kernel-doc:: kernel/relay.c
   :internal:
 Module Support
 ==============
 Module Loading
 --------------
 .. kernel-doc:: kernel/kmod.c
   :export:
 Inter Module support
 --------------------
 Refer to the file kernel/module.c for more information.
 Hardware Interfaces
 ===================
 Interrupt Handling
 ------------------
 .. kernel-doc:: kernel/irq/manage.c
   :export:
 DMA Channels
 ------------
 .. kernel-doc:: kernel/dma.c
   :export:
 Resources Management
 --------------------
 .. kernel-doc:: kernel/resource.c
   :internal:
 .. kernel-doc:: kernel/resource.c
   :export:
 MTRR Handling
 -------------
 .. kernel-doc:: arch/x86/kernel/cpu/mtrr/main.c
   :export:
 Security Framework
 ==================
 .. kernel-doc:: security/security.c
   :internal:
 .. kernel-doc:: security/inode.c
   :export:
 Audit Interfaces
 ================
 .. kernel-doc:: kernel/audit.c
   :export:
 .. kernel-doc:: kernel/auditsc.c
   :internal:
 .. kernel-doc:: kernel/auditfilter.c
   :internal:
 Accounting Framework
 ====================
 .. kernel-doc:: kernel/acct.c
   :internal:
 Block Devices
 =============
 .. kernel-doc:: block/blk-core.c
   :export:
 .. kernel-doc:: block/blk-core.c
   :internal:
 .. kernel-doc:: block/blk-map.c
   :export:
 .. kernel-doc:: block/blk-sysfs.c
   :internal:
 .. kernel-doc:: block/blk-settings.c
   :export:
 .. kernel-doc:: block/blk-exec.c
   :export:
 .. kernel-doc:: block/blk-flush.c
   :export:
 .. kernel-doc:: block/blk-lib.c
   :export:
 .. kernel-doc:: block/blk-tag.c
   :export:
 .. kernel-doc:: block/blk-tag.c
   :internal:
 .. kernel-doc:: block/blk-integrity.c
   :export:
 .. kernel-doc:: kernel/trace/blktrace.c
   :internal:
 .. kernel-doc:: block/genhd.c
   :internal:
 .. kernel-doc:: block/genhd.c
   :export:
 Char devices
 ============
 .. kernel-doc:: fs/char_dev.c
   :export:
 Clock Framework
 ===============
 The clock framework defines programming interfaces to support software
 management of the system clock tree. This framework is widely used with
 System-On-Chip (SOC) platforms to support power management and various
 devices which may need custom clock rates. Note that these "clocks"
 don't relate to timekeeping or real time clocks (RTCs), each of which
 has its own framework. These :c:type:`struct clk <clk>`
 instances may be used to manage for example a 96 MHz signal that is used
 to shift bits into and out of peripherals or busses, or otherwise
 trigger synchronous state machine transitions in system hardware.
 Power management is supported by explicit software clock gating: unused
 clocks are disabled, so the system doesn't waste power changing the
 state of transistors that aren't in active use. On some systems this may
 be backed by hardware clock gating, where clocks are gated without being
 disabled in software. Sections of chips that are powered but not clocked
 may be able to retain their last state. This low power state is often
 called a *retention mode*. This mode still incurs leakage currents,
 especially with finer circuit geometries, but for CMOS circuits power is
 mostly used by clocked state changes.
 Power-aware drivers only enable their clocks when the device they manage
 is in active use. Also, system sleep states often differ according to
 which clock domains are active: while a "standby" state may allow wakeup
 from several active domains, a "mem" (suspend-to-RAM) state may require
 a more wholesale shutdown of clocks derived from higher speed PLLs and
 oscillators, limiting the number of possible wakeup event sources. A
 driver's suspend method may need to be aware of system-specific clock
 constraints on the target sleep state.
 Some platforms support programmable clock generators. These can be used
 by external chips of various kinds, such as other CPUs, multimedia
 codecs, and devices with strict requirements for interface clocking.
 .. kernel-doc:: include/linux/clk.h
   :internal:
--- a/Documentation/cpu-freq/boost.txt
+++ b/Documentation/cpu-freq/boost.txt
@@ -1,93 +0,0 @@
 Processor boosting control
 	- information for users -
 Quick guide for the impatient:
 --------------------
 /sys/devices/system/cpu/cpufreq/boost
 controls the boost setting for the whole system. You can read and write
 that file with either "0" (boosting disabled) or "1" (boosting allowed).
 Reading or writing 1 does not mean that the system is boosting at this
 very moment, but only that the CPU _may_ raise the frequency at its
 discretion.
 --------------------
 Introduction
 -------------
 Some CPUs support a functionality to raise the operating frequency of
 some cores in a multi-core package if certain conditions apply, mostly
 if the whole chip is not fully utilized and below its intended thermal
 budget. The decision about boost disable/enable is made either at hardware
 (e.g. x86) or software (e.g. ARM).
 On Intel CPUs this is called "Turbo Boost", AMD calls it "Turbo-Core",
 in technical documentation "Core performance boost". In Linux we use
 the term "boost" for convenience.
 Rationale for disable switch
 ----------------------------
 Though the idea is to just give better performance without any user
 intervention, sometimes the need arises to disable this functionality.
 Most systems offer a switch in the (BIOS) firmware to disable the
 functionality altogether, but a more fine-grained and dynamic control would
 be desirable:
 1. While running benchmarks, reproducible results are important. Since
   the boosting functionality depends on the load of the whole package,
   single thread performance can vary. By explicitly disabling the boost
   functionality at least for the benchmark's run-time the system will run
   at a fixed frequency and results are reproducible again.
 2. To examine the impact of the boosting functionality it is helpful
   to do tests with and without boosting.
 3. Boosting means overclocking the processor, though under controlled
   conditions. By raising the frequency and the voltage the processor
   will consume more power than without the boosting, which may be
   undesirable for instance for mobile users. Disabling boosting may
   save power here, though this depends on the workload.
 User controlled switch
 ----------------------
 To allow the user to toggle the boosting functionality, the cpufreq core
 driver exports a sysfs knob to enable or disable it. There is a file:
 /sys/devices/system/cpu/cpufreq/boost
 which can either read "0" (boosting disabled) or "1" (boosting enabled).
 The file is exported only when cpufreq driver supports boosting.
 Explicitly changing the permissions and writing to that file anyway will
 return EINVAL.
 On supported CPUs one can write either a "0" or a "1" into this file.
 This will either disable the boost functionality on all cores in the
 whole system (0) or will allow the software or hardware to boost at will
 (1).
 Writing a "1" does not explicitly boost the system, but just allows the
 CPU to boost at its discretion. Some implementations take external
 factors like the chip's temperature into account, so boosting once does
 not necessarily mean that it will occur every time even using the exact
 same software setup.
 AMD legacy cpb switch
 ---------------------
 The AMD powernow-k8 driver used to support a very similar switch to
 disable or enable the "Core Performance Boost" feature of some AMD CPUs.
 This switch was instantiated in each CPU's cpufreq directory
 (/sys/devices/system/cpu[0-9]*/cpufreq) and was called "cpb".
 Though the per CPU existence hints at a more fine grained control, the
 actual implementation only supported system-global switch semantics,
 which was simply reflected into each CPU's file. Writing a 0 or 1 into it
 would pull the other CPUs to the same state.
 For compatibility reasons this file and its behavior are still supported
 on AMD CPUs, though it is now protected by a config switch
 (X86_ACPI_CPUFREQ_CPB). On Intel CPUs this file will never be created,
 even with the config option set.
 This functionality is considered legacy and will be removed in some future
 kernel version.
 More fine grained boosting control
 ----------------------------------
 Technically it is possible to switch the boosting functionality at least
 on a per package basis, for some CPUs even per core. Currently the driver
 does not support it, but this may be implemented in the future.
--- a/Documentation/cpu-freq/cpu-drivers.txt
+++ b/Documentation/cpu-freq/cpu-drivers.txt
@@ -231,7 +231,7 @@ the reference implementation in drivers/cpufreq/longrun.c
 Only for drivers with target_index() and CPUFREQ_ASYNC_NOTIFICATION unset.
 get_intermediate should return a stable intermediate frequency platform wants to
-switch to, and target_intermediate() should set CPU to to that frequency, before
+switch to, and target_intermediate() should set CPU to that frequency, before
 jumping to the frequency corresponding to 'index'. Core will take care of
 sending notifications and driver doesn't have to handle them in
 target_intermediate() or target_index().
--- a/Documentation/cpu-freq/governors.txt
+++ b/Documentation/cpu-freq/governors.txt
@@ -1,301 +0,0 @@
     CPU frequency and voltage scaling code in the Linux(TM) kernel
 		         L i n u x    C P U F r e q
 		      C P U F r e q   G o v e r n o r s
 		   - information for users and developers -
 		    Dominik Brodowski  <linux@brodo.de>
            some additions and corrections by Nico Golde <nico@ngolde.de>
 		Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 		   Viresh Kumar <viresh.kumar@linaro.org>
   Clock scaling allows you to change the clock speed of the CPUs on the
    fly. This is a nice method to save battery power, because the lower
            the clock speed, the less power the CPU consumes.
 Contents:
 ---------
 1.   What is a CPUFreq Governor?
 2.   Governors In the Linux Kernel
 2.1  Performance
 2.2  Powersave
 2.3  Userspace
 2.4  Ondemand
 2.5  Conservative
 2.6  Schedutil
 3.   The Governor Interface in the CPUfreq Core
 4.   References
 1. What Is A CPUFreq Governor?
 ==============================
 Most cpufreq drivers (except the intel_pstate and longrun) or even most
 cpu frequency scaling algorithms only allow the CPU frequency to be set
 to predefined fixed values.  In order to offer dynamic frequency
 scaling, the cpufreq core must be able to tell these drivers of a
 "target frequency". So these specific drivers will be transformed to
 offer a "->target/target_index/fast_switch()" call instead of the
 "->setpolicy()" call. For set_policy drivers, all stays the same,
 though.
 How to decide what frequency within the CPUfreq policy should be used?
 That's done using "cpufreq governors".
 Basically, it's the following flow graph:
 CPU can be set to switch independently	 |	   CPU can only be set
      within specific "limits"		 |       to specific frequencies
                                 "CPUfreq policy"
 		consists of frequency limits (policy->{min,max})
  		     and CPUfreq governor to be used
 			 /		      \
 			/		       \
 		       /		       the cpufreq governor decides
 		      /			       (dynamically or statically)
 		     /			       what target_freq to set within
 		    /			       the limits of policy->{min,max}
 		   /			            \
 		  /				     \
 	Using the ->setpolicy call,		 Using the ->target/target_index/fast_switch call,
 	    the limits and the			  the frequency closest
 	     "policy" is set.			  to target_freq is set.
 						  It is assured that it
 						  is within policy->{min,max}
 2. Governors In the Linux Kernel
 ================================
 2.1 Performance
 ---------------
 The CPUfreq governor "performance" sets the CPU statically to the
 highest frequency within the borders of scaling_min_freq and
 scaling_max_freq.
 2.2 Powersave
 -------------
 The CPUfreq governor "powersave" sets the CPU statically to the
 lowest frequency within the borders of scaling_min_freq and
 scaling_max_freq.
 2.3 Userspace
 -------------
 The CPUfreq governor "userspace" allows the user, or any userspace
 program running with UID "root", to set the CPU to a specific frequency
 by making a sysfs file "scaling_setspeed" available in the CPU-device
 directory.
 2.4 Ondemand
 ------------
 The CPUfreq governor "ondemand" sets the CPU frequency depending on the
 current system load. Load estimation is triggered by the scheduler
 through the update_util_data->func hook; when triggered, cpufreq checks
 the CPU-usage statistics over the last period and the governor sets the
 CPU accordingly.  The CPU must have the capability to switch the
 frequency very quickly.
 Sysfs files:
 * sampling_rate:
  Measured in uS (10^-6 seconds), this is how often you want the kernel
  to look at the CPU usage and to make decisions on what to do about the
  frequency.  Typically this is set to values of around '10000' or more.
  Its default value is (cmp. with users-guide.txt): transition_latency
  * 1000.  Be aware that transition latency is in ns and sampling_rate
  is in us, so you get the same sysfs value by default.  Sampling rate
  should always get adjusted considering the transition latency.  To set
  the sampling rate 750 times as high as the transition latency in
  bash (as said, 1000 is default), do:
  $ echo $(($(cat cpuinfo_transition_latency) * 750 / 1000)) > ondemand/sampling_rate
 * sampling_rate_min:
  The sampling rate is limited by the HW transition latency:
  transition_latency * 100
  Or by kernel restrictions:
  - If CONFIG_NO_HZ_COMMON is set, the limit is 10ms fixed.
  - If CONFIG_NO_HZ_COMMON is not set or nohz=off boot parameter is
    used, the limits depend on the CONFIG_HZ option:
    HZ=1000: min=20000us  (20ms)
    HZ=250:  min=80000us  (80ms)
    HZ=100:  min=200000us (200ms)
  The highest value of kernel and HW latency restrictions is shown and
  used as the minimum sampling rate.
 * up_threshold:
  This defines what the average CPU usage between the samplings of
  'sampling_rate' needs to be for the kernel to make a decision on
  whether it should increase the frequency.  For example when it is set
  to its default value of '95' it means that between the checking
  intervals the CPU needs to be on average more than 95% in use to then
  decide that the CPU frequency needs to be increased.
 * ignore_nice_load:
  This parameter takes a value of '0' or '1'. When set to '0' (its
  default), all processes are counted towards the 'cpu utilisation'
  value.  When set to '1', the processes that are run with a 'nice'
  value will not count (and thus be ignored) in the overall usage
  calculation.  This is useful if you are running a CPU intensive
  calculation on your laptop that you do not care how long it takes to
  complete as you can 'nice' it and prevent it from taking part in the
  deciding process of whether to increase your CPU frequency.
 * sampling_down_factor:
  This parameter controls the rate at which the kernel makes a decision
  on when to decrease the frequency while running at top speed. When set
  to 1 (the default) decisions to reevaluate load are made at the same
  interval regardless of current clock speed. But when set to greater
  than 1 (e.g. 100) it acts as a multiplier for the scheduling interval
  for reevaluating load when the CPU is at its top speed due to high
  load. This improves performance by reducing the overhead of load
  evaluation and helping the CPU stay at its top speed when truly busy,
  rather than shifting back and forth in speed. This tunable has no
  effect on behavior at lower speeds/lower CPU loads.
 * powersave_bias:
  This parameter takes a value between 0 and 1000. It defines the
  percentage (times 10) value of the target frequency that will be
  shaved off of the target. For example, when set to 100 -- 10%, when
  ondemand governor would have targeted 1000 MHz, it will target
  1000 MHz - (10% of 1000 MHz) = 900 MHz instead. This is set to 0
  (disabled) by default.
  When AMD frequency sensitivity powersave bias driver --
  drivers/cpufreq/amd_freq_sensitivity.c is loaded, this parameter
  defines the workload frequency sensitivity threshold in which a lower
  frequency is chosen instead of ondemand governor's original target.
  The frequency sensitivity is a hardware reported (on AMD Family 16h
  Processors and above) value between 0 and 100% that tells software how
  the performance of the workload running on a CPU will change when
  frequency changes. A workload with sensitivity of 0% (memory/IO-bound)
  will not perform any better on higher core frequency, whereas a
  workload with sensitivity of 100% (CPU-bound) will perform better the
  higher the frequency. When the driver is loaded, this is set to 400 by
  default -- for CPUs running workloads with sensitivity value below
  40%, a lower frequency is chosen. Unloading the driver or writing 0
  will disable this feature.
 2.5 Conservative
 ----------------
 The CPUfreq governor "conservative", much like the "ondemand"
 governor, sets the CPU frequency depending on the current usage.  It
 differs in behaviour in that it gracefully increases and decreases the
 CPU speed rather than jumping to max speed the moment there is any load
 on the CPU. This behaviour is more suitable in a battery powered
 environment.  The governor is tweaked in the same manner as the
 "ondemand" governor through sysfs with the addition of:
 * freq_step:
  This describes what percentage steps the cpu freq should be increased
  and decreased smoothly by.  By default the cpu frequency will increase
  in 5% chunks of your maximum cpu frequency.  You can change this value
  to anywhere between 0 and 100 where '0' will effectively lock your CPU
  at a speed regardless of its load whilst '100' will, in theory, make
  it behave identically to the "ondemand" governor.
 * down_threshold:
  Same as the 'up_threshold' found for the "ondemand" governor but for
  the opposite direction.  For example when set to its default value of
  '20' it means that the CPU usage needs to be below 20% between
  samples to have the frequency decreased.
 * sampling_down_factor:
  Similar functionality as in "ondemand" governor.  But in
  "conservative", it controls the rate at which the kernel makes a
  decision on when to decrease the frequency while running in any speed.
  Load for frequency increase is still evaluated every sampling rate.
 2.6 Schedutil
 -------------
 The "schedutil" governor aims at better integration with the Linux
 kernel scheduler.  Load estimation is achieved through the scheduler's
 Per-Entity Load Tracking (PELT) mechanism, which also provides
 information about the recent load [1].  This governor currently does
 load based DVFS only for tasks managed by CFS. RT and DL scheduler tasks
 are always run at the highest frequency.  Unlike all the other
 governors, the code is located under the kernel/sched/ directory.
 Sysfs files:
 * rate_limit_us:
  This contains a value in microseconds. The governor waits for
  rate_limit_us time before reevaluating the load again, after it has
  evaluated the load once.
 For an in-depth comparison with the other governors refer to [2].
 3. The Governor Interface in the CPUfreq Core
 =============================================
 A new governor must register itself with the CPUfreq core using
 "cpufreq_register_governor". The struct cpufreq_governor, which has to
 be passed to that function, must contain the following values:
 governor->name - A unique name for this governor.
 governor->owner - THIS_MODULE for the governor module (if appropriate).
 plus a set of hooks to the functions implementing the governor's logic.
 The CPUfreq governor may call the CPU processor driver using one of
 these two functions:
 int cpufreq_driver_target(struct cpufreq_policy *policy,
                                 unsigned int target_freq,
                                 unsigned int relation);
 int __cpufreq_driver_target(struct cpufreq_policy *policy,
                                   unsigned int target_freq,
                                   unsigned int relation);
 target_freq must be within policy->min and policy->max, of course.
 What's the difference between these two functions? When your governor is
 in a direct code path of a call to governor callbacks, like
 governor->start(), the policy->rwsem is still held in the cpufreq core,
 and there's no need to lock it again (in fact, this would cause a
 deadlock). So use __cpufreq_driver_target only in these cases. In all
 other cases (for example, when there's a "daemonized" function that
 wakes up every second), use cpufreq_driver_target to take policy->rwsem
 before the command is passed to the cpufreq driver.
 4. References
 =============
 [1] Per-entity load tracking: https://lwn.net/Articles/531853/
 [2] Improvements in CPU frequency management: https://lwn.net/Articles/682391/
--- a/Documentation/cpu-freq/index.txt
+++ b/Documentation/cpu-freq/index.txt
@@ -21,8 +21,6 @@ Documents in this directory:
 amd-powernow.txt -	AMD powernow driver specific file.
 boost.txt -		Frequency boosting support.
 core.txt	-	General description of the CPUFreq core and
 			of CPUFreq notifiers.
@@ -32,17 +30,12 @@ cpufreq-nforce2.txt -	nVidia nForce2 platform specific file.
 cpufreq-stats.txt -	General description of sysfs cpufreq stats.
 governors.txt	-	What are cpufreq governors and how to
 			implement them?
 index.txt	-	File index, Mailing list and Links (this document)
 intel-pstate.txt -	Intel pstate cpufreq driver specific file.
 pcc-cpufreq.txt -	PCC cpufreq driver specific file.
 user-guide.txt	-	User Guide to CPUFreq
 Mailing List
 ------------
--- a/Documentation/cpu-freq/intel-pstate.txt
+++ b/Documentation/cpu-freq/intel-pstate.txt
@@ -1,281 +0,0 @@
 Intel P-State driver
 --------------------
 This driver provides an interface to control the P-State selection for the
 SandyBridge+ Intel processors.
 The following document explains P-States:
 http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf
 As stated in the document, P-State doesn’t exactly mean a frequency. However, for
 the sake of the relationship with cpufreq, P-State and frequency are used
 interchangeably.
 Understanding the cpufreq core governors and policies is important before
 discussing more details about the Intel P-State driver. Based on what callbacks
 a cpufreq driver provides to the cpufreq core, it can support two types of
 drivers:
 - with target_index() callback: In this mode, the drivers using cpufreq core
 simply provide the minimum and maximum frequency limits and an additional
 interface target_index() to set the current frequency. The cpufreq subsystem
 has a number of scaling governors ("performance", "powersave", "ondemand",
 etc.). Depending on which governor is in use, cpufreq core will call for
 transitions to a specific frequency using target_index() callback.
 - setpolicy() callback: In this mode, drivers do not provide target_index()
 callback, so cpufreq core can't request a transition to a specific frequency.
 The driver provides minimum and maximum frequency limits and callbacks to set a
 policy. The policy in cpufreq sysfs is referred to as the "scaling governor".
 The cpufreq core can request the driver to operate in any of the two policies:
 "performance" and "powersave". The driver decides which frequency to use based
 on the above policy selection considering minimum and maximum frequency limits.
 The Intel P-State driver falls under the latter category, which implements the
 setpolicy() callback. This driver decides what P-State to use based on the
 requested policy from the cpufreq core. If the processor is capable of
 selecting its next P-State internally, then the driver will offload this
 responsibility to the processor (aka HWP: Hardware P-States). If not, the
 driver implements algorithms to select the next P-State.
 Since these policies are implemented in the driver, they are not the same as the
 cpufreq scaling governors implementation, even if they have the same name in
 the cpufreq sysfs (scaling_governors). For example the "performance" policy is
 similar to cpufreq’s "performance" governor, but "powersave" is completely
 different than the cpufreq "powersave" governor. The strategy here is similar
 to cpufreq "ondemand", where the requested P-State is related to the system load.
 Sysfs Interface
 In addition to the frequency-controlling interfaces provided by the cpufreq
 core, the driver provides its own sysfs files to control the P-State selection.
 These files have been added to /sys/devices/system/cpu/intel_pstate/.
 Any changes made to these files are applicable to all CPUs (even in a
 multi-package system, Refer to later section on placing "Per-CPU limits").
      max_perf_pct: Limits the maximum P-State that will be requested by
      the driver. It states it as a percentage of the available performance. The
      available (P-State) performance may be reduced by the no_turbo
      setting described below.
      min_perf_pct: Limits the minimum P-State that will be requested by
      the driver. It states it as a percentage of the max (non-turbo)
      performance level.
      no_turbo: Limits the driver to selecting P-State below the turbo
      frequency range.
      turbo_pct: Displays the percentage of the total performance that
      is supported by hardware that is in the turbo range. This number
      is independent of whether turbo has been disabled or not.
      num_pstates: Displays the number of P-States that are supported
      by hardware. This number is independent of whether turbo has
      been disabled or not.
 For example, if a system has these parameters:
 	Max 1 core turbo ratio: 0x21 (Max 1 core ratio is the maximum P-State)
 	Max non turbo ratio: 0x17
 	Minimum ratio : 0x08 (Here the ratio is called max efficiency ratio)
 Sysfs will show :
 	max_perf_pct:100, which corresponds to 1 core ratio
 	min_perf_pct:24, max_efficiency_ratio / max 1 Core ratio
 	no_turbo:0, turbo is not disabled
 	num_pstates:26 = (max 1 Core ratio - Max Efficiency Ratio + 1)
 	turbo_pct:39 = (max 1 core ratio - max non turbo ratio) / num_pstates
 Refer to "Intel® 64 and IA-32 Architectures Software Developer’s Manual
 Volume 3: System Programming Guide" to understand ratios.
 There is one more sysfs attribute in /sys/devices/system/cpu/intel_pstate/
 that can be used for controlling the operation mode of the driver:
      status: Three settings are possible:
      "off"     - The driver is not in use at this time.
      "active"  - The driver works as a P-state governor (default).
      "passive" - The driver works as a regular cpufreq one and collaborates
                  with the generic cpufreq governors (it sets P-states as
                  requested by those governors).
      The current setting is returned by reads from this attribute.  Writing one
      of the above strings to it changes the operation mode as indicated by that
      string, if possible.  If HW-managed P-states (HWP) are enabled, it is not
      possible to change the driver's operation mode and attempts to write to
      this attribute will fail.
 cpufreq sysfs for Intel P-State
 Since this driver registers with cpufreq, cpufreq sysfs is also presented.
 There are some important differences, which need to be considered.
 scaling_cur_freq: This displays the real frequency which was used during
 the last sample period instead of what is requested. Some other cpufreq drivers,
 like acpi-cpufreq, display what is requested (Some changes are on the
 way to fix this for acpi-cpufreq driver). The same is true for frequencies
 displayed at /proc/cpuinfo.
 scaling_governor: This displays current active policy. Since each CPU has a
 cpufreq sysfs, it is possible to set a scaling governor to each CPU. But this
 is not possible with Intel P-States, as there is one common policy for all
 CPUs. Here, the last requested policy will be applicable to all CPUs. It is
 suggested that one use the cpupower utility to change policy to all CPUs at the
 same time.
 scaling_setspeed: This attribute can never be used with Intel P-State.
 scaling_max_freq/scaling_min_freq: This interface can be used similarly to
 the max_perf_pct/min_perf_pct of Intel P-State sysfs. However since frequencies
 are converted to nearest possible P-State, this is prone to rounding errors.
 This method is not preferred to limit performance.
 affected_cpus: Not used
 related_cpus: Not used
 For contemporary Intel processors, the frequency is controlled by the
 processor itself and the P-State exposed to software is related to
 performance levels.  The idea that frequency can be set to a single
 frequency is fictional for Intel Core processors. Even if the scaling
 driver selects a single P-State, the actual frequency the processor
 will run at is selected by the processor itself.
 Per-CPU limits
 The kernel command line option "intel_pstate=per_cpu_perf_limits" forces
 the intel_pstate driver to use per-CPU performance limits.  When it is set,
 the sysfs control interface described above is subject to limitations.
 - The following controls are not available for both read and write
 	/sys/devices/system/cpu/intel_pstate/max_perf_pct
 	/sys/devices/system/cpu/intel_pstate/min_perf_pct
 - The following controls can be used to set performance limits, as far as the
 architecture of the processor permits:
 	/sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq
 	/sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq
 	/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
 - User can still observe turbo percent and number of P-States from
 	/sys/devices/system/cpu/intel_pstate/turbo_pct
 	/sys/devices/system/cpu/intel_pstate/num_pstates
 - User can read and write the system-wide turbo status
 	/sys/devices/system/cpu/no_turbo
 Support of energy performance hints
 It is possible to provide hints to the HWP algorithms in the processor
 to be more performance centric to more energy centric. When the driver
 is using HWP, two additional cpufreq sysfs attributes are presented for
 each logical CPU.
 These attributes are:
 	- energy_performance_available_preferences
 	- energy_performance_preference
 To get list of supported hints:
 $ cat energy_performance_available_preferences
    default performance balance_performance balance_power power
 The current preference can be read or changed via cpufreq sysfs
 attribute "energy_performance_preference". Reading from this attribute
 will display current effective setting. User can write any of the valid
 preference strings to this attribute. User can always restore to power-on
 default by writing "default".
 Since threads can migrate to different CPUs, it is possible that the
 new CPU may have different energy performance preference than the previous
 one. To avoid such issues, either threads can be pinned to specific CPUs
 or set the same energy performance preference value to all CPUs.
 Tuning Intel P-State driver
 When the performance can be tuned using PID (Proportional Integral
 Derivative) controller, debugfs files are provided for adjusting performance.
 They are presented under:
 /sys/kernel/debug/pstate_snb/
 The PID tunable parameters are:
      deadband
      d_gain_pct
      i_gain_pct
      p_gain_pct
      sample_rate_ms
      setpoint
 To adjust these parameters, some understanding of driver implementation is
 necessary. There are some tweaks described here, but be very careful. Adjusting
 them requires expert level understanding of power and performance relationship.
 These limits are only useful when the "powersave" policy is active.
 -To make the system more responsive to load changes, sample_rate_ms can
 be adjusted  (current default is 10ms).
 -To make the system use higher performance, even if the load is lower, setpoint
 can be adjusted to a lower number. This will also lead to faster ramp up time
 to reach the maximum P-State.
 If there are no derivative and integral coefficients, the next P-State will be
 equal to:
 	current P-State - ((setpoint - current cpu load) * p_gain_pct)
 For example, if the current PID parameters are (Which are defaults for the core
 processors like SandyBridge):
      deadband = 0
      d_gain_pct = 0
      i_gain_pct = 0
      p_gain_pct = 20
      sample_rate_ms = 10
      setpoint = 97
 If the current P-State = 0x08 and current load = 100, this will result in the
 next P-State = 0x08 - ((97 - 100) * 0.2) = 8.6 (rounded to 9). Here the P-State
 goes up by only 1. If during next sample interval the current load doesn't
 change and still 100, then P-State goes up by one again. This process will
 continue as long as the load is more than the setpoint until the maximum P-State
 is reached.
 For the same load at setpoint = 60, this will result in the next P-State
 = 0x08 - ((60 - 100) * 0.2) = 16
 So by changing the setpoint from 97 to 60, there is an increase of the
 next P-State from 9 to 16. So this will make processor execute at higher
 P-State for the same CPU load. If the load continues to be more than the
 setpoint during next sample intervals, then P-State will go up again till the
 maximum P-State is reached. But the ramp up time to reach the maximum P-State
 will be much faster when the setpoint is 60 compared to 97.
 Debugging Intel P-State driver
 Event tracing
 To debug P-State transition, the Linux event tracing interface can be used.
 There are two specific events, which can be enabled (Provided the kernel
 configs related to event tracing are enabled).
 # cd /sys/kernel/debug/tracing/
 # echo 1 > events/power/pstate_sample/enable
 # echo 1 > events/power/cpu_frequency/enable
 # cat trace
 gnome-terminal--4510  [001] ..s.  1177.680733: pstate_sample: core_busy=107
 	scaled=94 from=26 to=26 mperf=1143818 aperf=1230607 tsc=29838618
 		freq=2474476
 cat-5235  [002] ..s.  1177.681723: cpu_frequency: state=2900000 cpu_id=2
 Using ftrace
 If function level tracing is required, the Linux ftrace interface can be used.
 For example if we want to check how often a function to set a P-State is
 called, we can set ftrace filter to intel_pstate_set_pstate.
 # cd /sys/kernel/debug/tracing/
 # cat available_filter_functions | grep -i pstate
 intel_pstate_set_pstate
 intel_pstate_cpu_init
 ...
 # echo intel_pstate_set_pstate > set_ftrace_filter
 # echo function > current_tracer
 # cat trace | head -15
 # tracer: function
 #
 # entries-in-buffer/entries-written: 80/80   #P:4
 #
 #                              _-----=> irqs-off
 #                             / _----=> need-resched
 #                            | / _---=> hardirq/softirq
 #                            || / _--=> preempt-depth
 #                            ||| /     delay
 #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
 #              | |       |   ||||       |         |
            Xorg-3129  [000] ..s.  2537.644844: intel_pstate_set_pstate <-intel_pstate_timer_func
 gnome-terminal--4510  [002] ..s.  2537.649844: intel_pstate_set_pstate <-intel_pstate_timer_func
     gnome-shell-3409  [001] ..s.  2537.650850: intel_pstate_set_pstate <-intel_pstate_timer_func
          <idle>-0     [000] ..s.  2537.654843: intel_pstate_set_pstate <-intel_pstate_timer_func
--- a/Documentation/cpu-freq/user-guide.txt
+++ b/Documentation/cpu-freq/user-guide.txt
@@ -1,228 +0,0 @@
     CPU frequency and voltage scaling code in the Linux(TM) kernel
 		         L i n u x    C P U F r e q
 			     U S E R   G U I D E
 		    Dominik Brodowski  <linux@brodo.de>
   Clock scaling allows you to change the clock speed of the CPUs on the
    fly. This is a nice method to save battery power, because the lower
            the clock speed, the less power the CPU consumes.
 Contents:
 ---------
 1. Supported Architectures and Processors
 1.1 ARM and ARM64
 1.2 x86
 1.3 sparc64
 1.4 ppc
 1.5 SuperH
 1.6 Blackfin
 2. "Policy" / "Governor"?
 2.1 Policy
 2.2 Governor
 3. How to change the CPU cpufreq policy and/or speed
 3.1 Preferred interface: sysfs
 1. Supported Architectures and Processors
 =========================================
 1.1 ARM and ARM64
 -----------------
 Almost all ARM and ARM64 platforms support CPU frequency scaling.
 1.2 x86
 -------
 The following processors for the x86 architecture are supported by cpufreq:
 AMD Elan - SC400, SC410
 AMD mobile K6-2+
 AMD mobile K6-3+
 AMD mobile Duron
 AMD mobile Athlon
 AMD Opteron
 AMD Athlon 64
 Cyrix Media GXm
 Intel mobile PIII and Intel mobile PIII-M on certain chipsets
 Intel Pentium 4, Intel Xeon
 Intel Pentium M (Centrino)
 National Semiconductors Geode GX
 Transmeta Crusoe
 Transmeta Efficeon
 VIA Cyrix 3 / C3
 various processors on some ACPI 2.0-compatible systems [*]
 And many more
 [*] Only if "ACPI Processor Performance States" are available
 to the ACPI<->BIOS interface.
 1.3 sparc64
 -----------
 The following processors for the sparc64 architecture are supported by
 cpufreq:
 UltraSPARC-III
 1.4 ppc
 -------
 Several "PowerBook" and "iBook2" notebooks are supported.
 The following POWER processors are supported in powernv mode:
 POWER8
 POWER9
 1.5 SuperH
 ----------
 All SuperH processors supporting rate rounding through the clock
 framework are supported by cpufreq.
 1.6 Blackfin
 ------------
 The following Blackfin processors are supported by cpufreq:
 BF522, BF523, BF524, BF525, BF526, BF527, Rev 0.1 or higher
 BF531, BF532, BF533, Rev 0.3 or higher
 BF534, BF536, BF537, Rev 0.2 or higher
 BF561, Rev 0.3 or higher
 BF542, BF544, BF547, BF548, BF549, Rev 0.1 or higher
 2. "Policy" / "Governor" ?
 ==========================
 Some CPU frequency scaling-capable processors switch between various
 frequencies and operating voltages "on the fly" without any kernel or
 user involvement. This guarantees very fast switching to a frequency
 which is high enough to serve the user's needs, but low enough to save
 power.
 2.1 Policy
 ----------
 On these systems, all you can do is select the lower and upper
 frequency limit as well as whether you want more aggressive
 power-saving or more instantly available processing power.
 2.2 Governor
 ------------
 On all other cpufreq implementations, these boundaries still need to
 be set. Then, a "governor" must be selected. Such a "governor" decides
 what speed the processor shall run within the boundaries. One such
 "governor" is the "userspace" governor. This one allows the user - or
 a yet-to-implement userspace program - to decide what specific speed
 the processor shall run at.
 3. How to change the CPU cpufreq policy and/or speed
 ====================================================
 3.1 Preferred Interface: sysfs
 ------------------------------
 The preferred interface is located in the sysfs filesystem. If you
 mounted it at /sys, the cpufreq interface is located in a subdirectory
 "cpufreq" within the cpu-device directory
 (e.g. /sys/devices/system/cpu/cpu0/cpufreq/ for the first CPU).
 affected_cpus :			List of Online CPUs that require software
 				coordination of frequency.
 cpuinfo_cur_freq :		Current frequency of the CPU as obtained from
 				the hardware, in KHz. This is the frequency
 				the CPU actually runs at.
 cpuinfo_min_freq :		this file shows the minimum operating
 				frequency the processor can run at(in kHz) 
 cpuinfo_max_freq :		this file shows the maximum operating
 				frequency the processor can run at(in kHz) 
 cpuinfo_transition_latency	The time it takes on this CPU to
 				switch between two frequencies in nano
 				seconds. If unknown or known to be
 				that high that the driver does not
 				work with the ondemand governor, -1
 				(CPUFREQ_ETERNAL) will be returned.
 				Using this information can be useful
 				to choose an appropriate polling
 				frequency for a kernel governor or
 				userspace daemon. Make sure to not
 				switch the frequency too often
 				resulting in performance loss.
 related_cpus :			List of Online + Offline CPUs that need software
 				coordination of frequency.
 scaling_available_frequencies : List of available frequencies, in KHz.
 scaling_available_governors :	this file shows the CPUfreq governors
 				available in this kernel. You can see the
 				currently activated governor in
 scaling_cur_freq :		Current frequency of the CPU as determined by
 				the governor and cpufreq core, in KHz. This is
 				the frequency the kernel thinks the CPU runs
 				at.
 scaling_driver :		this file shows what cpufreq driver is
 				used to set the frequency on this CPU
 scaling_governor,		and by "echoing" the name of another
 				governor you can change it. Please note
 				that some governors won't load - they only
 				work on some specific architectures or
 				processors.
 scaling_min_freq and
 scaling_max_freq		show the current "policy limits" (in
 				kHz). By echoing new values into these
 				files, you can change these limits.
 				NOTE: when setting a policy you need to
 				first set scaling_max_freq, then
 				scaling_min_freq.
 scaling_setspeed		This can be read to get the currently programmed
 				value by the governor. This can be written to
 				change the current frequency for a group of
 				CPUs, represented by a policy. This is supported
 				currently only by the userspace governor.
 bios_limit :			If the BIOS tells the OS to limit a CPU to
 				lower frequencies, the user can read out the
 				maximum available frequency from this file.
 				This typically can happen through (often not
 				intended) BIOS settings, restrictions
 				triggered through a service processor or other
 				BIOS/HW based implementations.
 				This does not cover thermal ACPI limitations
 				which can be detected through the generic
 				thermal driver.
 If you have selected the "userspace" governor which allows you to
 set the CPU operating frequency to a specific value, you can read out
 the current frequency in
 scaling_setspeed.		By "echoing" a new frequency into this
 				you can change the speed of the CPU,
 				but only within the limits of
 				scaling_min_freq and scaling_max_freq.
--- a/Documentation/cputopology.txt
+++ b/Documentation/cputopology.txt
@@ -100,7 +100,7 @@ not defined by include/asm-XXX/topology.h:
 For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
 default definitions for topology_book_id() and topology_book_cpumask().
-For architectures that don't support drawes (CONFIG_SCHED_DRAWER) there are
+For architectures that don't support drawers (CONFIG_SCHED_DRAWER) there are
 no default definitions for topology_drawer_id() and topology_drawer_cpumask().
 Additionally, CPU topology information is provided under
--- a/Documentation/crypto/api-samples.rst
+++ b/Documentation/crypto/api-samples.rst
@@ -155,9 +155,9 @@ Code Example For Use of Operational State Memory With SHASH
        char ctx[];
    };
-    static struct sdescinit_sdesc(struct crypto_shash *alg)
+    static struct sdesc init_sdesc(struct crypto_shash *alg)
    {
-        struct sdescsdesc;
+        struct sdesc sdesc;
        int size;
        size = sizeof(struct shash_desc) + crypto_shash_descsize(alg);
@@ -172,7 +172,7 @@ Code Example For Use of Operational State Memory With SHASH
     static int calc_hash(struct crypto_shash *alg,
                  const unsigned char *data, unsigned int datalen,
                  unsigned char *digest) {
-        struct sdescsdesc;
+        struct sdesc sdesc;
        int ret;
        sdesc = init_sdesc(alg);
--- a/Documentation/crypto/asymmetric-keys.txt
+++ b/Documentation/crypto/asymmetric-keys.txt
@@ -311,3 +311,54 @@ Functions are provided to register and unregister parsers:
 Parsers may not have the same name.  The names are otherwise only used for
 displaying in debugging messages.
 =========================
 KEYRING LINK RESTRICTIONS
 =========================
 Keyrings created from userspace using add_key can be configured to check the
 signature of the key being linked.
 Several restriction methods are available:
 (1) Restrict using the kernel builtin trusted keyring
     - Option string used with KEYCTL_RESTRICT_KEYRING:
       - "builtin_trusted"
     The kernel builtin trusted keyring will be searched for the signing
     key. The ca_keys kernel parameter also affects which keys are used for
     signature verification.
 (2) Restrict using the kernel builtin and secondary trusted keyrings
     - Option string used with KEYCTL_RESTRICT_KEYRING:
       - "builtin_and_secondary_trusted"
     The kernel builtin and secondary trusted keyrings will be searched for the
     signing key. The ca_keys kernel parameter also affects which keys are used
     for signature verification.
 (3) Restrict using a separate key or keyring
     - Option string used with KEYCTL_RESTRICT_KEYRING:
       - "key_or_keyring:<key or keyring serial number>[:chain]"
     Whenever a key link is requested, the link will only succeed if the key
     being linked is signed by one of the designated keys. This key may be
     specified directly by providing a serial number for one asymmetric key, or
     a group of keys may be searched for the signing key by providing the
     serial number for a keyring.
     When the "chain" option is provided at the end of the string, the keys
     within the destination keyring will also be searched for signing keys.
     This allows for verification of certificate chains by adding each
     cert in order (starting closest to the root) to one keyring.
 In all of these cases, if the signing key is found the signature of the key to
 be linked will be verified using the signing key.  The requested key is added
 to the keyring only if the signature is successfully verified.  -ENOKEY is
 returned if the parent certificate could not be found, or -EKEYREJECTED is
 returned if the signature check fails or the key is blacklisted.  Other errors
 may be returned if the signature check could not be performed.
--- a/Documentation/debugging-via-ohci1394.txt
+++ b/Documentation/debugging-via-ohci1394.txt
@@ -100,8 +100,8 @@ Step-by-step instructions for using firescope with early OHCI initialization:
   CardBus and even some Express cards which are fully compliant to OHCI-1394
   specification are available. If it requires no driver for Windows operating
   systems, it most likely is. Only specialized shops have cards which are not
-   compliant, they are based on TI PCILynx chips and require drivers for Win-
+   compliant, they are based on TI PCILynx chips and require drivers for Windows
-   dows operating systems.
+   operating systems.
   The mentioned kernel log message contains the string "physUB" if the
   controller implements a writable Physical Upper Bound register.  This is
--- a/Documentation/device-mapper/cache.txt
+++ b/Documentation/device-mapper/cache.txt
@@ -290,7 +290,7 @@ message, which takes an arbitrary number of cblock ranges.  Each cblock
 range's end value is "one past the end", meaning 5-10 expresses a range
 of values from 5 to 9.  Each cblock must be expressed as a decimal
 value, in the future a variant message that takes cblock ranges
-expressed in hexidecimal may be needed to better support efficient
+expressed in hexadecimal may be needed to better support efficient
 invalidation of larger caches.  The cache must be in passthrough mode
 when invalidate_cblocks is used.
--- a/Documentation/device-mapper/dm-crypt.txt
+++ b/Documentation/device-mapper/dm-crypt.txt
@@ -11,14 +11,31 @@ Parameters: <cipher> <key> <iv_offset> <device path> \
 	      <offset> [<#opt_params> <opt_params>]
 <cipher>
-    Encryption cipher and an optional IV generation mode.
+    Encryption cipher, encryption mode and Initial Vector (IV) generator.
    (In format cipher[:keycount]-chainmode-ivmode[:ivopts]).
    Examples:
       des
       aes-cbc-essiv:sha256
       twofish-ecb
-    /proc/crypto contains supported crypto modes
+    The cipher specifications format is:
       cipher[:keycount]-chainmode-ivmode[:ivopts]
    Examples:
       aes-cbc-essiv:sha256
       aes-xts-plain64
       serpent-xts-plain64
    Cipher format also supports direct specification with kernel crypt API
    format (selected by capi: prefix). The IV specification is the same
    as for the first format type.
    This format is mainly used for specification of authenticated modes.
    The crypto API cipher specifications format is:
        capi:cipher_api_spec-ivmode[:ivopts]
    Examples:
        capi:cbc(aes)-essiv:sha256
        capi:xts(aes)-plain64
    Examples of authenticated modes:
        capi:gcm(aes)-random
        capi:authenc(hmac(sha256),xts(aes))-random
        capi:rfc7539(chacha20,poly1305)-random
     The /proc/crypto contains a list of currently loaded crypto modes.
 <key>
    Key used for encryption. It is encoded either as a hexadecimal number
@@ -93,6 +110,32 @@ submit_from_crypt_cpus
    thread because it benefits CFQ to have writes submitted using the
    same context.
 integrity:<bytes>:<type>
    The device requires additional <bytes> metadata per-sector stored
     in per-bio integrity structure. This metadata must be provided
    by underlying dm-integrity target.
    The <type> can be "none" if metadata is used only for persistent IV.
    For Authenticated Encryption with Additional Data (AEAD)
    the <type> is "aead". An AEAD mode additionally calculates and verifies
    integrity for the encrypted device. The additional space is then
    used for storing authentication tag (and persistent IV if needed).
 sector_size:<bytes>
    Use <bytes> as the encryption unit instead of 512 bytes sectors.
    This option can be in range 512 - 4096 bytes and must be power of two.
    Virtual device will announce this size as a minimal IO and logical sector.
 iv_large_sectors
   IV generators will use sector number counted in <sector_size> units
   instead of default 512 bytes sectors.
   For example, if <sector_size> is 4096 bytes, plain64 IV for the second
   sector will be 8 (without flag) and 1 if iv_large_sectors is present.
   The <iv_offset> must be multiple of <sector_size> (in 512 bytes units)
   if this flag is specified.
 Example scripts
 ===============
 LUKS (Linux Unified Key Setup) is now the preferred way to set up disk
--- a/Documentation/device-mapper/dm-integrity.txt
+++ b/Documentation/device-mapper/dm-integrity.txt
@@ -0,0 +1,199 @@
 The dm-integrity target emulates a block device that has additional
 per-sector tags that can be used for storing integrity information.
 A general problem with storing integrity tags with every sector is that
 writing the sector and the integrity tag must be atomic - i.e. in case of
 crash, either both sector and integrity tag or none of them is written.
 To guarantee write atomicity, the dm-integrity target uses journal, it
 writes sector data and integrity tags into a journal, commits the journal
 and then copies the data and integrity tags to their respective location.
 The dm-integrity target can be used with the dm-crypt target - in this
 situation the dm-crypt target creates the integrity data and passes them
 to the dm-integrity target via bio_integrity_payload attached to the bio.
 In this mode, the dm-crypt and dm-integrity targets provide authenticated
 disk encryption - if the attacker modifies the encrypted device, an I/O
 error is returned instead of random data.
 The dm-integrity target can also be used as a standalone target, in this
 mode it calculates and verifies the integrity tag internally. In this
 mode, the dm-integrity target can be used to detect silent data
 corruption on the disk or in the I/O path.
 When loading the target for the first time, the kernel driver will format
 the device. But it will only format the device if the superblock contains
 zeroes. If the superblock is neither valid nor zeroed, the dm-integrity
 target can't be loaded.
 To use the target for the first time:
 1. overwrite the superblock with zeroes
 2. load the dm-integrity target with one-sector size, the kernel driver
 	will format the device
 3. unload the dm-integrity target
 4. read the "provided_data_sectors" value from the superblock
 5. load the dm-integrity target with the target size
 	"provided_data_sectors"
 6. if you want to use dm-integrity with dm-crypt, load the dm-crypt target
 	with the size "provided_data_sectors"
 Target arguments:
 1. the underlying block device
 2. the number of reserved sectors at the beginning of the device - the
 	dm-integrity won't read or write these sectors
 3. the size of the integrity tag (if "-" is used, the size is taken from
 	the internal-hash algorithm)
 4. mode:
 	D - direct writes (without journal) - in this mode, journaling is
 		not used and data sectors and integrity tags are written
 		separately. In case of crash, it is possible that the data
 		and integrity tag don't match.
 	J - journaled writes - data and integrity tags are written to the
 		journal and atomicity is guaranteed. In case of crash,
 		either both data and tag or none of them are written. The
 		journaled mode degrades write throughput twice because the
 		data have to be written twice.
 	R - recovery mode - in this mode, journal is not replayed,
 		checksums are not checked and writes to the device are not
 		allowed. This mode is useful for data recovery if the
 		device cannot be activated in any of the other standard
 		modes.
 5. the number of additional arguments
 Additional arguments:
 journal_sectors:number
 	The size of journal, this argument is used only if formatting the
 	device. If the device is already formatted, the value from the
 	superblock is used.
 interleave_sectors:number
 	The number of interleaved sectors. This value is rounded down to
 	a power of two. If the device is already formatted, the value from
 	the superblock is used.
 buffer_sectors:number
 	The number of sectors in one buffer. The value is rounded down to
 	a power of two.
 	The tag area is accessed using buffers, the buffer size is
 	configurable. The large buffer size means that the I/O size will
 	be larger, but there could be less I/Os issued.
 journal_watermark:number
 	The journal watermark in percents. When the size of the journal
 	exceeds this watermark, the thread that flushes the journal will
 	be started.
 commit_time:number
 	Commit time in milliseconds. When this time passes, the journal is
 	written. The journal is also written immediately if the FLUSH
 	request is received.
 internal_hash:algorithm(:key)	(the key is optional)
 	Use internal hash or crc.
 	When this argument is used, the dm-integrity target won't accept
 	integrity tags from the upper target, but it will automatically
 	generate and verify the integrity tags.
 	You can use a crc algorithm (such as crc32), then integrity target
 	will protect the data against accidental corruption.
 	You can also use a hmac algorithm (for example
 	"hmac(sha256):0123456789abcdef"), in this mode it will provide
 	cryptographic authentication of the data without encryption.
 	When this argument is not used, the integrity tags are accepted
 	from an upper layer target, such as dm-crypt. The upper layer
 	target should check the validity of the integrity tags.
 journal_crypt:algorithm(:key)	(the key is optional)
 	Encrypt the journal using given algorithm to make sure that the
 	attacker can't read the journal. You can use a block cipher here
 	(such as "cbc(aes)") or a stream cipher (for example "chacha20",
 	"salsa20", "ctr(aes)" or "ecb(arc4)").
 	The journal contains history of last writes to the block device,
 	an attacker reading the journal could see the last sector numbers
 	that were written. From the sector numbers, the attacker can infer
 	the size of files that were written. To protect against this
 	situation, you can encrypt the journal.
 journal_mac:algorithm(:key)	(the key is optional)
 	Protect sector numbers in the journal from accidental or malicious
 	modification. To protect against accidental modification, use a
 	crc algorithm, to protect against malicious modification, use a
 	hmac algorithm with a key.
 	This option is not needed when using internal-hash because in this
 	mode, the integrity of journal entries is checked when replaying
 	the journal. Thus, modified sector number would be detected at
 	this stage.
 block_size:number
 	The size of a data block in bytes.  The larger the block size the
 	less overhead there is for per-block integrity metadata.
 	Supported values are 512, 1024, 2048 and 4096 bytes.  If not
 	specified the default block size is 512 bytes.
 The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can
 be changed when reloading the target (load an inactive table and swap the
 tables with suspend and resume). The other arguments should not be changed
 when reloading the target because the layout of disk data depends on them
 and the reloaded target would be non-functional.
 The layout of the formatted block device:
 * reserved sectors (they are not used by this target, they can be used for
  storing LUKS metadata or for other purpose), the size of the reserved
  area is specified in the target arguments
 * superblock (4kiB)
 	* magic string - identifies that the device was formatted
 	* version
 	* log2(interleave sectors)
 	* integrity tag size
 	* the number of journal sections
 	* provided data sectors - the number of sectors that this target
 	  provides (i.e. the size of the device minus the size of all
 	  metadata and padding). The user of this target should not send
 	  bios that access data beyond the "provided data sectors" limit.
 	* flags - a flag is set if journal_mac is used
 * journal
 	The journal is divided into sections, each section contains:
 	* metadata area (4kiB), it contains journal entries
 	  every journal entry contains:
 		* logical sector (specifies where the data and tag should
 		  be written)
 		* last 8 bytes of data
 		* integrity tag (the size is specified in the superblock)
 	    every metadata sector ends with
 		* mac (8-bytes), all the macs in 8 metadata sectors form a
 		  64-byte value. It is used to store hmac of sector
 		  numbers in the journal section, to protect against a
 		  possibility that the attacker tampers with sector
 		  numbers in the journal.
 		* commit id
 	* data area (the size is variable; it depends on how many journal
 	  entries fit into the metadata area)
 	    every sector in the data area contains:
 		* data (504 bytes of data, the last 8 bytes are stored in
 		  the journal entry)
 		* commit id
 	To test if the whole journal section was written correctly, every
 	512-byte sector of the journal ends with 8-byte commit id. If the
 	commit id matches on all sectors in a journal section, then it is
 	assumed that the section was written correctly. If the commit id
 	doesn't match, the section was written partially and it should not
 	be replayed.
 * one or more runs of interleaved tags and data. Each run contains:
 	* tag area - it contains integrity tags. There is one tag for each
 	  sector in the data area
 	* data area - it contains data sectors. The number of data sectors
 	  in one run must be a power of two. log2 of this value is stored
 	  in the superblock.
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -170,6 +170,13 @@ The target is named "raid" and it accepts the following parameters:
 		Takeover/reshape is not possible with a raid4/5/6 journal device;
 		it has to be deconfigured before requesting these.
 	[journal_mode <mode>]
 		This option sets the caching mode on journaled raid4/5/6 raid sets
 		(see 'journal_dev <dev>' above) to 'writethrough' or 'writeback'.
 		If 'writeback' is selected the journal device has to be resilient
 		and must not suffer from the 'write hole' problem itself (e.g. use
 		raid1 or raid10) to avoid a single point of failure.
 <#raid_devs>: The number of devices composing the array.
 	Each device consists of two entries.  The first is the device
 	containing the metadata (if any); the second is the one containing the
@@ -254,7 +261,8 @@ recovery.  Here is a fuller description of the individual fields:
 	<data_offset>   The current data offset to the start of the user data on
 			each component device of a raid set (see the respective
 			raid parameter to support out-of-place reshaping).
-	<journal_char>	'A' - active raid4/5/6 journal device.
+	<journal_char>	'A' - active write-through journal device.
 			'a' - active write-back journal device.
 			'D' - dead journal device.
 			'-' - no journal device.
@@ -331,3 +339,7 @@ Version History
 	'D' on the status line.  If '- -' is passed into the constructor, emit
 	'- -' on the table line and '-' as the status line health character.
 1.10.0  Add support for raid4/5/6 journal device
 1.10.1  Fix data corruption on reshape request
 1.11.0  Fix table line argument order
 	(wrong raid10_copies/raid10_format sequence)
 1.11.1  Add raid4/5/6 journal write-back support via journal_mode option
--- a/Documentation/devicetree/bindings/arm/amlogic.txt
+++ b/Documentation/devicetree/bindings/arm/amlogic.txt
@@ -43,8 +43,11 @@ Board compatible values:
  - "wetek,hub" (Meson gxbb)
  - "wetek,play2" (Meson gxbb)
  - "amlogic,p212" (Meson gxl s905x)
  - "khadas,vim" (Meson gxl s905x)
  - "amlogic,p230" (Meson gxl s905d)
  - "amlogic,p231" (Meson gxl s905d)
  - "hwacom,amazetv" (Meson gxl s905x)
  - "amlogic,q200" (Meson gxm s912)
  - "amlogic,q201" (Meson gxm s912)
  - "nexbox,a95x" (Meson gxbb or Meson gxl s905x)
--- a/Documentation/devicetree/bindings/arm/atmel-at91.txt
+++ b/Documentation/devicetree/bindings/arm/atmel-at91.txt
@@ -217,7 +217,8 @@ memory, bridge implementations, processor and other functionality not controlled
 elsewhere.
 required properties:
- compatible: Should be "atmel,<chip>-sfr", "syscon".
+- compatible: Should be "atmel,<chip>-sfr", "syscon" or
 	"atmel,<chip>-sfrbu", "syscon"
  <chip> can be "sama5d3", "sama5d4" or "sama5d2".
 - reg: Should contain registers location and length
--- a/Documentation/devicetree/bindings/arm/cavium-thunder2.txt
+++ b/Documentation/devicetree/bindings/arm/cavium-thunder2.txt
@@ -0,0 +1,8 @@
 Cavium ThunderX2 CN99XX platform tree bindings
 ----------------------------------------------
 Boards with Cavium ThunderX2 CN99XX SoC shall have the root property:
  compatible = "cavium,thunderx2-cn9900", "brcm,vulcan-soc";
 These SoCs use the "cavium,thunder2" core which will be compatible
 with "brcm,vulcan".
--- a/Documentation/devicetree/bindings/arm/cpus.txt
+++ b/Documentation/devicetree/bindings/arm/cpus.txt
@@ -170,6 +170,7 @@ nodes to be present and contain the properties described below.
 			    "brcm,brahma-b15"
 			    "brcm,vulcan"
 			    "cavium,thunder"
 			    "cavium,thunder2"
 			    "faraday,fa526"
 			    "intel,sa110"
 			    "intel,sa1100"
--- a/Documentation/devicetree/bindings/arm/firmware/linaro,optee-tz.txt
+++ b/Documentation/devicetree/bindings/arm/firmware/linaro,optee-tz.txt
@@ -0,0 +1,31 @@
 OP-TEE Device Tree Bindings
 OP-TEE is a piece of software using hardware features to provide a Trusted
 Execution Environment. The security can be provided with ARM TrustZone, but
 also by virtualization or a separate chip.
 We're using "linaro" as the first part of the compatible property for
 the reference implementation maintained by Linaro.
 * OP-TEE based on ARM TrustZone required properties:
 - compatible     : should contain "linaro,optee-tz"
 - method         : The method of calling the OP-TEE Trusted OS. Permitted
                   values are:
                   "smc" : SMC #0, with the register assignments specified
 		           in drivers/tee/optee/optee_smc.h
                   "hvc" : HVC #0, with the register assignments specified
 		           in drivers/tee/optee/optee_smc.h
 Example:
 	firmware {
 		optee {
 			compatible = "linaro,optee-tz";
 			method = "smc";
 		};
 	};
--- a/Documentation/devicetree/bindings/arm/fsl.txt
+++ b/Documentation/devicetree/bindings/arm/fsl.txt
@@ -179,6 +179,18 @@ LS1046A ARMv8 based RDB Board
 Required root node properties:
    - compatible = "fsl,ls1046a-rdb", "fsl,ls1046a";
 LS1088A SoC
 Required root node properties:
    - compatible = "fsl,ls1088a";
 LS1088A ARMv8 based QDS Board
 Required root node properties:
    - compatible = "fsl,ls1088a-qds", "fsl,ls1088a";
 LS1088A ARMv8 based RDB Board
 Required root node properties:
    - compatible = "fsl,ls1088a-rdb", "fsl,ls1088a";
 LS2080A SoC
 Required root node properties:
    - compatible = "fsl,ls2080a";
@@ -195,3 +207,14 @@ LS2080A ARMv8 based RDB Board
 Required root node properties:
    - compatible = "fsl,ls2080a-rdb", "fsl,ls2080a";
 LS2088A SoC
 Required root node properties:
    - compatible = "fsl,ls2088a";
 LS2088A ARMv8 based QDS Board
 Required root node properties:
    - compatible = "fsl,ls2088a-qds", "fsl,ls2088a";
 LS2088A ARMv8 based RDB Board
 Required root node properties:
    - compatible = "fsl,ls2088a-rdb", "fsl,ls2088a";
--- a/Documentation/devicetree/bindings/arm/gemini.txt
+++ b/Documentation/devicetree/bindings/arm/gemini.txt
@@ -0,0 +1,86 @@
 Cortina systems Gemini platforms
 The Gemini SoC is the project name for an ARMv4 FA525-based SoC originally
 produced by Storlink Semiconductor around 2005. The company was
 later renamed Storm Semiconductor. The chip product name is Storlink SL3516.
 It was derived from earlier products from Storm named SL3316 (Centroid) and
 SL3512 (Bulverde).
 Storm Semiconductor was acquired by Cortina Systems in 2008 and the SoC was
 produced and used for NAS and similar usecases. In 2014 Cortina Systems was
 in turn acquired by Inphi, who seem to have discontinued this product family.
 Many of the IP blocks used in the SoC come from Faraday Technology.
 Required properties (in root node):
 	compatible = "cortina,gemini";
 Required nodes:
 - soc: the SoC should be represented by a simple bus encompassing all the
  onchip devices, this is referred to as the soc bus node.
 - syscon: the soc bus node must have a system controller node pointing to the
  global control registers, with the compatible string
  "cortina,gemini-syscon", "syscon";
 - timer: the soc bus node must have a timer node pointing to the SoC timer
  block, with the compatible string "cortina,gemini-timer"
  See: clocksource/cortina,gemini-timer.txt
 - interrupt-controller: the soc bus node must have an interrupt controller
  node pointing to the SoC interrupt controller block, with the compatible
  string "cortina,gemini-interrupt-controller"
  See interrupt-controller/cortina,gemini-interrupt-controller.txt
 Example:
 / {
 	model = "Foo Gemini Machine";
 	compatible = "cortina,gemini";
 	#address-cells = <1>;
 	#size-cells = <1>;
 	memory {
 		device_type = "memory";
 		reg = <0x00000000 0x8000000>;
 	};
 	soc {
 		#address-cells = <1>;
 		#size-cells = <1>;
 		ranges;
 		compatible = "simple-bus";
 		interrupt-parent = <&intcon>;
 		syscon: syscon@40000000 {
 			compatible = "cortina,gemini-syscon", "syscon";
 			reg = <0x40000000 0x1000>;
 		};
 		uart0: serial@42000000 {
 			compatible = "ns16550a";
 			reg = <0x42000000 0x100>;
 			clock-frequency = <48000000>;
 			interrupts = <18 IRQ_TYPE_LEVEL_HIGH>;
 			reg-shift = <2>;
 		};
 		timer@43000000 {
 			compatible = "cortina,gemini-timer";
 			reg = <0x43000000 0x1000>;
 			interrupt-parent = <&intcon>;
 			interrupts = <14 IRQ_TYPE_EDGE_FALLING>, /* Timer 1 */
 				     <15 IRQ_TYPE_EDGE_FALLING>, /* Timer 2 */
 				     <16 IRQ_TYPE_EDGE_FALLING>; /* Timer 3 */
 			syscon = <&syscon>;
 		};
 		intcon: interrupt-controller@48000000 {
 			compatible = "cortina,gemini-interrupt-controller";
 			reg = <0x48000000 0x1000>;
 			interrupt-controller;
 			#interrupt-cells = <2>;
 		};
 	};
 };
--- a/Documentation/devicetree/bindings/arm/hisilicon/hisilicon.txt
+++ b/Documentation/devicetree/bindings/arm/hisilicon/hisilicon.txt
@@ -4,6 +4,14 @@ Hi3660 SoC
 Required root node properties:
 	- compatible = "hisilicon,hi3660";
 Hi3798cv200 SoC
 Required root node properties:
 	- compatible = "hisilicon,hi3798cv200";
 Hi3798cv200 Poplar Board
 Required root node properties:
 	- compatible = "hisilicon,hi3798cv200-poplar", "hisilicon,hi3798cv200";
 Hi4511 Board
 Required root node properties:
 	- compatible = "hisilicon,hi3620-hi4511";
--- a/Show More
+++ b/Show More