This is the 4.14.63 stable release

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEEZH8oZUiU471FcZm+ONu9yGCSaT4FAlt0UQ4ACgkQONu9yGCS
 aT48UBAAgRuM+14lp73NW+p9rX8VAupzk1k9QLP7/naWg6mIppVT2/NOEqMfLPYP
 xsLrkVdvMMlrTmf/LpNE4BEaE0YBD/CT/yK1NTrfLjhVoNIKhK9MEoAVyS7c3YlW
 OW+/jTuTyQ1BD3RkRXFquXVCnj4aDN+nWq4j7PIsvluNYiyqk7S8RzgUkunrRZTP
 ZU0EAi/cGcOYQ/iyPyKJLGyGgK8XmcjloSG75512D8SAoDymJIhvreqxFtcx9EVx
 Awe88RoeZos8/xuorlo+/vjKShnmesgPV2S+Nb1ih2Y3XCktXjASDBvaRX8kMsVh
 3F08OKir8ikUZx6lGnJluIDVNVXjXI6f823h8T7D/8r42r9k6EFHTWPU89dFFMda
 FUyLMf6Oy4TbUSxxKgNxLomaRASaXLP/kSPamAm4KLFvSOmFk6UgOAa1GsB1Q3ad
 6yE9KkDcyGxLu9p6NU6+5HgP42dbP5So3gOcLOgfRqR3+dallUqiZlmhbQkwZ1Wm
 kwDKyDAgbKCGnL7WEoZuSS542h1V9UXYMLWzw04zFaMIjF76H8ee9zBv8Y3ac/ms
 sM2hW6g7mfww7nvc1FPRQRDvkgfS4EE0DUzq72U0WKcK6v7ZRHUsmBQopxNVS7A4
 74HYcin2b5VQw3O3aKy0h3sqKnkE/WzA+XkFo4ExNwfiCzYZeQI=
 =LXnx
 -----END PGP SIGNATURE-----

Merge 4.14.63 into android-4.14-p

Changes in 4.14.63
	parisc: Enable CONFIG_MLONGCALLS by default
	parisc: Define mb() and add memory barriers to assembler unlock sequences
	scsi: hpsa: fix selection of reply queue
	scsi: core: introduce force_blk_mq
	scsi: virtio_scsi: fix IO hang caused by automatic irq vector affinity
	kasan: add no_sanitize attribute for clang builds
	Mark HI and TASKLET softirq synchronous
	stop_machine: Disable preemption after queueing stopper threads
	xen/netfront: don't cache skb_shinfo()
	scsi: sr: Avoid that opening a CD-ROM hangs with runtime power management enabled
	scsi: qla2xxx: Fix memory leak for allocating abort IOCB
	init: rename and re-order boot_cpu_state_init()
	root dentries need RCU-delayed freeing
	make sure that __dentry_kill() always invalidates d_seq, unhashed or not
	fix mntput/mntput race
	fix __legitimize_mnt()/mntput() race
	mtd: nand: qcom: Add a NULL check for devm_kasprintf()
	phy: phy-mtk-tphy: use auto instead of force to bypass utmi signals
	Bluetooth: hci_ldisc: Allow sleeping while proto locks are held.
	Bluetooth: hci_serdev: Init hci_uart proto_lock to avoid oops
	ARM: dts: imx6sx: fix irq for pcie bridge
	x86/paravirt: Fix spectre-v2 mitigations for paravirt guests
	x86/speculation: Protect against userspace-userspace spectreRSB
	kprobes/x86: Fix %p uses in error messages
	x86/irqflags: Provide a declaration for native_save_fl
	x86/speculation/l1tf: Increase 32bit PAE __PHYSICAL_PAGE_SHIFT
	x86/speculation/l1tf: Change order of offset/type in swap entry
	x86/speculation/l1tf: Protect swap entries against L1TF
	x86/speculation/l1tf: Protect PROT_NONE PTEs against speculation
	x86/speculation/l1tf: Make sure the first page is always reserved
	x86/speculation/l1tf: Add sysfs reporting for l1tf
	x86/speculation/l1tf: Disallow non privileged high MMIO PROT_NONE mappings
	x86/speculation/l1tf: Limit swap file size to MAX_PA/2
	x86/bugs: Move the l1tf function and define pr_fmt properly
	sched/smt: Update sched_smt_present at runtime
	x86/smp: Provide topology_is_primary_thread()
	x86/topology: Provide topology_smt_supported()
	cpu/hotplug: Make bringup/teardown of smp threads symmetric
	cpu/hotplug: Split do_cpu_down()
	cpu/hotplug: Provide knobs to control SMT
	x86/cpu: Remove the pointless CPU printout
	x86/cpu/AMD: Remove the pointless detect_ht() call
	x86/cpu/common: Provide detect_ht_early()
	x86/cpu/topology: Provide detect_extended_topology_early()
	x86/cpu/intel: Evaluate smp_num_siblings early
	x86/CPU/AMD: Do not check CPUID max ext level before parsing SMP info
	x86/cpu/AMD: Evaluate smp_num_siblings early
	x86/apic: Ignore secondary threads if nosmt=force
	x86/speculation/l1tf: Extend 64bit swap file size limit
	x86/cpufeatures: Add detection of L1D cache flush support.
	x86/CPU/AMD: Move TOPOEXT reenablement before reading smp_num_siblings
	x86/speculation/l1tf: Protect PAE swap entries against L1TF
	x86/speculation/l1tf: Fix up pte->pfn conversion for PAE
	Revert "x86/apic: Ignore secondary threads if nosmt=force"
	cpu/hotplug: Boot HT siblings at least once
	x86/KVM: Warn user if KVM is loaded SMT and L1TF CPU bug being present
	x86/KVM/VMX: Add module argument for L1TF mitigation
	x86/KVM/VMX: Add L1D flush algorithm
	x86/KVM/VMX: Add L1D MSR based flush
	x86/KVM/VMX: Add L1D flush logic
	x86/KVM/VMX: Split the VMX MSR LOAD structures to have an host/guest numbers
	x86/KVM/VMX: Add find_msr() helper function
	x86/KVM/VMX: Separate the VMX AUTOLOAD guest/host number accounting
	x86/KVM/VMX: Extend add_atomic_switch_msr() to allow VMENTER only MSRs
	x86/KVM/VMX: Use MSR save list for IA32_FLUSH_CMD if required
	cpu/hotplug: Online siblings when SMT control is turned on
	x86/litf: Introduce vmx status variable
	x86/kvm: Drop L1TF MSR list approach
	x86/l1tf: Handle EPT disabled state proper
	x86/kvm: Move l1tf setup function
	x86/kvm: Add static key for flush always
	x86/kvm: Serialize L1D flush parameter setter
	x86/kvm: Allow runtime control of L1D flush
	cpu/hotplug: Expose SMT control init function
	cpu/hotplug: Set CPU_SMT_NOT_SUPPORTED early
	x86/bugs, kvm: Introduce boot-time control of L1TF mitigations
	Documentation: Add section about CPU vulnerabilities
	x86/KVM/VMX: Initialize the vmx_l1d_flush_pages' content
	Documentation/l1tf: Fix typos
	cpu/hotplug: detect SMT disabled by BIOS
	x86/KVM/VMX: Don't set l1tf_flush_l1d to true from vmx_l1d_flush()
	x86/KVM/VMX: Replace 'vmx_l1d_flush_always' with 'vmx_l1d_flush_cond'
	x86/KVM/VMX: Move the l1tf_flush_l1d test to vmx_l1d_flush()
	x86/irq: Demote irq_cpustat_t::__softirq_pending to u16
	x86/KVM/VMX: Introduce per-host-cpu analogue of l1tf_flush_l1d
	x86: Don't include linux/irq.h from asm/hardirq.h
	x86/irq: Let interrupt handlers set kvm_cpu_l1tf_flush_l1d
	x86/KVM/VMX: Don't set l1tf_flush_l1d from vmx_handle_external_intr()
	Documentation/l1tf: Remove Yonah processors from not vulnerable list
	KVM: x86: Add a framework for supporting MSR-based features
	KVM: SVM: Add MSR-based feature support for serializing LFENCE
	KVM: X86: Introduce kvm_get_msr_feature()
	KVM: X86: Allow userspace to define the microcode version
	KVM: VMX: support MSR_IA32_ARCH_CAPABILITIES as a feature MSR
	x86/speculation: Simplify sysfs report of VMX L1TF vulnerability
	x86/speculation: Use ARCH_CAPABILITIES to skip L1D flush on vmentry
	KVM: VMX: Tell the nested hypervisor to skip L1D flush on vmentry
	cpu/hotplug: Fix SMT supported evaluation
	x86/speculation/l1tf: Invert all not present mappings
	x86/speculation/l1tf: Make pmd/pud_mknotpresent() invert
	x86/mm/pat: Make set_memory_np() L1TF safe
	x86/mm/kmmio: Make the tracer robust against L1TF
	tools headers: Synchronise x86 cpufeatures.h for L1TF additions
	x86/microcode: Allow late microcode loading with SMT disabled
	x86/smp: fix non-SMP broken build due to redefinition of apic_id_is_primary_thread
	cpu/hotplug: Non-SMP machines do not make use of booted_once
	x86/init: fix build with CONFIG_SWAP=n
	x86/speculation/l1tf: Unbreak !__HAVE_ARCH_PFN_MODIFY_ALLOWED architectures
	x86/CPU/AMD: Have smp_num_siblings and cpu_llc_id always be present
	Linux 4.14.63

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
tirimbino
Greg Kroah-Hartman 7 years ago
commit 8b11380a08
  1. 24
      Documentation/ABI/testing/sysfs-devices-system-cpu
  2. 9
      Documentation/admin-guide/index.rst
  3. 78
      Documentation/admin-guide/kernel-parameters.txt
  4. 610
      Documentation/admin-guide/l1tf.rst
  5. 40
      Documentation/virtual/kvm/api.txt
  6. 2
      Makefile
  7. 3
      arch/Kconfig
  8. 2
      arch/arm/boot/dts/imx6sx.dtsi
  9. 2
      arch/parisc/Kconfig
  10. 32
      arch/parisc/include/asm/barrier.h
  11. 2
      arch/parisc/kernel/entry.S
  12. 1
      arch/parisc/kernel/pacache.S
  13. 4
      arch/parisc/kernel/syscall.S
  14. 1
      arch/x86/Kconfig
  15. 10
      arch/x86/include/asm/apic.h
  16. 3
      arch/x86/include/asm/cpufeatures.h
  17. 2
      arch/x86/include/asm/dmi.h
  18. 26
      arch/x86/include/asm/hardirq.h
  19. 2
      arch/x86/include/asm/irqflags.h
  20. 9
      arch/x86/include/asm/kvm_host.h
  21. 7
      arch/x86/include/asm/msr-index.h
  22. 9
      arch/x86/include/asm/page_32_types.h
  23. 17
      arch/x86/include/asm/pgtable-2level.h
  24. 37
      arch/x86/include/asm/pgtable-3level.h
  25. 32
      arch/x86/include/asm/pgtable-invert.h
  26. 74
      arch/x86/include/asm/pgtable.h
  27. 38
      arch/x86/include/asm/pgtable_64.h
  28. 17
      arch/x86/include/asm/processor.h
  29. 1
      arch/x86/include/asm/smp.h
  30. 6
      arch/x86/include/asm/topology.h
  31. 11
      arch/x86/include/asm/vmx.h
  32. 19
      arch/x86/kernel/apic/apic.c
  33. 2
      arch/x86/kernel/apic/htirq.c
  34. 1
      arch/x86/kernel/apic/io_apic.c
  35. 1
      arch/x86/kernel/apic/msi.c
  36. 1
      arch/x86/kernel/apic/vector.c
  37. 59
      arch/x86/kernel/cpu/amd.c
  38. 171
      arch/x86/kernel/cpu/bugs.c
  39. 63
      arch/x86/kernel/cpu/common.c
  40. 2
      arch/x86/kernel/cpu/cpu.h
  41. 7
      arch/x86/kernel/cpu/intel.c
  42. 16
      arch/x86/kernel/cpu/microcode/core.c
  43. 41
      arch/x86/kernel/cpu/topology.c
  44. 1
      arch/x86/kernel/fpu/core.c
  45. 1
      arch/x86/kernel/ftrace.c
  46. 1
      arch/x86/kernel/hpet.c
  47. 1
      arch/x86/kernel/i8259.c
  48. 1
      arch/x86/kernel/idt.c
  49. 1
      arch/x86/kernel/irq.c
  50. 1
      arch/x86/kernel/irq_32.c
  51. 1
      arch/x86/kernel/irq_64.c
  52. 1
      arch/x86/kernel/irqinit.c
  53. 6
      arch/x86/kernel/kprobes/core.c
  54. 14
      arch/x86/kernel/paravirt.c
  55. 6
      arch/x86/kernel/setup.c
  56. 1
      arch/x86/kernel/smp.c
  57. 25
      arch/x86/kernel/smpboot.c
  58. 1
      arch/x86/kernel/time.c
  59. 1
      arch/x86/kvm/mmu.c
  60. 44
      arch/x86/kvm/svm.c
  61. 426
      arch/x86/kvm/vmx.c
  62. 133
      arch/x86/kvm/x86.c
  63. 1
      arch/x86/mm/fault.c
  64. 25
      arch/x86/mm/init.c
  65. 25
      arch/x86/mm/kmmio.c
  66. 21
      arch/x86/mm/mmap.c
  67. 8
      arch/x86/mm/pageattr.c
  68. 1
      arch/x86/mm/pti.c
  69. 1
      arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c
  70. 1
      arch/x86/platform/uv/tlb_uv.c
  71. 1
      arch/x86/xen/enlighten.c
  72. 8
      drivers/base/cpu.c
  73. 38
      drivers/bluetooth/hci_ldisc.c
  74. 1
      drivers/bluetooth/hci_serdev.c
  75. 2
      drivers/bluetooth/hci_uart.h
  76. 1
      drivers/gpu/drm/i915/intel_lpe_audio.c
  77. 3
      drivers/mtd/nand/qcom_nandc.c
  78. 8
      drivers/net/xen-netfront.c
  79. 2
      drivers/pci/host/pci-hyperv.c
  80. 19
      drivers/phy/mediatek/phy-mtk-tphy.c
  81. 1
      drivers/scsi/hosts.c
  82. 73
      drivers/scsi/hpsa.c
  83. 1
      drivers/scsi/hpsa.h
  84. 53
      drivers/scsi/qla2xxx/qla_iocb.c
  85. 29
      drivers/scsi/sr.c
  86. 59
      drivers/scsi/virtio_scsi.c
  87. 13
      fs/dcache.c
  88. 28
      fs/namespace.c
  89. 12
      include/asm-generic/pgtable.h
  90. 3
      include/linux/compiler-clang.h
  91. 23
      include/linux/cpu.h
  92. 2
      include/linux/swapfile.h
  93. 3
      include/scsi/scsi_host.h
  94. 2
      include/uapi/linux/kvm.h
  95. 2
      init/main.c
  96. 284
      kernel/cpu.c
  97. 30
      kernel/sched/core.c
  98. 1
      kernel/sched/fair.c
  99. 2
      kernel/smp.c
  100. 12
      kernel/softirq.c
  101. Some files were not shown because too many files have changed in this diff Show More

@ -379,6 +379,7 @@ What: /sys/devices/system/cpu/vulnerabilities
/sys/devices/system/cpu/vulnerabilities/spectre_v1
/sys/devices/system/cpu/vulnerabilities/spectre_v2
/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
/sys/devices/system/cpu/vulnerabilities/l1tf
Date: January 2018
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
Description: Information about CPU vulnerabilities
@ -390,3 +391,26 @@ Description: Information about CPU vulnerabilities
"Not affected" CPU is not affected by the vulnerability
"Vulnerable" CPU is affected and no mitigation in effect
"Mitigation: $M" CPU is affected and mitigation $M is in effect
Details about the l1tf file can be found in
Documentation/admin-guide/l1tf.rst
What: /sys/devices/system/cpu/smt
/sys/devices/system/cpu/smt/active
/sys/devices/system/cpu/smt/control
Date: June 2018
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
Description: Control Symetric Multi Threading (SMT)
active: Tells whether SMT is active (enabled and siblings online)
control: Read/write interface to control SMT. Possible
values:
"on" SMT is enabled
"off" SMT is disabled
"forceoff" SMT is force disabled. Cannot be changed.
"notsupported" SMT is not supported by the CPU
If control status is "forceoff" or "notsupported" writes
are rejected.

@ -17,6 +17,15 @@ etc.
kernel-parameters
devices
This section describes CPU vulnerabilities and provides an overview of the
possible mitigations along with guidance for selecting mitigations if they
are configurable at compile, boot or run time.
.. toctree::
:maxdepth: 1
l1tf
Here is a set of documents aimed at users who are trying to track down
problems and bugs in particular.

@ -1891,10 +1891,84 @@
(virtualized real and unpaged mode) on capable
Intel chips. Default is 1 (enabled)
kvm-intel.vmentry_l1d_flush=[KVM,Intel] Mitigation for L1 Terminal Fault
CVE-2018-3620.
Valid arguments: never, cond, always
always: L1D cache flush on every VMENTER.
cond: Flush L1D on VMENTER only when the code between
VMEXIT and VMENTER can leak host memory.
never: Disables the mitigation
Default is cond (do L1 cache flush in specific instances)
kvm-intel.vpid= [KVM,Intel] Disable Virtual Processor Identification
feature (tagged TLBs) on capable Intel chips.
Default is 1 (enabled)
l1tf= [X86] Control mitigation of the L1TF vulnerability on
affected CPUs
The kernel PTE inversion protection is unconditionally
enabled and cannot be disabled.
full
Provides all available mitigations for the
L1TF vulnerability. Disables SMT and
enables all mitigations in the
hypervisors, i.e. unconditional L1D flush.
SMT control and L1D flush control via the
sysfs interface is still possible after
boot. Hypervisors will issue a warning
when the first VM is started in a
potentially insecure configuration,
i.e. SMT enabled or L1D flush disabled.
full,force
Same as 'full', but disables SMT and L1D
flush runtime control. Implies the
'nosmt=force' command line option.
(i.e. sysfs control of SMT is disabled.)
flush
Leaves SMT enabled and enables the default
hypervisor mitigation, i.e. conditional
L1D flush.
SMT control and L1D flush control via the
sysfs interface is still possible after
boot. Hypervisors will issue a warning
when the first VM is started in a
potentially insecure configuration,
i.e. SMT enabled or L1D flush disabled.
flush,nosmt
Disables SMT and enables the default
hypervisor mitigation.
SMT control and L1D flush control via the
sysfs interface is still possible after
boot. Hypervisors will issue a warning
when the first VM is started in a
potentially insecure configuration,
i.e. SMT enabled or L1D flush disabled.
flush,nowarn
Same as 'flush', but hypervisors will not
warn when a VM is started in a potentially
insecure configuration.
off
Disables hypervisor mitigations and doesn't
emit any warnings.
Default is 'flush'.
For details see: Documentation/admin-guide/l1tf.rst
l2cr= [PPC]
l3cr= [PPC]
@ -2598,6 +2672,10 @@
nosmt [KNL,S390] Disable symmetric multithreading (SMT).
Equivalent to smt=1.
[KNL,x86] Disable symmetric multithreading (SMT).
nosmt=force: Force disable SMT, cannot be undone
via the sysfs control file.
nospectre_v2 [X86] Disable all mitigations for the Spectre variant 2
(indirect branch prediction) vulnerability. System may
allow data leaks with this option, which is equivalent

@ -0,0 +1,610 @@
L1TF - L1 Terminal Fault
========================
L1 Terminal Fault is a hardware vulnerability which allows unprivileged
speculative access to data which is available in the Level 1 Data Cache
when the page table entry controlling the virtual address, which is used
for the access, has the Present bit cleared or other reserved bits set.
Affected processors
-------------------
This vulnerability affects a wide range of Intel processors. The
vulnerability is not present on:
- Processors from AMD, Centaur and other non Intel vendors
- Older processor models, where the CPU family is < 6
- A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft,
Penwell, Pineview, Silvermont, Airmont, Merrifield)
- The Intel XEON PHI family
- Intel processors which have the ARCH_CAP_RDCL_NO bit set in the
IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected
by the Meltdown vulnerability either. These CPUs should become
available by end of 2018.
Whether a processor is affected or not can be read out from the L1TF
vulnerability file in sysfs. See :ref:`l1tf_sys_info`.
Related CVEs
------------
The following CVE entries are related to the L1TF vulnerability:
============= ================= ==============================
CVE-2018-3615 L1 Terminal Fault SGX related aspects
CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects
CVE-2018-3646 L1 Terminal Fault Virtualization related aspects
============= ================= ==============================
Problem
-------
If an instruction accesses a virtual address for which the relevant page
table entry (PTE) has the Present bit cleared or other reserved bits set,
then speculative execution ignores the invalid PTE and loads the referenced
data if it is present in the Level 1 Data Cache, as if the page referenced
by the address bits in the PTE was still present and accessible.
While this is a purely speculative mechanism and the instruction will raise
a page fault when it is retired eventually, the pure act of loading the
data and making it available to other speculative instructions opens up the
opportunity for side channel attacks to unprivileged malicious code,
similar to the Meltdown attack.
While Meltdown breaks the user space to kernel space protection, L1TF
allows to attack any physical memory address in the system and the attack
works across all protection domains. It allows an attack of SGX and also
works from inside virtual machines because the speculation bypasses the
extended page table (EPT) protection mechanism.
Attack scenarios
----------------
1. Malicious user space
^^^^^^^^^^^^^^^^^^^^^^^
Operating Systems store arbitrary information in the address bits of a
PTE which is marked non present. This allows a malicious user space
application to attack the physical memory to which these PTEs resolve.
In some cases user-space can maliciously influence the information
encoded in the address bits of the PTE, thus making attacks more
deterministic and more practical.
The Linux kernel contains a mitigation for this attack vector, PTE
inversion, which is permanently enabled and has no performance
impact. The kernel ensures that the address bits of PTEs, which are not
marked present, never point to cacheable physical memory space.
A system with an up to date kernel is protected against attacks from
malicious user space applications.
2. Malicious guest in a virtual machine
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The fact that L1TF breaks all domain protections allows malicious guest
OSes, which can control the PTEs directly, and malicious guest user
space applications, which run on an unprotected guest kernel lacking the
PTE inversion mitigation for L1TF, to attack physical host memory.
A special aspect of L1TF in the context of virtualization is symmetric
multi threading (SMT). The Intel implementation of SMT is called
HyperThreading. The fact that Hyperthreads on the affected processors
share the L1 Data Cache (L1D) is important for this. As the flaw allows
only to attack data which is present in L1D, a malicious guest running
on one Hyperthread can attack the data which is brought into the L1D by
the context which runs on the sibling Hyperthread of the same physical
core. This context can be host OS, host user space or a different guest.
If the processor does not support Extended Page Tables, the attack is
only possible, when the hypervisor does not sanitize the content of the
effective (shadow) page tables.
While solutions exist to mitigate these attack vectors fully, these
mitigations are not enabled by default in the Linux kernel because they
can affect performance significantly. The kernel provides several
mechanisms which can be utilized to address the problem depending on the
deployment scenario. The mitigations, their protection scope and impact
are described in the next sections.
The default mitigations and the rationale for choosing them are explained
at the end of this document. See :ref:`default_mitigations`.
.. _l1tf_sys_info:
L1TF system information
-----------------------
The Linux kernel provides a sysfs interface to enumerate the current L1TF
status of the system: whether the system is vulnerable, and which
mitigations are active. The relevant sysfs file is:
/sys/devices/system/cpu/vulnerabilities/l1tf
The possible values in this file are:
=========================== ===============================
'Not affected' The processor is not vulnerable
'Mitigation: PTE Inversion' The host protection is active
=========================== ===============================
If KVM/VMX is enabled and the processor is vulnerable then the following
information is appended to the 'Mitigation: PTE Inversion' part:
- SMT status:
===================== ================
'VMX: SMT vulnerable' SMT is enabled
'VMX: SMT disabled' SMT is disabled
===================== ================
- L1D Flush mode:
================================ ====================================
'L1D vulnerable' L1D flushing is disabled
'L1D conditional cache flushes' L1D flush is conditionally enabled
'L1D cache flushes' L1D flush is unconditionally enabled
================================ ====================================
The resulting grade of protection is discussed in the following sections.
Host mitigation mechanism
-------------------------
The kernel is unconditionally protected against L1TF attacks from malicious
user space running on the host.
Guest mitigation mechanisms
---------------------------
.. _l1d_flush:
1. L1D flush on VMENTER
^^^^^^^^^^^^^^^^^^^^^^^
To make sure that a guest cannot attack data which is present in the L1D
the hypervisor flushes the L1D before entering the guest.
Flushing the L1D evicts not only the data which should not be accessed
by a potentially malicious guest, it also flushes the guest
data. Flushing the L1D has a performance impact as the processor has to
bring the flushed guest data back into the L1D. Depending on the
frequency of VMEXIT/VMENTER and the type of computations in the guest
performance degradation in the range of 1% to 50% has been observed. For
scenarios where guest VMEXIT/VMENTER are rare the performance impact is
minimal. Virtio and mechanisms like posted interrupts are designed to
confine the VMEXITs to a bare minimum, but specific configurations and
application scenarios might still suffer from a high VMEXIT rate.
The kernel provides two L1D flush modes:
- conditional ('cond')
- unconditional ('always')
The conditional mode avoids L1D flushing after VMEXITs which execute
only audited code paths before the corresponding VMENTER. These code
paths have been verified that they cannot expose secrets or other
interesting data to an attacker, but they can leak information about the
address space layout of the hypervisor.
Unconditional mode flushes L1D on all VMENTER invocations and provides
maximum protection. It has a higher overhead than the conditional
mode. The overhead cannot be quantified correctly as it depends on the
workload scenario and the resulting number of VMEXITs.
The general recommendation is to enable L1D flush on VMENTER. The kernel
defaults to conditional mode on affected processors.
**Note**, that L1D flush does not prevent the SMT problem because the
sibling thread will also bring back its data into the L1D which makes it
attackable again.
L1D flush can be controlled by the administrator via the kernel command
line and sysfs control files. See :ref:`mitigation_control_command_line`
and :ref:`mitigation_control_kvm`.
.. _guest_confinement:
2. Guest VCPU confinement to dedicated physical cores
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
To address the SMT problem, it is possible to make a guest or a group of
guests affine to one or more physical cores. The proper mechanism for
that is to utilize exclusive cpusets to ensure that no other guest or
host tasks can run on these cores.
If only a single guest or related guests run on sibling SMT threads on
the same physical core then they can only attack their own memory and
restricted parts of the host memory.
Host memory is attackable, when one of the sibling SMT threads runs in
host OS (hypervisor) context and the other in guest context. The amount
of valuable information from the host OS context depends on the context
which the host OS executes, i.e. interrupts, soft interrupts and kernel
threads. The amount of valuable data from these contexts cannot be
declared as non-interesting for an attacker without deep inspection of
the code.
**Note**, that assigning guests to a fixed set of physical cores affects
the ability of the scheduler to do load balancing and might have
negative effects on CPU utilization depending on the hosting
scenario. Disabling SMT might be a viable alternative for particular
scenarios.
For further information about confining guests to a single or to a group
of cores consult the cpusets documentation:
https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
.. _interrupt_isolation:
3. Interrupt affinity
^^^^^^^^^^^^^^^^^^^^^
Interrupts can be made affine to logical CPUs. This is not universally
true because there are types of interrupts which are truly per CPU
interrupts, e.g. the local timer interrupt. Aside of that multi queue
devices affine their interrupts to single CPUs or groups of CPUs per
queue without allowing the administrator to control the affinities.
Moving the interrupts, which can be affinity controlled, away from CPUs
which run untrusted guests, reduces the attack vector space.
Whether the interrupts with are affine to CPUs, which run untrusted
guests, provide interesting data for an attacker depends on the system
configuration and the scenarios which run on the system. While for some
of the interrupts it can be assumed that they won't expose interesting
information beyond exposing hints about the host OS memory layout, there
is no way to make general assumptions.
Interrupt affinity can be controlled by the administrator via the
/proc/irq/$NR/smp_affinity[_list] files. Limited documentation is
available at:
https://www.kernel.org/doc/Documentation/IRQ-affinity.txt
.. _smt_control:
4. SMT control
^^^^^^^^^^^^^^
To prevent the SMT issues of L1TF it might be necessary to disable SMT
completely. Disabling SMT can have a significant performance impact, but
the impact depends on the hosting scenario and the type of workloads.
The impact of disabling SMT needs also to be weighted against the impact
of other mitigation solutions like confining guests to dedicated cores.
The kernel provides a sysfs interface to retrieve the status of SMT and
to control it. It also provides a kernel command line interface to
control SMT.
The kernel command line interface consists of the following options:
=========== ==========================================================
nosmt Affects the bring up of the secondary CPUs during boot. The
kernel tries to bring all present CPUs online during the
boot process. "nosmt" makes sure that from each physical
core only one - the so called primary (hyper) thread is
activated. Due to a design flaw of Intel processors related
to Machine Check Exceptions the non primary siblings have
to be brought up at least partially and are then shut down
again. "nosmt" can be undone via the sysfs interface.
nosmt=force Has the same effect as "nosmt" but it does not allow to
undo the SMT disable via the sysfs interface.
=========== ==========================================================
The sysfs interface provides two files:
- /sys/devices/system/cpu/smt/control
- /sys/devices/system/cpu/smt/active
/sys/devices/system/cpu/smt/control:
This file allows to read out the SMT control state and provides the
ability to disable or (re)enable SMT. The possible states are:
============== ===================================================
on SMT is supported by the CPU and enabled. All
logical CPUs can be onlined and offlined without
restrictions.
off SMT is supported by the CPU and disabled. Only
the so called primary SMT threads can be onlined
and offlined without restrictions. An attempt to
online a non-primary sibling is rejected
forceoff Same as 'off' but the state cannot be controlled.
Attempts to write to the control file are rejected.
notsupported The processor does not support SMT. It's therefore
not affected by the SMT implications of L1TF.
Attempts to write to the control file are rejected.
============== ===================================================
The possible states which can be written into this file to control SMT
state are:
- on
- off
- forceoff
/sys/devices/system/cpu/smt/active:
This file reports whether SMT is enabled and active, i.e. if on any
physical core two or more sibling threads are online.
SMT control is also possible at boot time via the l1tf kernel command
line parameter in combination with L1D flush control. See
:ref:`mitigation_control_command_line`.
5. Disabling EPT
^^^^^^^^^^^^^^^^
Disabling EPT for virtual machines provides full mitigation for L1TF even
with SMT enabled, because the effective page tables for guests are
managed and sanitized by the hypervisor. Though disabling EPT has a
significant performance impact especially when the Meltdown mitigation
KPTI is enabled.
EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
There is ongoing research and development for new mitigation mechanisms to
address the performance impact of disabling SMT or EPT.
.. _mitigation_control_command_line:
Mitigation control on the kernel command line
---------------------------------------------
The kernel command line allows to control the L1TF mitigations at boot
time with the option "l1tf=". The valid arguments for this option are:
============ =============================================================
full Provides all available mitigations for the L1TF
vulnerability. Disables SMT and enables all mitigations in
the hypervisors, i.e. unconditional L1D flushing
SMT control and L1D flush control via the sysfs interface
is still possible after boot. Hypervisors will issue a
warning when the first VM is started in a potentially
insecure configuration, i.e. SMT enabled or L1D flush
disabled.
full,force Same as 'full', but disables SMT and L1D flush runtime
control. Implies the 'nosmt=force' command line option.
(i.e. sysfs control of SMT is disabled.)
flush Leaves SMT enabled and enables the default hypervisor
mitigation, i.e. conditional L1D flushing
SMT control and L1D flush control via the sysfs interface
is still possible after boot. Hypervisors will issue a
warning when the first VM is started in a potentially
insecure configuration, i.e. SMT enabled or L1D flush
disabled.
flush,nosmt Disables SMT and enables the default hypervisor mitigation,
i.e. conditional L1D flushing.
SMT control and L1D flush control via the sysfs interface
is still possible after boot. Hypervisors will issue a
warning when the first VM is started in a potentially
insecure configuration, i.e. SMT enabled or L1D flush
disabled.
flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is
started in a potentially insecure configuration.
off Disables hypervisor mitigations and doesn't emit any
warnings.
============ =============================================================
The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
.. _mitigation_control_kvm:
Mitigation control for KVM - module parameter
-------------------------------------------------------------
The KVM hypervisor mitigation mechanism, flushing the L1D cache when
entering a guest, can be controlled with a module parameter.
The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the
following arguments:
============ ==============================================================
always L1D cache flush on every VMENTER.
cond Flush L1D on VMENTER only when the code between VMEXIT and
VMENTER can leak host memory which is considered
interesting for an attacker. This still can leak host memory
which allows e.g. to determine the hosts address space layout.
never Disables the mitigation
============ ==============================================================
The parameter can be provided on the kernel command line, as a module
parameter when loading the modules and at runtime modified via the sysfs
file:
/sys/module/kvm_intel/parameters/vmentry_l1d_flush
The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
module parameter is ignored and writes to the sysfs file are rejected.
Mitigation selection guide
--------------------------
1. No virtualization in use
^^^^^^^^^^^^^^^^^^^^^^^^^^^
The system is protected by the kernel unconditionally and no further
action is required.
2. Virtualization with trusted guests
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If the guest comes from a trusted source and the guest OS kernel is
guaranteed to have the L1TF mitigations in place the system is fully
protected against L1TF and no further action is required.
To avoid the overhead of the default L1D flushing on VMENTER the
administrator can disable the flushing via the kernel command line and
sysfs control files. See :ref:`mitigation_control_command_line` and
:ref:`mitigation_control_kvm`.
3. Virtualization with untrusted guests
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3.1. SMT not supported or disabled
""""""""""""""""""""""""""""""""""
If SMT is not supported by the processor or disabled in the BIOS or by
the kernel, it's only required to enforce L1D flushing on VMENTER.
Conditional L1D flushing is the default behaviour and can be tuned. See
:ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
3.2. EPT not supported or disabled
""""""""""""""""""""""""""""""""""
If EPT is not supported by the processor or disabled in the hypervisor,
the system is fully protected. SMT can stay enabled and L1D flushing on
VMENTER is not required.
EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
3.3. SMT and EPT supported and active
"""""""""""""""""""""""""""""""""""""
If SMT and EPT are supported and active then various degrees of
mitigations can be employed:
- L1D flushing on VMENTER:
L1D flushing on VMENTER is the minimal protection requirement, but it
is only potent in combination with other mitigation methods.
Conditional L1D flushing is the default behaviour and can be tuned. See
:ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
- Guest confinement:
Confinement of guests to a single or a group of physical cores which
are not running any other processes, can reduce the attack surface
significantly, but interrupts, soft interrupts and kernel threads can
still expose valuable data to a potential attacker. See
:ref:`guest_confinement`.
- Interrupt isolation:
Isolating the guest CPUs from interrupts can reduce the attack surface
further, but still allows a malicious guest to explore a limited amount
of host physical memory. This can at least be used to gain knowledge
about the host address space layout. The interrupts which have a fixed
affinity to the CPUs which run the untrusted guests can depending on
the scenario still trigger soft interrupts and schedule kernel threads
which might expose valuable information. See
:ref:`interrupt_isolation`.
The above three mitigation methods combined can provide protection to a
certain degree, but the risk of the remaining attack surface has to be
carefully analyzed. For full protection the following methods are
available:
- Disabling SMT:
Disabling SMT and enforcing the L1D flushing provides the maximum
amount of protection. This mitigation is not depending on any of the
above mitigation methods.
SMT control and L1D flushing can be tuned by the command line
parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run
time with the matching sysfs control files. See :ref:`smt_control`,
:ref:`mitigation_control_command_line` and
:ref:`mitigation_control_kvm`.
- Disabling EPT:
Disabling EPT provides the maximum amount of protection as well. It is
not depending on any of the above mitigation methods. SMT can stay
enabled and L1D flushing is not required, but the performance impact is
significant.
EPT can be disabled in the hypervisor via the 'kvm-intel.ept'
parameter.
3.4. Nested virtual machines
""""""""""""""""""""""""""""
When nested virtualization is in use, three operating systems are involved:
the bare metal hypervisor, the nested hypervisor and the nested virtual
machine. VMENTER operations from the nested hypervisor into the nested
guest will always be processed by the bare metal hypervisor. If KVM is the
bare metal hypervisor it wiil:
- Flush the L1D cache on every switch from the nested hypervisor to the
nested virtual machine, so that the nested hypervisor's secrets are not
exposed to the nested virtual machine;
- Flush the L1D cache on every switch from the nested virtual machine to
the nested hypervisor; this is a complex operation, and flushing the L1D
cache avoids that the bare metal hypervisor's secrets are exposed to the
nested virtual machine;
- Instruct the nested hypervisor to not perform any L1D cache flush. This
is an optimization to avoid double L1D flushing.
.. _default_mitigations:
Default mitigations
-------------------
The kernel default mitigations for vulnerable processors are:
- PTE inversion to protect against malicious user space. This is done
unconditionally and cannot be controlled.
- L1D conditional flushing on VMENTER when EPT is enabled for
a guest.
The kernel does not by default enforce the disabling of SMT, which leaves
SMT systems vulnerable when running untrusted guests with EPT enabled.
The rationale for this choice is:
- Force disabling SMT can break existing setups, especially with
unattended updates.
- If regular users run untrusted guests on their machine, then L1TF is
just an add on to other malware which might be embedded in an untrusted
guest, e.g. spam-bots or attacks on the local network.
There is no technical way to prevent a user from running untrusted code
on their machines blindly.
- It's technically extremely unlikely and from today's knowledge even
impossible that L1TF can be exploited via the most popular attack
mechanisms like JavaScript because these mechanisms have no way to
control PTEs. If this would be possible and not other mitigation would
be possible, then the default might be different.
- The administrators of cloud and hosting setups have to carefully
analyze the risk for their scenarios and make the appropriate
mitigation choices, which might even vary across their deployed
machines and also result in other changes of their overall setup.
There is no way for the kernel to provide a sensible default for this
kind of scenarios.

@ -123,14 +123,15 @@ memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the
flag KVM_VM_MIPS_VZ.
4.3 KVM_GET_MSR_INDEX_LIST
4.3 KVM_GET_MSR_INDEX_LIST, KVM_GET_MSR_FEATURE_INDEX_LIST
Capability: basic
Capability: basic, KVM_CAP_GET_MSR_FEATURES for KVM_GET_MSR_FEATURE_INDEX_LIST
Architectures: x86
Type: system
Type: system ioctl
Parameters: struct kvm_msr_list (in/out)
Returns: 0 on success; -1 on error
Errors:
EFAULT: the msr index list cannot be read from or written to
E2BIG: the msr index list is to be to fit in the array specified by
the user.
@ -139,16 +140,23 @@ struct kvm_msr_list {
__u32 indices[0];
};
This ioctl returns the guest msrs that are supported. The list varies
by kvm version and host processor, but does not change otherwise. The
user fills in the size of the indices array in nmsrs, and in return
kvm adjusts nmsrs to reflect the actual number of msrs and fills in
the indices array with their numbers.
The user fills in the size of the indices array in nmsrs, and in return
kvm adjusts nmsrs to reflect the actual number of msrs and fills in the
indices array with their numbers.
KVM_GET_MSR_INDEX_LIST returns the guest msrs that are supported. The list
varies by kvm version and host processor, but does not change otherwise.
Note: if kvm indicates supports MCE (KVM_CAP_MCE), then the MCE bank MSRs are
not returned in the MSR list, as different vcpus can have a different number
of banks, as set via the KVM_X86_SETUP_MCE ioctl.
KVM_GET_MSR_FEATURE_INDEX_LIST returns the list of MSRs that can be passed
to the KVM_GET_MSRS system ioctl. This lets userspace probe host capabilities
and processor features that are exposed via MSRs (e.g., VMX capabilities).
This list also varies by kvm version and host processor, but does not change
otherwise.
4.4 KVM_CHECK_EXTENSION
@ -475,14 +483,22 @@ Support for this has been removed. Use KVM_SET_GUEST_DEBUG instead.
4.18 KVM_GET_MSRS
Capability: basic
Capability: basic (vcpu), KVM_CAP_GET_MSR_FEATURES (system)
Architectures: x86
Type: vcpu ioctl
Type: system ioctl, vcpu ioctl
Parameters: struct kvm_msrs (in/out)
Returns: 0 on success, -1 on error
Returns: number of msrs successfully returned;
-1 on error
When used as a system ioctl:
Reads the values of MSR-based features that are available for the VM. This
is similar to KVM_GET_SUPPORTED_CPUID, but it returns MSR indices and values.
The list of msr-based features can be obtained using KVM_GET_MSR_FEATURE_INDEX_LIST
in a system ioctl.
When used as a vcpu ioctl:
Reads model-specific registers from the vcpu. Supported msr indices can
be obtained using KVM_GET_MSR_INDEX_LIST.
be obtained using KVM_GET_MSR_INDEX_LIST in a system ioctl.
struct kvm_msrs {
__u32 nmsrs; /* number of msrs in entries */

@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
VERSION = 4
PATCHLEVEL = 14
SUBLEVEL = 62
SUBLEVEL = 63
EXTRAVERSION =
NAME = Petit Gorille

@ -13,6 +13,9 @@ config KEXEC_CORE
config HAVE_IMA_KEXEC
bool
config HOTPLUG_SMT
bool
config OPROFILE
tristate "OProfile system profiling"
depends on PROFILING

@ -1305,7 +1305,7 @@
0x82000000 0 0x08000000 0x08000000 0 0x00f00000>;
bus-range = <0x00 0xff>;
num-lanes = <1>;
interrupts = <GIC_SPI 123 IRQ_TYPE_LEVEL_HIGH>;
interrupts = <GIC_SPI 120 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clks IMX6SX_CLK_PCIE_REF_125M>,
<&clks IMX6SX_CLK_PCIE_AXI>,
<&clks IMX6SX_CLK_LVDS1_OUT>,

@ -201,7 +201,7 @@ config PREFETCH
config MLONGCALLS
bool "Enable the -mlong-calls compiler option for big kernels"
def_bool y if (!MODULES)
default y
depends on PA8X00
help
If you configure the kernel to include many drivers built-in instead

@ -0,0 +1,32 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_BARRIER_H
#define __ASM_BARRIER_H
#ifndef __ASSEMBLY__
/* The synchronize caches instruction executes as a nop on systems in
which all memory references are performed in order. */
#define synchronize_caches() __asm__ __volatile__ ("sync" : : : "memory")
#if defined(CONFIG_SMP)
#define mb() do { synchronize_caches(); } while (0)
#define rmb() mb()
#define wmb() mb()
#define dma_rmb() mb()
#define dma_wmb() mb()
#else
#define mb() barrier()
#define rmb() barrier()
#define wmb() barrier()
#define dma_rmb() barrier()
#define dma_wmb() barrier()
#endif
#define __smp_mb() mb()
#define __smp_rmb() mb()
#define __smp_wmb() mb()
#include <asm-generic/barrier.h>
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_BARRIER_H */

@ -481,6 +481,8 @@
/* Release pa_tlb_lock lock without reloading lock address. */
.macro tlb_unlock0 spc,tmp
#ifdef CONFIG_SMP
or,COND(=) %r0,\spc,%r0
sync
or,COND(=) %r0,\spc,%r0
stw \spc,0(\tmp)
#endif

@ -354,6 +354,7 @@ ENDPROC_CFI(flush_data_cache_local)
.macro tlb_unlock la,flags,tmp
#ifdef CONFIG_SMP
ldi 1,\tmp
sync
stw \tmp,0(\la)
mtsm \flags
#endif

@ -633,6 +633,7 @@ cas_action:
sub,<> %r28, %r25, %r0
2: stw,ma %r24, 0(%r26)
/* Free lock */
sync
stw,ma %r20, 0(%sr2,%r20)
#if ENABLE_LWS_DEBUG
/* Clear thread register indicator */
@ -647,6 +648,7 @@ cas_action:
3:
/* Error occurred on load or store */
/* Free lock */
sync
stw %r20, 0(%sr2,%r20)
#if ENABLE_LWS_DEBUG
stw %r0, 4(%sr2,%r20)
@ -848,6 +850,7 @@ cas2_action:
cas2_end:
/* Free lock */
sync
stw,ma %r20, 0(%sr2,%r20)
/* Enable interrupts */
ssm PSW_SM_I, %r0
@ -858,6 +861,7 @@ cas2_end:
22:
/* Error occurred on load or store */
/* Free lock */
sync
stw %r20, 0(%sr2,%r20)
ssm PSW_SM_I, %r0
ldo 1(%r0),%r28

@ -176,6 +176,7 @@ config X86
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_USER_RETURN_NOTIFIER
select HOTPLUG_SMT if SMP
select IRQ_FORCED_THREADING
select PCI_LOCKLESS_CONFIG
select PERF_EVENTS

@ -10,6 +10,7 @@
#include <asm/fixmap.h>
#include <asm/mpspec.h>
#include <asm/msr.h>
#include <asm/hardirq.h>
#define ARCH_APICTIMER_STOPS_ON_C3 1
@ -613,12 +614,20 @@ extern int default_check_phys_apicid_present(int phys_apicid);
#endif
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_SMP
bool apic_id_is_primary_thread(unsigned int id);
#else
static inline bool apic_id_is_primary_thread(unsigned int id) { return false; }
#endif
extern void irq_enter(void);
extern void irq_exit(void);
static inline void entering_irq(void)
{
irq_enter();
kvm_set_cpu_l1tf_flush_l1d();
}
static inline void entering_ack_irq(void)
@ -631,6 +640,7 @@ static inline void ipi_entering_ack_irq(void)
{
irq_enter();
ack_APIC_irq();
kvm_set_cpu_l1tf_flush_l1d();
}
static inline void exiting_irq(void)

@ -219,6 +219,7 @@
#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */
#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */
#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */
#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
@ -338,6 +339,7 @@
#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */
#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */
@ -370,5 +372,6 @@
#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */
#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */
#endif /* _ASM_X86_CPUFEATURES_H */

@ -4,8 +4,8 @@
#include <linux/compiler.h>
#include <linux/init.h>
#include <linux/io.h>
#include <asm/io.h>
#include <asm/setup.h>
static __always_inline __init void *dmi_alloc(unsigned len)

@ -3,10 +3,12 @@
#define _ASM_X86_HARDIRQ_H
#include <linux/threads.h>
#include <linux/irq.h>
typedef struct {
unsigned int __softirq_pending;
u16 __softirq_pending;
#if IS_ENABLED(CONFIG_KVM_INTEL)
u8 kvm_cpu_l1tf_flush_l1d;
#endif
unsigned int __nmi_count; /* arch dependent */
#ifdef CONFIG_X86_LOCAL_APIC
unsigned int apic_timer_irqs; /* arch dependent */
@ -62,4 +64,24 @@ extern u64 arch_irq_stat_cpu(unsigned int cpu);
extern u64 arch_irq_stat(void);
#define arch_irq_stat arch_irq_stat
#if IS_ENABLED(CONFIG_KVM_INTEL)
static inline void kvm_set_cpu_l1tf_flush_l1d(void)
{
__this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1);
}
static inline void kvm_clear_cpu_l1tf_flush_l1d(void)
{
__this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 0);
}
static inline bool kvm_get_cpu_l1tf_flush_l1d(void)
{
return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d);
}
#else /* !IS_ENABLED(CONFIG_KVM_INTEL) */
static inline void kvm_set_cpu_l1tf_flush_l1d(void) { }
#endif /* IS_ENABLED(CONFIG_KVM_INTEL) */
#endif /* _ASM_X86_HARDIRQ_H */

@ -13,6 +13,8 @@
* Interrupt control:
*/
/* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */
extern inline unsigned long native_save_fl(void);
extern inline unsigned long native_save_fl(void)
{
unsigned long flags;

@ -17,6 +17,7 @@
#include <linux/tracepoint.h>
#include <linux/cpumask.h>
#include <linux/irq_work.h>
#include <linux/irq.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@ -506,6 +507,7 @@ struct kvm_vcpu_arch {
u64 smbase;
bool tpr_access_reporting;
u64 ia32_xss;
u64 microcode_version;
/*
* Paging state of the vcpu
@ -693,6 +695,9 @@ struct kvm_vcpu_arch {
/* be preempted when it's in kernel-mode(cpl=0) */
bool preempted_in_kernel;
/* Flush the L1 Data cache for L1TF mitigation on VMENTER */
bool l1tf_flush_l1d;
};
struct kvm_lpage_info {
@ -862,6 +867,7 @@ struct kvm_vcpu_stat {
u64 signal_exits;
u64 irq_window_exits;
u64 nmi_window_exits;
u64 l1d_flush;
u64 halt_exits;
u64 halt_successful_poll;
u64 halt_attempted_poll;
@ -1061,6 +1067,8 @@ struct kvm_x86_ops {
void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
void (*setup_mce)(struct kvm_vcpu *vcpu);
int (*get_msr_feature)(struct kvm_msr_entry *entry);
};
struct kvm_arch_async_pf {
@ -1366,6 +1374,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
u64 kvm_get_arch_capabilities(void);
void kvm_define_shared_msr(unsigned index, u32 msr);
int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);

@ -70,12 +70,19 @@
#define MSR_IA32_ARCH_CAPABILITIES 0x0000010a
#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */
#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */
#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH (1 << 3) /* Skip L1D flush on vmentry */
#define ARCH_CAP_SSB_NO (1 << 4) /*
* Not susceptible to Speculative Store Bypass
* attack, so no Speculative Store Bypass
* control required.
*/
#define MSR_IA32_FLUSH_CMD 0x0000010b
#define L1D_FLUSH (1 << 0) /*
* Writeback and invalidate the
* L1 data cache.
*/
#define MSR_IA32_BBL_CR_CTL 0x00000119
#define MSR_IA32_BBL_CR_CTL3 0x0000011e

@ -29,8 +29,13 @@
#define N_EXCEPTION_STACKS 1
#ifdef CONFIG_X86_PAE
/* 44=32+12, the limit we can fit into an unsigned long pfn */
#define __PHYSICAL_MASK_SHIFT 44
/*
* This is beyond the 44 bit limit imposed by the 32bit long pfns,
* but we need the full mask to make sure inverted PROT_NONE
* entries have all the host bits set in a guest.
* The real limit is still 44 bits.
*/
#define __PHYSICAL_MASK_SHIFT 52
#define __VIRTUAL_MASK_SHIFT 32
#else /* !CONFIG_X86_PAE */

@ -95,4 +95,21 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi
#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
/* No inverted PFNs on 2 level page tables */
static inline u64 protnone_mask(u64 val)
{
return 0;
}
static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
{
return val;
}
static inline bool __pte_needs_invert(u64 val)
{
return false;
}
#endif /* _ASM_X86_PGTABLE_2LEVEL_H */

@ -206,12 +206,43 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
#endif
/* Encode and de-code a swap entry */
#define SWP_TYPE_BITS 5
#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
/* We always extract/encode the offset by shifting it all the way up, and then down again */
#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS)
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
#define __swp_type(x) (((x).val) & 0x1f)
#define __swp_offset(x) ((x).val >> 5)
#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
/*
* Normally, __swp_entry() converts from arch-independent swp_entry_t to
* arch-dependent swp_entry_t, and __swp_entry_to_pte() just stores the result
* to pte. But here we have 32bit swp_entry_t and 64bit pte, and need to use the
* whole 64 bits. Thus, we shift the "real" arch-dependent conversion to
* __swp_entry_to_pte() through the following helper macro based on 64bit
* __swp_entry().
*/
#define __swp_pteval_entry(type, offset) ((pteval_t) { \
(~(pteval_t)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
| ((pteval_t)(type) << (64 - SWP_TYPE_BITS)) })
#define __swp_entry_to_pte(x) ((pte_t){ .pte = \
__swp_pteval_entry(__swp_type(x), __swp_offset(x)) })
/*
* Analogically, __pte_to_swp_entry() doesn't just extract the arch-dependent
* swp_entry_t, but also has to convert it from 64bit to the 32bit
* intermediate representation, using the following macros based on 64bit
* __swp_type() and __swp_offset().
*/
#define __pteval_swp_type(x) ((unsigned long)((x).pte >> (64 - SWP_TYPE_BITS)))
#define __pteval_swp_offset(x) ((unsigned long)(~((x).pte) << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT))
#define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \
__pteval_swp_offset(pte)))
#define gup_get_pte gup_get_pte
/*
@ -260,4 +291,6 @@ static inline pte_t gup_get_pte(pte_t *ptep)
return pte;
}
#include <asm/pgtable-invert.h>
#endif /* _ASM_X86_PGTABLE_3LEVEL_H */

@ -0,0 +1,32 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_PGTABLE_INVERT_H
#define _ASM_PGTABLE_INVERT_H 1
#ifndef __ASSEMBLY__
static inline bool __pte_needs_invert(u64 val)
{
return !(val & _PAGE_PRESENT);
}
/* Get a mask to xor with the page table entry to get the correct pfn. */
static inline u64 protnone_mask(u64 val)
{
return __pte_needs_invert(val) ? ~0ull : 0;
}
static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
{
/*
* When a PTE transitions from NONE to !NONE or vice-versa
* invert the PFN part to stop speculation.
* pte_pfn undoes this when needed.
*/
if (__pte_needs_invert(oldval) != __pte_needs_invert(val))
val = (val & ~mask) | (~val & mask);
return val;
}
#endif /* __ASSEMBLY__ */
#endif

@ -185,19 +185,29 @@ static inline int pte_special(pte_t pte)
return pte_flags(pte) & _PAGE_SPECIAL;
}
/* Entries that were set to PROT_NONE are inverted */
static inline u64 protnone_mask(u64 val);
static inline unsigned long pte_pfn(pte_t pte)
{
return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT;
phys_addr_t pfn = pte_val(pte);
pfn ^= protnone_mask(pfn);
return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT;
}
static inline unsigned long pmd_pfn(pmd_t pmd)
{
return (pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
phys_addr_t pfn = pmd_val(pmd);
pfn ^= protnone_mask(pfn);
return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
}
static inline unsigned long pud_pfn(pud_t pud)
{
return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT;
phys_addr_t pfn = pud_val(pud);
pfn ^= protnone_mask(pfn);
return (pfn & pud_pfn_mask(pud)) >> PAGE_SHIFT;
}
static inline unsigned long p4d_pfn(p4d_t p4d)
@ -400,11 +410,6 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
return pmd_set_flags(pmd, _PAGE_RW);
}
static inline pmd_t pmd_mknotpresent(pmd_t pmd)
{
return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE);
}
static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
{
pudval_t v = native_pud_val(pud);
@ -459,11 +464,6 @@ static inline pud_t pud_mkwrite(pud_t pud)
return pud_set_flags(pud, _PAGE_RW);
}
static inline pud_t pud_mknotpresent(pud_t pud)
{
return pud_clear_flags(pud, _PAGE_PRESENT | _PAGE_PROTNONE);
}
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
static inline int pte_soft_dirty(pte_t pte)
{
@ -528,25 +528,45 @@ static inline pgprotval_t massage_pgprot(pgprot_t pgprot)
static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) |
massage_pgprot(pgprot));
phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
pfn ^= protnone_mask(pgprot_val(pgprot));
pfn &= PTE_PFN_MASK;
return __pte(pfn | massage_pgprot(pgprot));
}
static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
{
return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) |
massage_pgprot(pgprot));
phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
pfn ^= protnone_mask(pgprot_val(pgprot));
pfn &= PHYSICAL_PMD_PAGE_MASK;
return __pmd(pfn | massage_pgprot(pgprot));
}
static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot)
{
return __pud(((phys_addr_t)page_nr << PAGE_SHIFT) |
massage_pgprot(pgprot));
phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
pfn ^= protnone_mask(pgprot_val(pgprot));
pfn &= PHYSICAL_PUD_PAGE_MASK;
return __pud(pfn | massage_pgprot(pgprot));
}
static inline pmd_t pmd_mknotpresent(pmd_t pmd)
{
return pfn_pmd(pmd_pfn(pmd),
__pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE)));
}
static inline pud_t pud_mknotpresent(pud_t pud)
{
return pfn_pud(pud_pfn(pud),
__pgprot(pud_flags(pud) & ~(_PAGE_PRESENT|_PAGE_PROTNONE)));
}
static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
pteval_t val = pte_val(pte);
pteval_t val = pte_val(pte), oldval = val;
/*
* Chop off the NX bit (if present), and add the NX portion of
@ -554,17 +574,17 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
*/
val &= _PAGE_CHG_MASK;
val |= massage_pgprot(newprot) & ~_PAGE_CHG_MASK;
val = flip_protnone_guard(oldval, val, PTE_PFN_MASK);
return __pte(val);
}
static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
pmdval_t val = pmd_val(pmd);
pmdval_t val = pmd_val(pmd), oldval = val;
val &= _HPAGE_CHG_MASK;
val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK;
val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK);
return __pmd(val);
}
@ -1274,6 +1294,14 @@ static inline bool pud_access_permitted(pud_t pud, bool write)
return __pte_access_permitted(pud_val(pud), write);
}
#define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1
extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot);
static inline bool arch_has_pfn_modify_check(void)
{
return boot_cpu_has_bug(X86_BUG_L1TF);
}
#include <asm-generic/pgtable.h>
#endif /* __ASSEMBLY__ */

@ -276,7 +276,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
*
* | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number
* | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
* | OFFSET (14->63) | TYPE (9-13) |0|0|X|X| X| X|X|SD|0| <- swp entry
* | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry
*
* G (8) is aliased and used as a PROT_NONE indicator for
* !present ptes. We need to start storing swap entries above
@ -289,20 +289,34 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
*
* Bit 7 in swp entry should be 0 because pmd_present checks not only P,
* but also L and G.
*
* The offset is inverted by a binary not operation to make the high
* physical bits set.
*/
#define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
#define SWP_TYPE_BITS 5
/* Place the offset above the type: */
#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS)
#define SWP_TYPE_BITS 5
#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
/* We always extract/encode the offset by shifting it all the way up, and then down again */
#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS)
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
#define __swp_type(x) (((x).val >> (SWP_TYPE_FIRST_BIT)) \
& ((1U << SWP_TYPE_BITS) - 1))
#define __swp_offset(x) ((x).val >> SWP_OFFSET_FIRST_BIT)
#define __swp_entry(type, offset) ((swp_entry_t) { \
((type) << (SWP_TYPE_FIRST_BIT)) \
| ((offset) << SWP_OFFSET_FIRST_BIT) })
/* Extract the high bits for type */
#define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS))
/* Shift up (to get rid of type), then down to get value */
#define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)
/*
* Shift the offset up "too far" by TYPE bits, then down again
* The offset is inverted by a binary not operation to make the high
* physical bits set.
*/
#define __swp_entry(type, offset) ((swp_entry_t) { \
(~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
| ((unsigned long)(type) << (64-SWP_TYPE_BITS)) })
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) })
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
@ -346,5 +360,7 @@ static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
return true;
}
#include <asm/pgtable-invert.h>
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */

@ -180,6 +180,11 @@ extern const struct seq_operations cpuinfo_op;
extern void cpu_detect(struct cpuinfo_x86 *c);
static inline unsigned long l1tf_pfn_limit(void)
{
return BIT(boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT) - 1;
}
extern void early_cpu_init(void);
extern void identify_boot_cpu(void);
extern void identify_secondary_cpu(struct cpuinfo_x86 *);
@ -969,4 +974,16 @@ bool xen_set_default_idle(void);
void stop_this_cpu(void *dummy);
void df_debug(struct pt_regs *regs, long error_code);
void microcode_check(void);
enum l1tf_mitigations {
L1TF_MITIGATION_OFF,
L1TF_MITIGATION_FLUSH_NOWARN,
L1TF_MITIGATION_FLUSH,
L1TF_MITIGATION_FLUSH_NOSMT,
L1TF_MITIGATION_FULL,
L1TF_MITIGATION_FULL_FORCE
};
extern enum l1tf_mitigations l1tf_mitigation;
#endif /* _ASM_X86_PROCESSOR_H */

@ -170,7 +170,6 @@ static inline int wbinvd_on_all_cpus(void)
wbinvd();
return 0;
}
#define smp_num_siblings 1
#endif /* CONFIG_SMP */
extern unsigned disabled_cpus;

@ -123,13 +123,17 @@ static inline int topology_max_smt_threads(void)
}
int topology_update_package_map(unsigned int apicid, unsigned int cpu);
extern int topology_phys_to_logical_pkg(unsigned int pkg);
int topology_phys_to_logical_pkg(unsigned int pkg);
bool topology_is_primary_thread(unsigned int cpu);
bool topology_smt_supported(void);
#else
#define topology_max_packages() (1)
static inline int
topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; }
static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
static inline int topology_max_smt_threads(void) { return 1; }
static inline bool topology_is_primary_thread(unsigned int cpu) { return true; }
static inline bool topology_smt_supported(void) { return false; }
#endif
static inline void arch_fix_phys_package_id(int num, u32 slot)

@ -571,4 +571,15 @@ enum vm_instruction_error_number {
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28,
};
enum vmx_l1d_flush_state {
VMENTER_L1D_FLUSH_AUTO,
VMENTER_L1D_FLUSH_NEVER,
VMENTER_L1D_FLUSH_COND,
VMENTER_L1D_FLUSH_ALWAYS,
VMENTER_L1D_FLUSH_EPT_DISABLED,
VMENTER_L1D_FLUSH_NOT_REQUIRED,
};
extern enum vmx_l1d_flush_state l1tf_vmx_mitigation;
#endif

@ -34,6 +34,7 @@
#include <linux/dmi.h>
#include <linux/smp.h>
#include <linux/mm.h>
#include <linux/irq.h>
#include <asm/trace/irq_vectors.h>
#include <asm/irq_remapping.h>
@ -56,6 +57,7 @@
#include <asm/hypervisor.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include <asm/irq_regs.h>
unsigned int num_processors;
@ -2092,6 +2094,23 @@ static int cpuid_to_apicid[] = {
[0 ... NR_CPUS - 1] = -1,
};
#ifdef CONFIG_SMP
/**
* apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread
* @id: APIC ID to check
*/
bool apic_id_is_primary_thread(unsigned int apicid)
{
u32 mask;
if (smp_num_siblings == 1)
return true;
/* Isolate the SMT bit(s) in the APICID and check for 0 */
mask = (1U << (fls(smp_num_siblings) - 1)) - 1;
return !(apicid & mask);
}
#endif
/*
* Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids
* and cpuid_to_apicid[] synchronized.

@ -16,6 +16,8 @@
#include <linux/device.h>
#include <linux/pci.h>
#include <linux/htirq.h>
#include <linux/irq.h>
#include <asm/irqdomain.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>

@ -33,6 +33,7 @@
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/sched.h>

@ -12,6 +12,7 @@
*/
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/hpet.h>

@ -11,6 +11,7 @@
* published by the Free Software Foundation.
*/
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/slab.h>

@ -298,7 +298,6 @@ static int nearby_node(int apicid)
}
#endif
#ifdef CONFIG_SMP
/*
* Fix up cpu_core_id for pre-F17h systems to be in the
* [0 .. cores_per_node - 1] range. Not really needed but
@ -315,6 +314,13 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
c->cpu_core_id %= cus_per_node;
}
static void amd_get_topology_early(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_TOPOEXT))
smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
}
/*
* Fixup core topology information for
* (1) AMD multi-node processors
@ -333,7 +339,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
node_id = ecx & 0xff;
smp_num_siblings = ((ebx >> 8) & 0xff) + 1;
if (c->x86 == 0x15)
c->cu_id = ebx & 0xff;
@ -376,7 +381,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
legacy_fixup_core_id(c);
}
}
#endif
/*
* On a AMD dual core setup the lower bits of the APIC id distinguish the cores.
@ -384,7 +388,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
*/
static void amd_detect_cmp(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
unsigned bits;
int cpu = smp_processor_id();
@ -396,16 +399,11 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
/* use socket ID also for last level cache */
per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
amd_get_topology(c);
#endif
}
u16 amd_get_nb_id(int cpu)
{
u16 id = 0;
#ifdef CONFIG_SMP
id = per_cpu(cpu_llc_id, cpu);
#endif
return id;
return per_cpu(cpu_llc_id, cpu);
}
EXPORT_SYMBOL_GPL(amd_get_nb_id);
@ -579,6 +577,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
static void early_init_amd(struct cpuinfo_x86 *c)
{
u64 value;
u32 dummy;
early_init_amd_mc(c);
@ -668,6 +667,22 @@ static void early_init_amd(struct cpuinfo_x86 *c)
clear_cpu_cap(c, X86_FEATURE_SME);
}
}
/* Re-enable TopologyExtensions if switched off by BIOS */
if (c->x86 == 0x15 &&
(c->x86_model >= 0x10 && c->x86_model <= 0x6f) &&
!cpu_has(c, X86_FEATURE_TOPOEXT)) {
if (msr_set_bit(0xc0011005, 54) > 0) {
rdmsrl(0xc0011005, value);
if (value & BIT_64(54)) {
set_cpu_cap(c, X86_FEATURE_TOPOEXT);
pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
}
}
}
amd_get_topology_early(c);
}
static void init_amd_k8(struct cpuinfo_x86 *c)
@ -759,19 +774,6 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
{
u64 value;
/* re-enable TopologyExtensions if switched off by BIOS */
if ((c->x86_model >= 0x10) && (c->x86_model <= 0x6f) &&
!cpu_has(c, X86_FEATURE_TOPOEXT)) {
if (msr_set_bit(0xc0011005, 54) > 0) {
rdmsrl(0xc0011005, value);
if (value & BIT_64(54)) {
set_cpu_cap(c, X86_FEATURE_TOPOEXT);
pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
}
}
}
/*
* The way access filter has a performance penalty on some workloads.
* Disable it on the affected CPUs.
@ -835,15 +837,8 @@ static void init_amd(struct cpuinfo_x86 *c)
cpu_detect_cache_sizes(c);
/* Multi core CPU? */
if (c->extended_cpuid_level >= 0x80000008) {
amd_detect_cmp(c);
srat_detect_node(c);
}
#ifdef CONFIG_X86_32
detect_ht(c);
#endif
amd_detect_cmp(c);
srat_detect_node(c);
init_amd_cacheinfo(c);

@ -22,14 +22,17 @@
#include <asm/processor-flags.h>
#include <asm/fpu/internal.h>
#include <asm/msr.h>
#include <asm/vmx.h>
#include <asm/paravirt.h>
#include <asm/alternative.h>
#include <asm/pgtable.h>
#include <asm/set_memory.h>
#include <asm/intel-family.h>
#include <asm/e820/api.h>
static void __init spectre_v2_select_mitigation(void);
static void __init ssb_select_mitigation(void);
static void __init l1tf_select_mitigation(void);
/*
* Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
@ -55,6 +58,12 @@ void __init check_bugs(void)
{
identify_boot_cpu();
/*
* identify_boot_cpu() initialized SMT support information, let the
* core code know.
*/
cpu_smt_check_topology_early();
if (!IS_ENABLED(CONFIG_SMP)) {
pr_info("CPU: ");
print_cpu_info(&boot_cpu_data);
@ -81,6 +90,8 @@ void __init check_bugs(void)
*/
ssb_select_mitigation();
l1tf_select_mitigation();
#ifdef CONFIG_X86_32
/*
* Check whether we are able to run this kernel safely on SMP.
@ -311,23 +322,6 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
return cmd;
}
/* Check for Skylake-like CPUs (for RSB handling) */
static bool __init is_skylake_era(void)
{
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
boot_cpu_data.x86 == 6) {
switch (boot_cpu_data.x86_model) {
case INTEL_FAM6_SKYLAKE_MOBILE:
case INTEL_FAM6_SKYLAKE_DESKTOP:
case INTEL_FAM6_SKYLAKE_X:
case INTEL_FAM6_KABYLAKE_MOBILE:
case INTEL_FAM6_KABYLAKE_DESKTOP:
return true;
}
}
return false;
}
static void __init spectre_v2_select_mitigation(void)
{
enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
@ -388,22 +382,15 @@ retpoline_auto:
pr_info("%s\n", spectre_v2_strings[mode]);
/*
* If neither SMEP nor PTI are available, there is a risk of
* hitting userspace addresses in the RSB after a context switch
* from a shallow call stack to a deeper one. To prevent this fill
* the entire RSB, even when using IBRS.
* If spectre v2 protection has been enabled, unconditionally fill
* RSB during a context switch; this protects against two independent
* issues:
*
* Skylake era CPUs have a separate issue with *underflow* of the
* RSB, when they will predict 'ret' targets from the generic BTB.
* The proper mitigation for this is IBRS. If IBRS is not supported
* or deactivated in favour of retpolines the RSB fill on context
* switch is required.
* - RSB underflow (and switch to BTB) on Skylake+
* - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs
*/
if ((!boot_cpu_has(X86_FEATURE_PTI) &&
!boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) {
setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
pr_info("Spectre v2 mitigation: Filling RSB on context switch\n");
}
setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
/* Initialize Indirect Branch Prediction Barrier if supported */
if (boot_cpu_has(X86_FEATURE_IBPB)) {
@ -654,8 +641,121 @@ void x86_spec_ctrl_setup_ap(void)
x86_amd_ssb_disable();
}
#undef pr_fmt
#define pr_fmt(fmt) "L1TF: " fmt
/* Default mitigation for L1TF-affected CPUs */
enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH;
#if IS_ENABLED(CONFIG_KVM_INTEL)
EXPORT_SYMBOL_GPL(l1tf_mitigation);
enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation);
#endif
static void __init l1tf_select_mitigation(void)
{
u64 half_pa;
if (!boot_cpu_has_bug(X86_BUG_L1TF))
return;
switch (l1tf_mitigation) {
case L1TF_MITIGATION_OFF:
case L1TF_MITIGATION_FLUSH_NOWARN:
case L1TF_MITIGATION_FLUSH:
break;
case L1TF_MITIGATION_FLUSH_NOSMT:
case L1TF_MITIGATION_FULL:
cpu_smt_disable(false);
break;
case L1TF_MITIGATION_FULL_FORCE:
cpu_smt_disable(true);
break;
}
#if CONFIG_PGTABLE_LEVELS == 2
pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n");
return;
#endif
/*
* This is extremely unlikely to happen because almost all
* systems have far more MAX_PA/2 than RAM can be fit into
* DIMM slots.
*/
half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT;
if (e820__mapped_any(half_pa, ULLONG_MAX - half_pa, E820_TYPE_RAM)) {
pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n");
return;
}
setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV);
}
static int __init l1tf_cmdline(char *str)
{
if (!boot_cpu_has_bug(X86_BUG_L1TF))
return 0;
if (!str)
return -EINVAL;
if (!strcmp(str, "off"))
l1tf_mitigation = L1TF_MITIGATION_OFF;
else if (!strcmp(str, "flush,nowarn"))
l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOWARN;
else if (!strcmp(str, "flush"))
l1tf_mitigation = L1TF_MITIGATION_FLUSH;
else if (!strcmp(str, "flush,nosmt"))
l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
else if (!strcmp(str, "full"))
l1tf_mitigation = L1TF_MITIGATION_FULL;
else if (!strcmp(str, "full,force"))
l1tf_mitigation = L1TF_MITIGATION_FULL_FORCE;
return 0;
}
early_param("l1tf", l1tf_cmdline);
#undef pr_fmt
#ifdef CONFIG_SYSFS
#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
#if IS_ENABLED(CONFIG_KVM_INTEL)
static const char *l1tf_vmx_states[] = {
[VMENTER_L1D_FLUSH_AUTO] = "auto",
[VMENTER_L1D_FLUSH_NEVER] = "vulnerable",
[VMENTER_L1D_FLUSH_COND] = "conditional cache flushes",
[VMENTER_L1D_FLUSH_ALWAYS] = "cache flushes",
[VMENTER_L1D_FLUSH_EPT_DISABLED] = "EPT disabled",
[VMENTER_L1D_FLUSH_NOT_REQUIRED] = "flush not necessary"
};
static ssize_t l1tf_show_state(char *buf)
{
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO)
return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED ||
(l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER &&
cpu_smt_control == CPU_SMT_ENABLED))
return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG,
l1tf_vmx_states[l1tf_vmx_mitigation]);
return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG,
l1tf_vmx_states[l1tf_vmx_mitigation],
cpu_smt_control == CPU_SMT_ENABLED ? "vulnerable" : "disabled");
}
#else
static ssize_t l1tf_show_state(char *buf)
{
return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
}
#endif
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
@ -681,6 +781,10 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_SPEC_STORE_BYPASS:
return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
case X86_BUG_L1TF:
if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV))
return l1tf_show_state(buf);
break;
default:
break;
}
@ -707,4 +811,9 @@ ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *
{
return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS);
}
ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf)
{
return cpu_show_common(dev, attr, buf, X86_BUG_L1TF);
}
#endif

@ -66,6 +66,13 @@ cpumask_var_t cpu_callin_mask;
/* representing cpus for which sibling maps can be computed */
cpumask_var_t cpu_sibling_setup_mask;
/* Number of siblings per CPU package */
int smp_num_siblings = 1;
EXPORT_SYMBOL(smp_num_siblings);
/* Last level cache ID of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
/* correctly size the local cpu masks */
void __init setup_cpu_local_masks(void)
{
@ -614,33 +621,36 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c)
tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]);
}
void detect_ht(struct cpuinfo_x86 *c)
int detect_ht_early(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
u32 eax, ebx, ecx, edx;
int index_msb, core_bits;
static bool printed;
if (!cpu_has(c, X86_FEATURE_HT))
return;
return -1;
if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
goto out;
return -1;
if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
return;
return -1;
cpuid(1, &eax, &ebx, &ecx, &edx);
smp_num_siblings = (ebx & 0xff0000) >> 16;
if (smp_num_siblings == 1) {
if (smp_num_siblings == 1)
pr_info_once("CPU0: Hyper-Threading is disabled\n");
goto out;
}
#endif
return 0;
}
if (smp_num_siblings <= 1)
goto out;
void detect_ht(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
int index_msb, core_bits;
if (detect_ht_early(c) < 0)
return;
index_msb = get_count_order(smp_num_siblings);
c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
@ -653,15 +663,6 @@ void detect_ht(struct cpuinfo_x86 *c)
c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
((1 << core_bits) - 1);
out:
if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) {
pr_info("CPU: Physical Processor ID: %d\n",
c->phys_proc_id);
pr_info("CPU: Processor Core ID: %d\n",
c->cpu_core_id);
printed = 1;
}
#endif
}
@ -933,6 +934,21 @@ static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
{}
};
static const __initconst struct x86_cpu_id cpu_no_l1tf[] = {
/* in addition to cpu_no_speculation */
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM },
{}
};
static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
{
u64 ia32_cap = 0;
@ -958,6 +974,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
return;
setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
if (x86_match_cpu(cpu_no_l1tf))
return;
setup_force_cpu_bug(X86_BUG_L1TF);
}
/*

@ -47,6 +47,8 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
extern void get_cpu_cap(struct cpuinfo_x86 *c);
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
extern int detect_extended_topology_early(struct cpuinfo_x86 *c);
extern int detect_ht_early(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);

@ -301,6 +301,13 @@ static void early_init_intel(struct cpuinfo_x86 *c)
}
check_mpx_erratum(c);
/*
* Get the number of SMT siblings early from the extended topology
* leaf, if available. Otherwise try the legacy SMT detection.
*/
if (detect_extended_topology_early(c) < 0)
detect_ht_early(c);
}
#ifdef CONFIG_X86_32

@ -509,12 +509,20 @@ static struct platform_device *microcode_pdev;
static int check_online_cpus(void)
{
if (num_online_cpus() == num_present_cpus())
return 0;
unsigned int cpu;
pr_err("Not all CPUs online, aborting microcode update.\n");
/*
* Make sure all CPUs are online. It's fine for SMT to be disabled if
* all the primary threads are still online.
*/
for_each_present_cpu(cpu) {
if (topology_is_primary_thread(cpu) && !cpu_online(cpu)) {
pr_err("Not all CPUs online, aborting microcode update.\n");
return -EINVAL;
}
}
return -EINVAL;
return 0;
}
static atomic_t late_cpus_in;

@ -27,16 +27,13 @@
* exists, use it for populating initial_apicid and cpu topology
* detection.
*/
void detect_extended_topology(struct cpuinfo_x86 *c)
int detect_extended_topology_early(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
unsigned int eax, ebx, ecx, edx, sub_index;
unsigned int ht_mask_width, core_plus_mask_width;
unsigned int core_select_mask, core_level_siblings;
static bool printed;
unsigned int eax, ebx, ecx, edx;
if (c->cpuid_level < 0xb)
return;
return -1;
cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
@ -44,7 +41,7 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
* check if the cpuid leaf 0xb is actually implemented.
*/
if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
return;
return -1;
set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
@ -52,10 +49,30 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
* initial apic id, which also represents 32-bit extended x2apic id.
*/
c->initial_apicid = edx;
smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
#endif
return 0;
}
/*
* Check for extended topology enumeration cpuid leaf 0xb and if it
* exists, use it for populating initial_apicid and cpu topology
* detection.
*/
void detect_extended_topology(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
unsigned int eax, ebx, ecx, edx, sub_index;
unsigned int ht_mask_width, core_plus_mask_width;
unsigned int core_select_mask, core_level_siblings;
if (detect_extended_topology_early(c) < 0)
return;
/*
* Populate HT related information from sub-leaf level 0.
*/
cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
@ -86,15 +103,5 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
c->x86_max_cores = (core_level_siblings / smp_num_siblings);
if (!printed) {
pr_info("CPU: Physical Processor ID: %d\n",
c->phys_proc_id);
if (c->x86_max_cores > 1)
pr_info("CPU: Processor Core ID: %d\n",
c->cpu_core_id);
printed = 1;
}
return;
#endif
}

@ -10,6 +10,7 @@
#include <asm/fpu/signal.h>
#include <asm/fpu/types.h>
#include <asm/traps.h>
#include <asm/irq_regs.h>
#include <linux/hardirq.h>
#include <linux/pkeys.h>

@ -27,6 +27,7 @@
#include <asm/set_memory.h>
#include <asm/kprobes.h>
#include <asm/sections.h>
#include <asm/ftrace.h>
#include <asm/nops.h>

@ -1,6 +1,7 @@
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/errno.h>

@ -5,6 +5,7 @@
#include <linux/sched.h>
#include <linux/ioport.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/timex.h>
#include <linux/random.h>
#include <linux/init.h>

@ -8,6 +8,7 @@
#include <asm/traps.h>
#include <asm/proto.h>
#include <asm/desc.h>
#include <asm/hw_irq.h>
struct idt_data {
unsigned int vector;

@ -10,6 +10,7 @@
#include <linux/ftrace.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/irq.h>
#include <asm/apic.h>
#include <asm/io_apic.h>

@ -11,6 +11,7 @@
#include <linux/seq_file.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/kernel_stat.h>
#include <linux/notifier.h>
#include <linux/cpu.h>

@ -11,6 +11,7 @@
#include <linux/kernel_stat.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/seq_file.h>
#include <linux/delay.h>
#include <linux/ftrace.h>

@ -5,6 +5,7 @@
#include <linux/sched.h>
#include <linux/ioport.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/timex.h>
#include <linux/random.h>
#include <linux/kprobes.h>

@ -63,6 +63,7 @@
#include <asm/insn.h>
#include <asm/debugreg.h>
#include <asm/set_memory.h>
#include <asm/sections.h>
#include "common.h"
@ -394,8 +395,6 @@ int __copy_instruction(u8 *dest, u8 *src, struct insn *insn)
- (u8 *) dest;
if ((s64) (s32) newdisp != newdisp) {
pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
pr_err("\tSrc: %p, Dest: %p, old disp: %x\n",
src, dest, insn->displacement.value);
return 0;
}
disp = (u8 *) dest + insn_offset_displacement(insn);
@ -621,8 +620,7 @@ static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
* Raise a BUG or we'll continue in an endless reentering loop
* and eventually a stack overflow.
*/
printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
p->addr);
pr_err("Unrecoverable kprobe detected.\n");
dump_kprobe(p);
BUG();
default:

@ -88,10 +88,12 @@ unsigned paravirt_patch_call(void *insnbuf,
struct branch *b = insnbuf;
unsigned long delta = (unsigned long)target - (addr+5);
if (tgt_clobbers & ~site_clobbers)
return len; /* target would clobber too much for this site */
if (len < 5)
if (len < 5) {
#ifdef CONFIG_RETPOLINE
WARN_ONCE("Failing to patch indirect CALL in %ps\n", (void *)addr);
#endif
return len; /* call too long for patch site */
}
b->opcode = 0xe8; /* call */
b->delta = delta;
@ -106,8 +108,12 @@ unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
struct branch *b = insnbuf;
unsigned long delta = (unsigned long)target - (addr+5);
if (len < 5)
if (len < 5) {
#ifdef CONFIG_RETPOLINE
WARN_ONCE("Failing to patch indirect JMP in %ps\n", (void *)addr);
#endif
return len; /* call too long for patch site */
}
b->opcode = 0xe9; /* jmp */
b->delta = delta;

@ -852,6 +852,12 @@ void __init setup_arch(char **cmdline_p)
memblock_reserve(__pa_symbol(_text),
(unsigned long)__bss_stop - (unsigned long)_text);
/*
* Make sure page 0 is always reserved because on systems with
* L1TF its contents can be leaked to user processes.
*/
memblock_reserve(0, PAGE_SIZE);
early_reserve_initrd();
/*

@ -261,6 +261,7 @@ __visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs)
{
ack_APIC_irq();
inc_irq_stat(irq_resched_count);
kvm_set_cpu_l1tf_flush_l1d();
if (trace_resched_ipi_enabled()) {
/*

@ -78,13 +78,7 @@
#include <asm/realmode.h>
#include <asm/misc.h>
#include <asm/spec-ctrl.h>
/* Number of siblings per CPU package */
int smp_num_siblings = 1;
EXPORT_SYMBOL(smp_num_siblings);
/* Last level cache ID of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
#include <asm/hw_irq.h>
/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
@ -311,6 +305,23 @@ found:
return 0;
}
/**
* topology_is_primary_thread - Check whether CPU is the primary SMT thread
* @cpu: CPU to check
*/
bool topology_is_primary_thread(unsigned int cpu)
{
return apic_id_is_primary_thread(per_cpu(x86_cpu_to_apicid, cpu));
}
/**
* topology_smt_supported - Check whether SMT is supported by the CPUs
*/
bool topology_smt_supported(void)
{
return smp_num_siblings > 1;
}
/**
* topology_phys_to_logical_pkg - Map a physical package id to a logical
*

@ -12,6 +12,7 @@
#include <linux/clockchips.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/i8253.h>
#include <linux/time.h>
#include <linux/export.h>

@ -3825,6 +3825,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
{
int r = 1;
vcpu->arch.l1tf_flush_l1d = true;
switch (vcpu->arch.apf.host_apf_reason) {
default:
trace_kvm_page_fault(fault_address, error_code);

@ -175,6 +175,8 @@ struct vcpu_svm {
uint64_t sysenter_eip;
uint64_t tsc_aux;
u64 msr_decfg;
u64 next_rip;
u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
@ -1616,6 +1618,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
u32 dummy;
u32 eax = 1;
vcpu->arch.microcode_version = 0x01000065;
svm->spec_ctrl = 0;
svm->virt_spec_ctrl = 0;
@ -3555,6 +3558,22 @@ static int cr8_write_interception(struct vcpu_svm *svm)
return 0;
}
static int svm_get_msr_feature(struct kvm_msr_entry *msr)
{
msr->data = 0;
switch (msr->index) {
case MSR_F10H_DECFG:
if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
break;
default:
return 1;
}
return 0;
}
static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_svm *svm = to_svm(vcpu);
@ -3637,9 +3656,6 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = svm->virt_spec_ctrl;
break;
case MSR_IA32_UCODE_REV:
msr_info->data = 0x01000065;
break;
case MSR_F15H_IC_CFG: {
int family, model;
@ -3657,6 +3673,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = 0x1E;
}
break;
case MSR_F10H_DECFG:
msr_info->data = svm->msr_decfg;
break;
default:
return kvm_get_msr_common(vcpu, msr_info);
}
@ -3845,6 +3864,24 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_VM_IGNNE:
vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
break;
case MSR_F10H_DECFG: {
struct kvm_msr_entry msr_entry;
msr_entry.index = msr->index;
if (svm_get_msr_feature(&msr_entry))
return 1;
/* Check the supported bits */
if (data & ~msr_entry.data)
return 1;
/* Don't allow the guest to change a bit, #GP */
if (!msr->host_initiated && (data ^ msr_entry.data))
return 1;
svm->msr_decfg = data;
break;
}
case MSR_IA32_APICBASE:
if (kvm_vcpu_apicv_active(vcpu))
avic_update_vapic_bar(to_svm(vcpu), data);
@ -5588,6 +5625,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.vcpu_unblocking = svm_vcpu_unblocking,
.update_bp_intercept = update_bp_intercept,
.get_msr_feature = svm_get_msr_feature,
.get_msr = svm_get_msr,
.set_msr = svm_set_msr,
.get_segment_base = svm_get_segment_base,

@ -191,6 +191,150 @@ module_param(ple_window_max, int, S_IRUGO);
extern const ulong vmx_return;
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
static DEFINE_MUTEX(vmx_l1d_flush_mutex);
/* Storage for pre module init parameter parsing */
static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
static const struct {
const char *option;
enum vmx_l1d_flush_state cmd;
} vmentry_l1d_param[] = {
{"auto", VMENTER_L1D_FLUSH_AUTO},
{"never", VMENTER_L1D_FLUSH_NEVER},
{"cond", VMENTER_L1D_FLUSH_COND},
{"always", VMENTER_L1D_FLUSH_ALWAYS},
};
#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;
static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
struct page *page;
unsigned int i;
if (!enable_ept) {
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
return 0;
}
if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
u64 msr;
rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
return 0;
}
}
/* If set to auto use the default l1tf mitigation method */
if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
switch (l1tf_mitigation) {
case L1TF_MITIGATION_OFF:
l1tf = VMENTER_L1D_FLUSH_NEVER;
break;
case L1TF_MITIGATION_FLUSH_NOWARN:
case L1TF_MITIGATION_FLUSH:
case L1TF_MITIGATION_FLUSH_NOSMT:
l1tf = VMENTER_L1D_FLUSH_COND;
break;
case L1TF_MITIGATION_FULL:
case L1TF_MITIGATION_FULL_FORCE:
l1tf = VMENTER_L1D_FLUSH_ALWAYS;
break;
}
} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
l1tf = VMENTER_L1D_FLUSH_ALWAYS;
}
if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
!boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
if (!page)
return -ENOMEM;
vmx_l1d_flush_pages = page_address(page);
/*
* Initialize each page with a different pattern in
* order to protect against KSM in the nested
* virtualization case.
*/
for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
PAGE_SIZE);
}
}
l1tf_vmx_mitigation = l1tf;
if (l1tf != VMENTER_L1D_FLUSH_NEVER)
static_branch_enable(&vmx_l1d_should_flush);
else
static_branch_disable(&vmx_l1d_should_flush);
if (l1tf == VMENTER_L1D_FLUSH_COND)
static_branch_enable(&vmx_l1d_flush_cond);
else
static_branch_disable(&vmx_l1d_flush_cond);
return 0;
}
static int vmentry_l1d_flush_parse(const char *s)
{
unsigned int i;
if (s) {
for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
if (sysfs_streq(s, vmentry_l1d_param[i].option))
return vmentry_l1d_param[i].cmd;
}
}
return -EINVAL;
}
static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
{
int l1tf, ret;
if (!boot_cpu_has(X86_BUG_L1TF))
return 0;
l1tf = vmentry_l1d_flush_parse(s);
if (l1tf < 0)
return l1tf;
/*
* Has vmx_init() run already? If not then this is the pre init
* parameter parsing. In that case just store the value and let
* vmx_init() do the proper setup after enable_ept has been
* established.
*/
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
vmentry_l1d_flush_param = l1tf;
return 0;
}
mutex_lock(&vmx_l1d_flush_mutex);
ret = vmx_setup_l1d_flush(l1tf);
mutex_unlock(&vmx_l1d_flush_mutex);
return ret;
}
static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}
static const struct kernel_param_ops vmentry_l1d_flush_ops = {
.set = vmentry_l1d_flush_set,
.get = vmentry_l1d_flush_get,
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
#define NR_AUTOLOAD_MSRS 8
struct vmcs {
@ -567,6 +711,11 @@ static inline int pi_test_sn(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control);
}
struct vmx_msrs {
unsigned int nr;
struct vmx_msr_entry val[NR_AUTOLOAD_MSRS];
};
struct vcpu_vmx {
struct kvm_vcpu vcpu;
unsigned long host_rsp;
@ -600,9 +749,8 @@ struct vcpu_vmx {
struct loaded_vmcs *loaded_vmcs;
bool __launched; /* temporary, used in vmx_vcpu_run */
struct msr_autoload {
unsigned nr;
struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
struct vmx_msrs guest;
struct vmx_msrs host;
} msr_autoload;
struct {
int loaded;
@ -1967,9 +2115,20 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
vm_exit_controls_clearbit(vmx, exit);
}
static int find_msr(struct vmx_msrs *m, unsigned int msr)
{
unsigned int i;
for (i = 0; i < m->nr; ++i) {
if (m->val[i].index == msr)
return i;
}
return -ENOENT;
}
static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
unsigned i;
int i;
struct msr_autoload *m = &vmx->msr_autoload;
switch (msr) {
@ -1990,18 +2149,21 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
}
break;
}
i = find_msr(&m->guest, msr);
if (i < 0)
goto skip_guest;
--m->guest.nr;
m->guest.val[i] = m->guest.val[m->guest.nr];
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
for (i = 0; i < m->nr; ++i)
if (m->guest[i].index == msr)
break;
if (i == m->nr)
skip_guest:
i = find_msr(&m->host, msr);
if (i < 0)
return;
--m->nr;
m->guest[i] = m->guest[m->nr];
m->host[i] = m->host[m->nr];
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
--m->host.nr;
m->host.val[i] = m->host.val[m->host.nr];
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
}
static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
@ -2016,9 +2178,9 @@ static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
}
static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
u64 guest_val, u64 host_val)
u64 guest_val, u64 host_val, bool entry_only)
{
unsigned i;
int i, j = 0;
struct msr_autoload *m = &vmx->msr_autoload;
switch (msr) {
@ -2053,24 +2215,31 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
}
for (i = 0; i < m->nr; ++i)
if (m->guest[i].index == msr)
break;
i = find_msr(&m->guest, msr);
if (!entry_only)
j = find_msr(&m->host, msr);
if (i == NR_AUTOLOAD_MSRS) {
if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) {
printk_once(KERN_WARNING "Not enough msr switch entries. "
"Can't add msr %x\n", msr);
return;
} else if (i == m->nr) {
++m->nr;
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
}
if (i < 0) {
i = m->guest.nr++;
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
}
m->guest.val[i].index = msr;
m->guest.val[i].value = guest_val;
m->guest[i].index = msr;
m->guest[i].value = guest_val;
m->host[i].index = msr;
m->host[i].value = host_val;
if (entry_only)
return;
if (j < 0) {
j = m->host.nr++;
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
}
m->host.val[j].index = msr;
m->host.val[j].value = host_val;
}
static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
@ -2114,7 +2283,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
guest_efer &= ~EFER_LME;
if (guest_efer != host_efer)
add_atomic_switch_msr(vmx, MSR_EFER,
guest_efer, host_efer);
guest_efer, host_efer, false);
return false;
} else {
guest_efer &= ~ignore_bits;
@ -3266,6 +3435,11 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
return !(val & ~valid_bits);
}
static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
{
return 1;
}
/*
* Reads an msr value (of 'msr_index') into 'pdata'.
* Returns 0 on success, non-0 otherwise.
@ -3523,7 +3697,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.ia32_xss = data;
if (vcpu->arch.ia32_xss != host_xss)
add_atomic_switch_msr(vmx, MSR_IA32_XSS,
vcpu->arch.ia32_xss, host_xss);
vcpu->arch.ia32_xss, host_xss, false);
else
clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
break;
@ -5714,9 +5888,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
@ -5736,8 +5910,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
++vmx->nmsrs;
}
if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
vmx->arch_capabilities = kvm_get_arch_capabilities();
vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
@ -5770,6 +5943,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmx->rmode.vm86_active = 0;
vmx->spec_ctrl = 0;
vcpu->arch.microcode_version = 0x100000000ULL;
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
kvm_set_cr8(vcpu, 0);
@ -8987,6 +9161,79 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
}
}
/*
* Software based L1D cache flush which is used when microcode providing
* the cache control MSR is not loaded.
*
* The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
* flush it is required to read in 64 KiB because the replacement algorithm
* is not exactly LRU. This could be sized at runtime via topology
* information but as all relevant affected CPUs have 32KiB L1D cache size
* there is no point in doing so.
*/
#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;
static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{
int size = PAGE_SIZE << L1D_CACHE_ORDER;
/*
* This code is only executed when the the flush mode is 'cond' or
* 'always'
*/
if (static_branch_likely(&vmx_l1d_flush_cond)) {
bool flush_l1d;
/*
* Clear the per-vcpu flush bit, it gets set again
* either from vcpu_run() or from one of the unsafe
* VMEXIT handlers.
*/
flush_l1d = vcpu->arch.l1tf_flush_l1d;
vcpu->arch.l1tf_flush_l1d = false;
/*
* Clear the per-cpu flush bit, it gets set again from
* the interrupt handlers.
*/
flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
kvm_clear_cpu_l1tf_flush_l1d();
if (!flush_l1d)
return;
}
vcpu->stat.l1d_flush++;
if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
return;
}
asm volatile(
/* First ensure the pages are in the TLB */
"xorl %%eax, %%eax\n"
".Lpopulate_tlb:\n\t"
"movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
"addl $4096, %%eax\n\t"
"cmpl %%eax, %[size]\n\t"
"jne .Lpopulate_tlb\n\t"
"xorl %%eax, %%eax\n\t"
"cpuid\n\t"
/* Now fill the cache */
"xorl %%eax, %%eax\n"
".Lfill_cache:\n"
"movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
"addl $64, %%eax\n\t"
"cmpl %%eax, %[size]\n\t"
"jne .Lfill_cache\n\t"
"lfence\n"
:: [flush_pages] "r" (vmx_l1d_flush_pages),
[size] "r" (size)
: "eax", "ebx", "ecx", "edx");
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@ -9390,7 +9637,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
clear_atomic_switch_msr(vmx, msrs[i].msr);
else
add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
msrs[i].host);
msrs[i].host, false);
}
static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
@ -9483,6 +9730,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->__launched = vmx->loaded_vmcs->launched;
if (static_branch_unlikely(&vmx_l1d_should_flush))
vmx_l1d_flush(vcpu);
asm(
/* Store host registers */
"push %%" _ASM_DX "; push %%" _ASM_BP ";"
@ -9835,6 +10085,37 @@ free_vcpu:
return ERR_PTR(err);
}
#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
static int vmx_vm_init(struct kvm *kvm)
{
if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
switch (l1tf_mitigation) {
case L1TF_MITIGATION_OFF:
case L1TF_MITIGATION_FLUSH_NOWARN:
/* 'I explicitly don't care' is set */
break;
case L1TF_MITIGATION_FLUSH:
case L1TF_MITIGATION_FLUSH_NOSMT:
case L1TF_MITIGATION_FULL:
/*
* Warn upon starting the first VM in a potentially
* insecure environment.
*/
if (cpu_smt_control == CPU_SMT_ENABLED)
pr_warn_once(L1TF_MSG_SMT);
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
pr_warn_once(L1TF_MSG_L1D);
break;
case L1TF_MITIGATION_FULL_FORCE:
/* Flush is enforced */
break;
}
}
return 0;
}
static void __init vmx_check_processor_compat(void *rtn)
{
struct vmcs_config vmcs_conf;
@ -10774,10 +11055,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* Set the MSR load/store lists to match L0's settings.
*/
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
/*
* HOST_RSP is normally set correctly in vmx_vcpu_run() just before
@ -11202,6 +11483,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (ret)
return ret;
/* Hide L1D cache contents from the nested guest. */
vmx->vcpu.arch.l1tf_flush_l1d = true;
/*
* If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
* by event injection, halt vcpu.
@ -11712,8 +11996,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmx_segment_cache_clear(vmx);
/* Update any VMCS fields that might have changed while L2 ran */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
if (vmx->hv_deadline_tsc == -1)
vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
@ -12225,6 +12509,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.cpu_has_accelerated_tpr = report_flexpriority,
.has_emulated_msr = vmx_has_emulated_msr,
.vm_init = vmx_vm_init,
.vcpu_create = vmx_create_vcpu,
.vcpu_free = vmx_free_vcpu,
.vcpu_reset = vmx_vcpu_reset,
@ -12234,6 +12520,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.vcpu_put = vmx_vcpu_put,
.update_bp_intercept = update_exception_bitmap,
.get_msr_feature = vmx_get_msr_feature,
.get_msr = vmx_get_msr,
.set_msr = vmx_set_msr,
.get_segment_base = vmx_get_segment_base,
@ -12341,22 +12628,18 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.setup_mce = vmx_setup_mce,
};
static int __init vmx_init(void)
static void vmx_cleanup_l1d_flush(void)
{
int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
__alignof__(struct vcpu_vmx), THIS_MODULE);
if (r)
return r;
#ifdef CONFIG_KEXEC_CORE
rcu_assign_pointer(crash_vmclear_loaded_vmcss,
crash_vmclear_local_loaded_vmcss);
#endif
return 0;
if (vmx_l1d_flush_pages) {
free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
vmx_l1d_flush_pages = NULL;
}
/* Restore state so sysfs ignores VMX */
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}
static void __exit vmx_exit(void)
static void vmx_exit(void)
{
#ifdef CONFIG_KEXEC_CORE
RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
@ -12364,7 +12647,40 @@ static void __exit vmx_exit(void)
#endif
kvm_exit();
vmx_cleanup_l1d_flush();
}
module_exit(vmx_exit)
static int __init vmx_init(void)
{
int r;
r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
__alignof__(struct vcpu_vmx), THIS_MODULE);
if (r)
return r;
/*
* Must be called after kvm_init() so enable_ept is properly set
* up. Hand the parameter mitigation value in which was stored in
* the pre module init parser. If no parameter was given, it will
* contain 'auto' which will be turned into the default 'cond'
* mitigation mode.
*/
if (boot_cpu_has(X86_BUG_L1TF)) {
r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
if (r) {
vmx_exit();
return r;
}
}
#ifdef CONFIG_KEXEC_CORE
rcu_assign_pointer(crash_vmclear_loaded_vmcss,
crash_vmclear_local_loaded_vmcss);
#endif
return 0;
}
module_init(vmx_init)
module_exit(vmx_exit)

@ -181,6 +181,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "irq_injections", VCPU_STAT(irq_injections) },
{ "nmi_injections", VCPU_STAT(nmi_injections) },
{ "req_event", VCPU_STAT(req_event) },
{ "l1d_flush", VCPU_STAT(l1d_flush) },
{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@ -1041,6 +1042,71 @@ static u32 emulated_msrs[] = {
static unsigned num_emulated_msrs;
/*
* List of msr numbers which are used to expose MSR-based features that
* can be used by a hypervisor to validate requested CPU features.
*/
static u32 msr_based_features[] = {
MSR_F10H_DECFG,
MSR_IA32_UCODE_REV,
MSR_IA32_ARCH_CAPABILITIES,
};
static unsigned int num_msr_based_features;
u64 kvm_get_arch_capabilities(void)
{
u64 data;
rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
/*
* If we're doing cache flushes (either "always" or "cond")
* we will do one whenever the guest does a vmlaunch/vmresume.
* If an outer hypervisor is doing the cache flush for us
* (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
* capability to the guest too, and if EPT is disabled we're not
* vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will
* require a nested hypervisor to do a flush of its own.
*/
if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
return data;
}
EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
{
switch (msr->index) {
case MSR_IA32_ARCH_CAPABILITIES:
msr->data = kvm_get_arch_capabilities();
break;
case MSR_IA32_UCODE_REV:
rdmsrl_safe(msr->index, &msr->data);
break;
default:
if (kvm_x86_ops->get_msr_feature(msr))
return 1;
}
return 0;
}
static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
struct kvm_msr_entry msr;
int r;
msr.index = index;
r = kvm_get_msr_feature(&msr);
if (r)
return r;
*data = msr.data;
return 0;
}
bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
{
if (efer & efer_reserved_bits)
@ -2156,7 +2222,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr) {
case MSR_AMD64_NB_CFG:
case MSR_IA32_UCODE_REV:
case MSR_IA32_UCODE_WRITE:
case MSR_VM_HSAVE_PA:
case MSR_AMD64_PATCH_LOADER:
@ -2164,6 +2229,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_AMD64_DC_CFG:
break;
case MSR_IA32_UCODE_REV:
if (msr_info->host_initiated)
vcpu->arch.microcode_version = data;
break;
case MSR_EFER:
return set_efer(vcpu, data);
case MSR_K7_HWCR:
@ -2450,7 +2519,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = 0;
break;
case MSR_IA32_UCODE_REV:
msr_info->data = 0x100000000ULL;
msr_info->data = vcpu->arch.microcode_version;
break;
case MSR_MTRRcap:
case 0x200 ... 0x2ff:
@ -2600,13 +2669,11 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
int (*do_msr)(struct kvm_vcpu *vcpu,
unsigned index, u64 *data))
{
int i, idx;
int i;
idx = srcu_read_lock(&vcpu->kvm->srcu);
for (i = 0; i < msrs->nmsrs; ++i)
if (do_msr(vcpu, entries[i].index, &entries[i].data))
break;
srcu_read_unlock(&vcpu->kvm->srcu, idx);
return i;
}
@ -2705,6 +2772,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SET_BOOT_CPU_ID:
case KVM_CAP_SPLIT_IRQCHIP:
case KVM_CAP_IMMEDIATE_EXIT:
case KVM_CAP_GET_MSR_FEATURES:
r = 1;
break;
case KVM_CAP_ADJUST_CLOCK:
@ -2819,6 +2887,31 @@ long kvm_arch_dev_ioctl(struct file *filp,
goto out;
r = 0;
break;
case KVM_GET_MSR_FEATURE_INDEX_LIST: {
struct kvm_msr_list __user *user_msr_list = argp;
struct kvm_msr_list msr_list;
unsigned int n;
r = -EFAULT;
if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
goto out;
n = msr_list.nmsrs;
msr_list.nmsrs = num_msr_based_features;
if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
goto out;
r = -E2BIG;
if (n < msr_list.nmsrs)
goto out;
r = -EFAULT;
if (copy_to_user(user_msr_list->indices, &msr_based_features,
num_msr_based_features * sizeof(u32)))
goto out;
r = 0;
break;
}
case KVM_GET_MSRS:
r = msr_io(NULL, argp, do_get_msr_feature, 1);
break;
}
default:
r = -EINVAL;
@ -3553,12 +3646,18 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = 0;
break;
}
case KVM_GET_MSRS:
case KVM_GET_MSRS: {
int idx = srcu_read_lock(&vcpu->kvm->srcu);
r = msr_io(vcpu, argp, do_get_msr, 1);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
break;
case KVM_SET_MSRS:
}
case KVM_SET_MSRS: {
int idx = srcu_read_lock(&vcpu->kvm->srcu);
r = msr_io(vcpu, argp, do_set_msr, 0);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
break;
}
case KVM_TPR_ACCESS_REPORTING: {
struct kvm_tpr_access_ctl tac;
@ -4333,6 +4432,19 @@ static void kvm_init_msr_list(void)
j++;
}
num_emulated_msrs = j;
for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) {
struct kvm_msr_entry msr;
msr.index = msr_based_features[i];
if (kvm_get_msr_feature(&msr))
continue;
if (j < i)
msr_based_features[j] = msr_based_features[i];
j++;
}
num_msr_based_features = j;
}
static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
@ -4573,6 +4685,9 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v
int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
unsigned int bytes, struct x86_exception *exception)
{
/* kvm_write_guest_virt_system can pull in tons of pages. */
vcpu->arch.l1tf_flush_l1d = true;
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
PFERR_WRITE_MASK, exception);
}
@ -5701,6 +5816,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
bool writeback = true;
bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
vcpu->arch.l1tf_flush_l1d = true;
/*
* Clear write_fault_to_shadow_pgtable here to ensure it is
* never reused.
@ -7146,6 +7263,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
struct kvm *kvm = vcpu->kvm;
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
vcpu->arch.l1tf_flush_l1d = true;
for (;;) {
if (kvm_vcpu_running(vcpu)) {
@ -8153,6 +8271,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
vcpu->arch.l1tf_flush_l1d = true;
kvm_x86_ops->sched_in(vcpu, cpu);
}

@ -24,6 +24,7 @@
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
#include <asm/mmu_context.h> /* vma_pkey() */
#include <asm/sections.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>

@ -4,6 +4,8 @@
#include <linux/swap.h>
#include <linux/memblock.h>
#include <linux/bootmem.h> /* for max_low_pfn */
#include <linux/swapfile.h>
#include <linux/swapops.h>
#include <asm/set_memory.h>
#include <asm/e820/api.h>
@ -880,3 +882,26 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
__cachemode2pte_tbl[cache] = __cm_idx2pte(entry);
__pte2cachemode_tbl[entry] = cache;
}
#ifdef CONFIG_SWAP
unsigned long max_swapfile_size(void)
{
unsigned long pages;
pages = generic_max_swapfile_size();
if (boot_cpu_has_bug(X86_BUG_L1TF)) {
/* Limit the swap file size to MAX_PA/2 for L1TF workaround */
unsigned long l1tf_limit = l1tf_pfn_limit() + 1;
/*
* We encode swap offsets also with 3 bits below those for pfn
* which makes the usable limit higher.
*/
#if CONFIG_PGTABLE_LEVELS > 2
l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT;
#endif
pages = min_t(unsigned long, l1tf_limit, pages);
}
return pages;
}
#endif

@ -126,24 +126,29 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr)
static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
{
pmd_t new_pmd;
pmdval_t v = pmd_val(*pmd);
if (clear) {
*old = v & _PAGE_PRESENT;
v &= ~_PAGE_PRESENT;
} else /* presume this has been called with clear==true previously */
v |= *old;
set_pmd(pmd, __pmd(v));
*old = v;
new_pmd = pmd_mknotpresent(*pmd);
} else {
/* Presume this has been called with clear==true previously */
new_pmd = __pmd(*old);
}
set_pmd(pmd, new_pmd);
}
static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
{
pteval_t v = pte_val(*pte);
if (clear) {
*old = v & _PAGE_PRESENT;
v &= ~_PAGE_PRESENT;
} else /* presume this has been called with clear==true previously */
v |= *old;
set_pte_atomic(pte, __pte(v));
*old = v;
/* Nothing should care about address */
pte_clear(&init_mm, 0, pte);
} else {
/* Presume this has been called with clear==true previously */
set_pte_atomic(pte, __pte(*old));
}
}
static int clear_page_presence(struct kmmio_fault_page *f, bool clear)

@ -174,3 +174,24 @@ const char *arch_vma_name(struct vm_area_struct *vma)
return "[mpx]";
return NULL;
}
/*
* Only allow root to set high MMIO mappings to PROT_NONE.
* This prevents an unpriv. user to set them to PROT_NONE and invert
* them, then pointing to valid memory for L1TF speculation.
*
* Note: for locked down kernels may want to disable the root override.
*/
bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
{
if (!boot_cpu_has_bug(X86_BUG_L1TF))
return true;
if (!__pte_needs_invert(pgprot_val(prot)))
return true;
/* If it's real memory always allow */
if (pfn_valid(pfn))
return true;
if (pfn > l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN))
return false;
return true;
}

@ -1006,8 +1006,8 @@ static long populate_pmd(struct cpa_data *cpa,
pmd = pmd_offset(pud, start);
set_pmd(pmd, __pmd(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
massage_pgprot(pmd_pgprot)));
set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn,
canon_pgprot(pmd_pgprot))));
start += PMD_SIZE;
cpa->pfn += PMD_SIZE >> PAGE_SHIFT;
@ -1079,8 +1079,8 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
* Map everything starting from the Gb boundary, possibly with 1G pages
*/
while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) {
set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
massage_pgprot(pud_pgprot)));
set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn,
canon_pgprot(pud_pgprot))));
start += PUD_SIZE;
cpa->pfn += PUD_SIZE >> PAGE_SHIFT;

@ -45,6 +45,7 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/sections.h>
#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt

@ -18,6 +18,7 @@
#include <asm/intel-mid.h>
#include <asm/intel_scu_ipc.h>
#include <asm/io_apic.h>
#include <asm/hw_irq.h>
#define TANGIER_EXT_TIMER0_MSI 12

@ -1285,6 +1285,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
struct msg_desc msgdesc;
ack_APIC_irq();
kvm_set_cpu_l1tf_flush_l1d();
time_start = get_cycles();
bcp = &per_cpu(bau_control, smp_processor_id());

@ -3,6 +3,7 @@
#endif
#include <linux/cpu.h>
#include <linux/kexec.h>
#include <linux/slab.h>
#include <xen/features.h>
#include <xen/page.h>

@ -527,16 +527,24 @@ ssize_t __weak cpu_show_spec_store_bypass(struct device *dev,
return sprintf(buf, "Not affected\n");
}
ssize_t __weak cpu_show_l1tf(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sprintf(buf, "Not affected\n");
}
static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL);
static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL);
static struct attribute *cpu_root_vulnerabilities_attrs[] = {
&dev_attr_meltdown.attr,
&dev_attr_spectre_v1.attr,
&dev_attr_spectre_v2.attr,
&dev_attr_spec_store_bypass.attr,
&dev_attr_l1tf.attr,
NULL
};

@ -115,12 +115,12 @@ static inline struct sk_buff *hci_uart_dequeue(struct hci_uart *hu)
struct sk_buff *skb = hu->tx_skb;
if (!skb) {
read_lock(&hu->proto_lock);
percpu_down_read(&hu->proto_lock);
if (test_bit(HCI_UART_PROTO_READY, &hu->flags))
skb = hu->proto->dequeue(hu);
read_unlock(&hu->proto_lock);
percpu_up_read(&hu->proto_lock);
} else {
hu->tx_skb = NULL;
}
@ -130,7 +130,14 @@ static inline struct sk_buff *hci_uart_dequeue(struct hci_uart *hu)
int hci_uart_tx_wakeup(struct hci_uart *hu)
{
read_lock(&hu->proto_lock);
/* This may be called in an IRQ context, so we can't sleep. Therefore
* we try to acquire the lock only, and if that fails we assume the
* tty is being closed because that is the only time the write lock is
* acquired. If, however, at some point in the future the write lock
* is also acquired in other situations, then this must be revisited.
*/
if (!percpu_down_read_trylock(&hu->proto_lock))
return 0;
if (!test_bit(HCI_UART_PROTO_READY, &hu->flags))
goto no_schedule;
@ -145,7 +152,7 @@ int hci_uart_tx_wakeup(struct hci_uart *hu)
schedule_work(&hu->write_work);
no_schedule:
read_unlock(&hu->proto_lock);
percpu_up_read(&hu->proto_lock);
return 0;
}
@ -247,12 +254,12 @@ static int hci_uart_flush(struct hci_dev *hdev)
tty_ldisc_flush(tty);
tty_driver_flush_buffer(tty);
read_lock(&hu->proto_lock);
percpu_down_read(&hu->proto_lock);
if (test_bit(HCI_UART_PROTO_READY, &hu->flags))
hu->proto->flush(hu);
read_unlock(&hu->proto_lock);
percpu_up_read(&hu->proto_lock);
return 0;
}
@ -275,15 +282,15 @@ static int hci_uart_send_frame(struct hci_dev *hdev, struct sk_buff *skb)
BT_DBG("%s: type %d len %d", hdev->name, hci_skb_pkt_type(skb),
skb->len);
read_lock(&hu->proto_lock);
percpu_down_read(&hu->proto_lock);
if (!test_bit(HCI_UART_PROTO_READY, &hu->flags)) {
read_unlock(&hu->proto_lock);
percpu_up_read(&hu->proto_lock);
return -EUNATCH;
}
hu->proto->enqueue(hu, skb);
read_unlock(&hu->proto_lock);
percpu_up_read(&hu->proto_lock);
hci_uart_tx_wakeup(hu);
@ -486,7 +493,7 @@ static int hci_uart_tty_open(struct tty_struct *tty)
INIT_WORK(&hu->init_ready, hci_uart_init_work);
INIT_WORK(&hu->write_work, hci_uart_write_work);
rwlock_init(&hu->proto_lock);
percpu_init_rwsem(&hu->proto_lock);
/* Flush any pending characters in the driver */
tty_driver_flush_buffer(tty);
@ -503,7 +510,6 @@ static void hci_uart_tty_close(struct tty_struct *tty)
{
struct hci_uart *hu = tty->disc_data;
struct hci_dev *hdev;
unsigned long flags;
BT_DBG("tty %p", tty);
@ -518,9 +524,9 @@ static void hci_uart_tty_close(struct tty_struct *tty)
hci_uart_close(hdev);
if (test_bit(HCI_UART_PROTO_READY, &hu->flags)) {
write_lock_irqsave(&hu->proto_lock, flags);
percpu_down_write(&hu->proto_lock);
clear_bit(HCI_UART_PROTO_READY, &hu->flags);
write_unlock_irqrestore(&hu->proto_lock, flags);
percpu_up_write(&hu->proto_lock);
cancel_work_sync(&hu->write_work);
@ -582,10 +588,10 @@ static void hci_uart_tty_receive(struct tty_struct *tty, const u8 *data,
if (!hu || tty != hu->tty)
return;
read_lock(&hu->proto_lock);
percpu_down_read(&hu->proto_lock);
if (!test_bit(HCI_UART_PROTO_READY, &hu->flags)) {
read_unlock(&hu->proto_lock);
percpu_up_read(&hu->proto_lock);
return;
}
@ -593,7 +599,7 @@ static void hci_uart_tty_receive(struct tty_struct *tty, const u8 *data,
* tty caller
*/
hu->proto->recv(hu, data, count);
read_unlock(&hu->proto_lock);
percpu_up_read(&hu->proto_lock);
if (hu->hdev)
hu->hdev->stat.byte_rx += count;

@ -304,6 +304,7 @@ int hci_uart_register_device(struct hci_uart *hu,
hci_set_drvdata(hdev, hu);
INIT_WORK(&hu->write_work, hci_uart_write_work);
percpu_init_rwsem(&hu->proto_lock);
/* Only when vendor specific setup callback is provided, consider
* the manufacturer information valid. This avoids filling in the

@ -87,7 +87,7 @@ struct hci_uart {
struct work_struct write_work;
const struct hci_uart_proto *proto;
rwlock_t proto_lock; /* Stop work for proto close */
struct percpu_rw_semaphore proto_lock; /* Stop work for proto close */
void *priv;
struct sk_buff *tx_skb;

@ -62,6 +62,7 @@
#include <linux/acpi.h>
#include <linux/device.h>
#include <linux/irq.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>

@ -2544,6 +2544,9 @@ static int qcom_nand_host_init(struct qcom_nand_controller *nandc,
nand_set_flash_node(chip, dn);
mtd->name = devm_kasprintf(dev, GFP_KERNEL, "qcom_nand.%d", host->cs);
if (!mtd->name)
return -ENOMEM;
mtd->owner = THIS_MODULE;
mtd->dev.parent = dev;

@ -894,7 +894,6 @@ static RING_IDX xennet_fill_frags(struct netfront_queue *queue,
struct sk_buff *skb,
struct sk_buff_head *list)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
RING_IDX cons = queue->rx.rsp_cons;
struct sk_buff *nskb;
@ -903,15 +902,16 @@ static RING_IDX xennet_fill_frags(struct netfront_queue *queue,
RING_GET_RESPONSE(&queue->rx, ++cons);
skb_frag_t *nfrag = &skb_shinfo(nskb)->frags[0];
if (shinfo->nr_frags == MAX_SKB_FRAGS) {
if (skb_shinfo(skb)->nr_frags == MAX_SKB_FRAGS) {
unsigned int pull_to = NETFRONT_SKB_CB(skb)->pull_to;
BUG_ON(pull_to <= skb_headlen(skb));
__pskb_pull_tail(skb, pull_to - skb_headlen(skb));
}
BUG_ON(shinfo->nr_frags >= MAX_SKB_FRAGS);
BUG_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS);
skb_add_rx_frag(skb, shinfo->nr_frags, skb_frag_page(nfrag),
skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
skb_frag_page(nfrag),
rx->offset, rx->status, PAGE_SIZE);
skb_shinfo(nskb)->nr_frags = 0;

@ -53,6 +53,8 @@
#include <linux/delay.h>
#include <linux/semaphore.h>
#include <linux/irqdomain.h>
#include <linux/irq.h>
#include <asm/irqdomain.h>
#include <asm/apic.h>
#include <linux/msi.h>

@ -438,9 +438,9 @@ static void u2_phy_instance_init(struct mtk_tphy *tphy,
u32 index = instance->index;
u32 tmp;
/* switch to USB function. (system register, force ip into usb mode) */
/* switch to USB function, and enable usb pll */
tmp = readl(com + U3P_U2PHYDTM0);
tmp &= ~P2C_FORCE_UART_EN;
tmp &= ~(P2C_FORCE_UART_EN | P2C_FORCE_SUSPENDM);
tmp |= P2C_RG_XCVRSEL_VAL(1) | P2C_RG_DATAIN_VAL(0);
writel(tmp, com + U3P_U2PHYDTM0);
@ -500,10 +500,8 @@ static void u2_phy_instance_power_on(struct mtk_tphy *tphy,
u32 index = instance->index;
u32 tmp;
/* (force_suspendm=0) (let suspendm=1, enable usb 480MHz pll) */
tmp = readl(com + U3P_U2PHYDTM0);
tmp &= ~(P2C_FORCE_SUSPENDM | P2C_RG_XCVRSEL);
tmp &= ~(P2C_RG_DATAIN | P2C_DTM0_PART_MASK);
tmp &= ~(P2C_RG_XCVRSEL | P2C_RG_DATAIN | P2C_DTM0_PART_MASK);
writel(tmp, com + U3P_U2PHYDTM0);
/* OTG Enable */
@ -538,7 +536,6 @@ static void u2_phy_instance_power_off(struct mtk_tphy *tphy,
tmp = readl(com + U3P_U2PHYDTM0);
tmp &= ~(P2C_RG_XCVRSEL | P2C_RG_DATAIN);
tmp |= P2C_FORCE_SUSPENDM;
writel(tmp, com + U3P_U2PHYDTM0);
/* OTG Disable */
@ -546,18 +543,16 @@ static void u2_phy_instance_power_off(struct mtk_tphy *tphy,
tmp &= ~PA6_RG_U2_OTG_VBUSCMP_EN;
writel(tmp, com + U3P_USBPHYACR6);
/* let suspendm=0, set utmi into analog power down */
tmp = readl(com + U3P_U2PHYDTM0);
tmp &= ~P2C_RG_SUSPENDM;
writel(tmp, com + U3P_U2PHYDTM0);
udelay(1);
tmp = readl(com + U3P_U2PHYDTM1);
tmp &= ~(P2C_RG_VBUSVALID | P2C_RG_AVALID);
tmp |= P2C_RG_SESSEND;
writel(tmp, com + U3P_U2PHYDTM1);
if (tphy->pdata->avoid_rx_sen_degradation && index) {
tmp = readl(com + U3P_U2PHYDTM0);
tmp &= ~(P2C_RG_SUSPENDM | P2C_FORCE_SUSPENDM);
writel(tmp, com + U3P_U2PHYDTM0);
tmp = readl(com + U3D_U2PHYDCR0);
tmp &= ~P2C_RG_SIF_U2PLL_FORCE_ON;
writel(tmp, com + U3D_U2PHYDCR0);

@ -474,6 +474,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
shost->dma_boundary = 0xffffffff;
shost->use_blk_mq = scsi_use_blk_mq;
shost->use_blk_mq = scsi_use_blk_mq || shost->hostt->force_blk_mq;
device_initialize(&shost->shost_gendev);
dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);

@ -1040,11 +1040,7 @@ static void set_performant_mode(struct ctlr_info *h, struct CommandList *c,
c->busaddr |= 1 | (h->blockFetchTable[c->Header.SGList] << 1);
if (unlikely(!h->msix_vectors))
return;
if (likely(reply_queue == DEFAULT_REPLY_QUEUE))
c->Header.ReplyQueue =
raw_smp_processor_id() % h->nreply_queues;
else
c->Header.ReplyQueue = reply_queue % h->nreply_queues;
c->Header.ReplyQueue = reply_queue;
}
}
@ -1058,10 +1054,7 @@ static void set_ioaccel1_performant_mode(struct ctlr_info *h,
* Tell the controller to post the reply to the queue for this
* processor. This seems to give the best I/O throughput.
*/
if (likely(reply_queue == DEFAULT_REPLY_QUEUE))
cp->ReplyQueue = smp_processor_id() % h->nreply_queues;
else
cp->ReplyQueue = reply_queue % h->nreply_queues;
cp->ReplyQueue = reply_queue;
/*
* Set the bits in the address sent down to include:
* - performant mode bit (bit 0)
@ -1082,10 +1075,7 @@ static void set_ioaccel2_tmf_performant_mode(struct ctlr_info *h,
/* Tell the controller to post the reply to the queue for this
* processor. This seems to give the best I/O throughput.
*/
if (likely(reply_queue == DEFAULT_REPLY_QUEUE))
cp->reply_queue = smp_processor_id() % h->nreply_queues;
else
cp->reply_queue = reply_queue % h->nreply_queues;
cp->reply_queue = reply_queue;
/* Set the bits in the address sent down to include:
* - performant mode bit not used in ioaccel mode 2
* - pull count (bits 0-3)
@ -1104,10 +1094,7 @@ static void set_ioaccel2_performant_mode(struct ctlr_info *h,
* Tell the controller to post the reply to the queue for this
* processor. This seems to give the best I/O throughput.
*/
if (likely(reply_queue == DEFAULT_REPLY_QUEUE))
cp->reply_queue = smp_processor_id() % h->nreply_queues;
else
cp->reply_queue = reply_queue % h->nreply_queues;
cp->reply_queue = reply_queue;
/*
* Set the bits in the address sent down to include:
* - performant mode bit not used in ioaccel mode 2
@ -1152,6 +1139,8 @@ static void __enqueue_cmd_and_start_io(struct ctlr_info *h,
{
dial_down_lockup_detection_during_fw_flash(h, c);
atomic_inc(&h->commands_outstanding);
reply_queue = h->reply_map[raw_smp_processor_id()];
switch (c->cmd_type) {
case CMD_IOACCEL1:
set_ioaccel1_performant_mode(h, c, reply_queue);
@ -7244,6 +7233,26 @@ static void hpsa_disable_interrupt_mode(struct ctlr_info *h)
h->msix_vectors = 0;
}
static void hpsa_setup_reply_map(struct ctlr_info *h)
{
const struct cpumask *mask;
unsigned int queue, cpu;
for (queue = 0; queue < h->msix_vectors; queue++) {
mask = pci_irq_get_affinity(h->pdev, queue);
if (!mask)
goto fallback;
for_each_cpu(cpu, mask)
h->reply_map[cpu] = queue;
}
return;
fallback:
for_each_possible_cpu(cpu)
h->reply_map[cpu] = 0;
}
/* If MSI/MSI-X is supported by the kernel we will try to enable it on
* controllers that are capable. If not, we use legacy INTx mode.
*/
@ -7639,6 +7648,10 @@ static int hpsa_pci_init(struct ctlr_info *h)
err = hpsa_interrupt_mode(h);
if (err)
goto clean1;
/* setup mapping between CPU and reply queue */
hpsa_setup_reply_map(h);
err = hpsa_pci_find_memory_BAR(h->pdev, &h->paddr);
if (err)
goto clean2; /* intmode+region, pci */
@ -8284,6 +8297,28 @@ static struct workqueue_struct *hpsa_create_controller_wq(struct ctlr_info *h,
return wq;
}
static void hpda_free_ctlr_info(struct ctlr_info *h)
{
kfree(h->reply_map);
kfree(h);
}
static struct ctlr_info *hpda_alloc_ctlr_info(void)
{
struct ctlr_info *h;
h = kzalloc(sizeof(*h), GFP_KERNEL);
if (!h)
return NULL;
h->reply_map = kzalloc(sizeof(*h->reply_map) * nr_cpu_ids, GFP_KERNEL);
if (!h->reply_map) {
kfree(h);
return NULL;
}
return h;
}
static int hpsa_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
{
int dac, rc;
@ -8321,7 +8356,7 @@ reinit_after_soft_reset:
* the driver. See comments in hpsa.h for more info.
*/
BUILD_BUG_ON(sizeof(struct CommandList) % COMMANDLIST_ALIGNMENT);
h = kzalloc(sizeof(*h), GFP_KERNEL);
h = hpda_alloc_ctlr_info();
if (!h) {
dev_err(&pdev->dev, "Failed to allocate controller head\n");
return -ENOMEM;
@ -8726,7 +8761,7 @@ static void hpsa_remove_one(struct pci_dev *pdev)
h->lockup_detected = NULL; /* init_one 2 */
/* (void) pci_disable_pcie_error_reporting(pdev); */ /* init_one 1 */
kfree(h); /* init_one 1 */
hpda_free_ctlr_info(h); /* init_one 1 */
}
static int hpsa_suspend(__attribute__((unused)) struct pci_dev *pdev,

@ -158,6 +158,7 @@ struct bmic_controller_parameters {
#pragma pack()
struct ctlr_info {
unsigned int *reply_map;
int ctlr;
char devname[8];
char *product_name;

@ -2128,34 +2128,11 @@ __qla2x00_alloc_iocbs(struct qla_qpair *qpair, srb_t *sp)
req_cnt = 1;
handle = 0;
if (!sp)
goto skip_cmd_array;
/* Check for room in outstanding command list. */
handle = req->current_outstanding_cmd;
for (index = 1; index < req->num_outstanding_cmds; index++) {
handle++;
if (handle == req->num_outstanding_cmds)
handle = 1;
if (!req->outstanding_cmds[handle])
break;
}
if (index == req->num_outstanding_cmds) {
ql_log(ql_log_warn, vha, 0x700b,
"No room on outstanding cmd array.\n");
goto queuing_error;
}
/* Prep command array. */
req->current_outstanding_cmd = handle;
req->outstanding_cmds[handle] = sp;
sp->handle = handle;
/* Adjust entry-counts as needed. */
if (sp->type != SRB_SCSI_CMD)
if (sp && (sp->type != SRB_SCSI_CMD)) {
/* Adjust entry-counts as needed. */
req_cnt = sp->iocbs;
}
skip_cmd_array:
/* Check for room on request queue. */
if (req->cnt < req_cnt + 2) {
if (ha->mqenable || IS_QLA83XX(ha) || IS_QLA27XX(ha))
@ -2179,6 +2156,28 @@ skip_cmd_array:
if (req->cnt < req_cnt + 2)
goto queuing_error;
if (sp) {
/* Check for room in outstanding command list. */
handle = req->current_outstanding_cmd;
for (index = 1; index < req->num_outstanding_cmds; index++) {
handle++;
if (handle == req->num_outstanding_cmds)
handle = 1;
if (!req->outstanding_cmds[handle])
break;
}
if (index == req->num_outstanding_cmds) {
ql_log(ql_log_warn, vha, 0x700b,
"No room on outstanding cmd array.\n");
goto queuing_error;
}
/* Prep command array. */
req->current_outstanding_cmd = handle;
req->outstanding_cmds[handle] = sp;
sp->handle = handle;
}
/* Prep packet */
req->cnt -= req_cnt;
pkt = req->ring_ptr;
@ -2191,6 +2190,8 @@ skip_cmd_array:
pkt->handle = handle;
}
return pkt;
queuing_error:
qpair->tgt_counters.num_alloc_iocb_failed++;
return pkt;

@ -523,18 +523,26 @@ static int sr_init_command(struct scsi_cmnd *SCpnt)
static int sr_block_open(struct block_device *bdev, fmode_t mode)
{
struct scsi_cd *cd;
struct scsi_device *sdev;
int ret = -ENXIO;
cd = scsi_cd_get(bdev->bd_disk);
if (!cd)
goto out;
sdev = cd->device;
scsi_autopm_get_device(sdev);
check_disk_change(bdev);
mutex_lock(&sr_mutex);
cd = scsi_cd_get(bdev->bd_disk);
if (cd) {
ret = cdrom_open(&cd->cdi, bdev, mode);
if (ret)
scsi_cd_put(cd);
}
ret = cdrom_open(&cd->cdi, bdev, mode);
mutex_unlock(&sr_mutex);
scsi_autopm_put_device(sdev);
if (ret)
scsi_cd_put(cd);
out:
return ret;
}
@ -562,6 +570,8 @@ static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
if (ret)
goto out;
scsi_autopm_get_device(sdev);
/*
* Send SCSI addressing ioctls directly to mid level, send other
* ioctls to cdrom/block level.
@ -570,15 +580,18 @@ static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case SCSI_IOCTL_GET_IDLUN:
case SCSI_IOCTL_GET_BUS_NUMBER:
ret = scsi_ioctl(sdev, cmd, argp);
goto out;
goto put;
}
ret = cdrom_ioctl(&cd->cdi, bdev, mode, cmd, arg);
if (ret != -ENOSYS)
goto out;
goto put;
ret = scsi_ioctl(sdev, cmd, argp);
put:
scsi_autopm_put_device(sdev);
out:
mutex_unlock(&sr_mutex);
return ret;

@ -91,9 +91,6 @@ struct virtio_scsi_vq {
struct virtio_scsi_target_state {
seqcount_t tgt_seq;
/* Count of outstanding requests. */
atomic_t reqs;
/* Currently active virtqueue for requests sent to this target. */
struct virtio_scsi_vq *req_vq;
};
@ -152,8 +149,6 @@ static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf)
struct virtio_scsi_cmd *cmd = buf;
struct scsi_cmnd *sc = cmd->sc;
struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd;
struct virtio_scsi_target_state *tgt =
scsi_target(sc->device)->hostdata;
dev_dbg(&sc->device->sdev_gendev,
"cmd %p response %u status %#02x sense_len %u\n",
@ -210,8 +205,6 @@ static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf)
}
sc->scsi_done(sc);
atomic_dec(&tgt->reqs);
}
static void virtscsi_vq_done(struct virtio_scsi *vscsi,
@ -580,10 +573,7 @@ static int virtscsi_queuecommand_single(struct Scsi_Host *sh,
struct scsi_cmnd *sc)
{
struct virtio_scsi *vscsi = shost_priv(sh);
struct virtio_scsi_target_state *tgt =
scsi_target(sc->device)->hostdata;
atomic_inc(&tgt->reqs);
return virtscsi_queuecommand(vscsi, &vscsi->req_vqs[0], sc);
}
@ -596,55 +586,11 @@ static struct virtio_scsi_vq *virtscsi_pick_vq_mq(struct virtio_scsi *vscsi,
return &vscsi->req_vqs[hwq];
}
static struct virtio_scsi_vq *virtscsi_pick_vq(struct virtio_scsi *vscsi,
struct virtio_scsi_target_state *tgt)
{
struct virtio_scsi_vq *vq;
unsigned long flags;
u32 queue_num;
local_irq_save(flags);
if (atomic_inc_return(&tgt->reqs) > 1) {
unsigned long seq;
do {
seq = read_seqcount_begin(&tgt->tgt_seq);
vq = tgt->req_vq;
} while (read_seqcount_retry(&tgt->tgt_seq, seq));
} else {
/* no writes can be concurrent because of atomic_t */
write_seqcount_begin(&tgt->tgt_seq);
/* keep previous req_vq if a reader just arrived */
if (unlikely(atomic_read(&tgt->reqs) > 1)) {
vq = tgt->req_vq;
goto unlock;
}
queue_num = smp_processor_id();
while (unlikely(queue_num >= vscsi->num_queues))
queue_num -= vscsi->num_queues;
tgt->req_vq = vq = &vscsi->req_vqs[queue_num];
unlock:
write_seqcount_end(&tgt->tgt_seq);
}
local_irq_restore(flags);
return vq;
}
static int virtscsi_queuecommand_multi(struct Scsi_Host *sh,
struct scsi_cmnd *sc)
{
struct virtio_scsi *vscsi = shost_priv(sh);
struct virtio_scsi_target_state *tgt =
scsi_target(sc->device)->hostdata;
struct virtio_scsi_vq *req_vq;
if (shost_use_blk_mq(sh))
req_vq = virtscsi_pick_vq_mq(vscsi, sc);
else
req_vq = virtscsi_pick_vq(vscsi, tgt);
struct virtio_scsi_vq *req_vq = virtscsi_pick_vq_mq(vscsi, sc);
return virtscsi_queuecommand(vscsi, req_vq, sc);
}
@ -775,7 +721,6 @@ static int virtscsi_target_alloc(struct scsi_target *starget)
return -ENOMEM;
seqcount_init(&tgt->tgt_seq);
atomic_set(&tgt->reqs, 0);
tgt->req_vq = &vscsi->req_vqs[0];
starget->hostdata = tgt;
@ -823,6 +768,7 @@ static struct scsi_host_template virtscsi_host_template_single = {
.target_alloc = virtscsi_target_alloc,
.target_destroy = virtscsi_target_destroy,
.track_queue_depth = 1,
.force_blk_mq = 1,
};
static struct scsi_host_template virtscsi_host_template_multi = {
@ -844,6 +790,7 @@ static struct scsi_host_template virtscsi_host_template_multi = {
.target_destroy = virtscsi_target_destroy,
.map_queues = virtscsi_map_queues,
.track_queue_depth = 1,
.force_blk_mq = 1,
};
#define virtscsi_config_get(vdev, fld) \

@ -357,14 +357,11 @@ static void dentry_unlink_inode(struct dentry * dentry)
__releases(dentry->d_inode->i_lock)
{
struct inode *inode = dentry->d_inode;
bool hashed = !d_unhashed(dentry);
if (hashed)
raw_write_seqcount_begin(&dentry->d_seq);
raw_write_seqcount_begin(&dentry->d_seq);
__d_clear_type_and_inode(dentry);
hlist_del_init(&dentry->d_u.d_alias);
if (hashed)
raw_write_seqcount_end(&dentry->d_seq);
raw_write_seqcount_end(&dentry->d_seq);
spin_unlock(&dentry->d_lock);
spin_unlock(&inode->i_lock);
if (!inode->i_nlink)
@ -1922,10 +1919,12 @@ struct dentry *d_make_root(struct inode *root_inode)
if (root_inode) {
res = __d_alloc(root_inode->i_sb, NULL);
if (res)
if (res) {
res->d_flags |= DCACHE_RCUACCESS;
d_instantiate(res, root_inode);
else
} else {
iput(root_inode);
}
}
return res;
}

@ -661,12 +661,21 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
return 0;
mnt = real_mount(bastard);
mnt_add_count(mnt, 1);
smp_mb(); // see mntput_no_expire()
if (likely(!read_seqretry(&mount_lock, seq)))
return 0;
if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
mnt_add_count(mnt, -1);
return 1;
}
lock_mount_hash();
if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
mnt_add_count(mnt, -1);
unlock_mount_hash();
return 1;
}
unlock_mount_hash();
/* caller will mntput() */
return -1;
}
@ -1213,12 +1222,27 @@ static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
static void mntput_no_expire(struct mount *mnt)
{
rcu_read_lock();
mnt_add_count(mnt, -1);
if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
if (likely(READ_ONCE(mnt->mnt_ns))) {
/*
* Since we don't do lock_mount_hash() here,
* ->mnt_ns can change under us. However, if it's
* non-NULL, then there's a reference that won't
* be dropped until after an RCU delay done after
* turning ->mnt_ns NULL. So if we observe it
* non-NULL under rcu_read_lock(), the reference
* we are dropping is not the final one.
*/
mnt_add_count(mnt, -1);
rcu_read_unlock();
return;
}
lock_mount_hash();
/*
* make sure that if __legitimize_mnt() has not seen us grab
* mount_lock, we'll see their refcount increment here.
*/
smp_mb();
mnt_add_count(mnt, -1);
if (mnt_get_count(mnt)) {
rcu_read_unlock();
unlock_mount_hash();

@ -1055,6 +1055,18 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
static inline void init_espfix_bsp(void) { }
#endif
#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
{
return true;
}
static inline bool arch_has_pfn_modify_check(void)
{
return false;
}
#endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */
#endif /* !__ASSEMBLY__ */
#ifndef io_remap_pfn_range

@ -17,6 +17,9 @@
*/
#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
#undef __no_sanitize_address
#define __no_sanitize_address __attribute__((no_sanitize("address")))
/* Clang doesn't have a way to turn it off per-function, yet. */
#ifdef __noretpoline
#undef __noretpoline

@ -30,7 +30,7 @@ struct cpu {
};
extern void boot_cpu_init(void);
extern void boot_cpu_state_init(void);
extern void boot_cpu_hotplug_init(void);
extern void cpu_init(void);
extern void trap_init(void);
@ -55,6 +55,8 @@ extern ssize_t cpu_show_spectre_v2(struct device *dev,
struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_spec_store_bypass(struct device *dev,
struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_l1tf(struct device *dev,
struct device_attribute *attr, char *buf);
extern __printf(4, 5)
struct device *cpu_device_create(struct device *parent, void *drvdata,
@ -176,4 +178,23 @@ void cpuhp_report_idle_dead(void);
static inline void cpuhp_report_idle_dead(void) { }
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
enum cpuhp_smt_control {
CPU_SMT_ENABLED,
CPU_SMT_DISABLED,
CPU_SMT_FORCE_DISABLED,
CPU_SMT_NOT_SUPPORTED,
};
#if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT)
extern enum cpuhp_smt_control cpu_smt_control;
extern void cpu_smt_disable(bool force);
extern void cpu_smt_check_topology_early(void);
extern void cpu_smt_check_topology(void);
#else
# define cpu_smt_control (CPU_SMT_ENABLED)
static inline void cpu_smt_disable(bool force) { }
static inline void cpu_smt_check_topology_early(void) { }
static inline void cpu_smt_check_topology(void) { }
#endif
#endif /* _LINUX_CPU_H_ */

@ -10,5 +10,7 @@ extern spinlock_t swap_lock;
extern struct plist_head swap_active_head;
extern struct swap_info_struct *swap_info[];
extern int try_to_unuse(unsigned int, bool, unsigned long);
extern unsigned long generic_max_swapfile_size(void);
extern unsigned long max_swapfile_size(void);
#endif /* _LINUX_SWAPFILE_H */

@ -452,6 +452,9 @@ struct scsi_host_template {
/* True if the controller does not support WRITE SAME */
unsigned no_write_same:1;
/* True if the low-level driver supports blk-mq only */
unsigned force_blk_mq:1;
/*
* Countdown for host blocking with no commands outstanding.
*/

@ -761,6 +761,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_TRACE_PAUSE __KVM_DEPRECATED_MAIN_0x07
#define KVM_TRACE_DISABLE __KVM_DEPRECATED_MAIN_0x08
#define KVM_GET_EMULATED_CPUID _IOWR(KVMIO, 0x09, struct kvm_cpuid2)
#define KVM_GET_MSR_FEATURE_INDEX_LIST _IOWR(KVMIO, 0x0a, struct kvm_msr_list)
/*
* Extension capability list.
@ -932,6 +933,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_HYPERV_SYNIC2 148
#define KVM_CAP_HYPERV_VP_INDEX 149
#define KVM_CAP_S390_BPB 152
#define KVM_CAP_GET_MSR_FEATURES 153
#ifdef KVM_CAP_IRQ_ROUTING

@ -543,8 +543,8 @@ asmlinkage __visible void __init start_kernel(void)
setup_command_line(command_line);
setup_nr_cpu_ids();
setup_per_cpu_areas();
boot_cpu_state_init();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
boot_cpu_hotplug_init();
build_all_zonelists(NULL);
page_alloc_init();

@ -60,6 +60,7 @@ struct cpuhp_cpu_state {
bool rollback;
bool single;
bool bringup;
bool booted_once;
struct hlist_node *node;
struct hlist_node *last;
enum cpuhp_state cb_state;
@ -346,6 +347,85 @@ void cpu_hotplug_enable(void)
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
#endif /* CONFIG_HOTPLUG_CPU */
#ifdef CONFIG_HOTPLUG_SMT
enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
EXPORT_SYMBOL_GPL(cpu_smt_control);
static bool cpu_smt_available __read_mostly;
void __init cpu_smt_disable(bool force)
{
if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
return;
if (force) {
pr_info("SMT: Force disabled\n");
cpu_smt_control = CPU_SMT_FORCE_DISABLED;
} else {
cpu_smt_control = CPU_SMT_DISABLED;
}
}
/*
* The decision whether SMT is supported can only be done after the full
* CPU identification. Called from architecture code before non boot CPUs
* are brought up.
*/
void __init cpu_smt_check_topology_early(void)
{
if (!topology_smt_supported())
cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
}
/*
* If SMT was disabled by BIOS, detect it here, after the CPUs have been
* brought online. This ensures the smt/l1tf sysfs entries are consistent
* with reality. cpu_smt_available is set to true during the bringup of non
* boot CPUs when a SMT sibling is detected. Note, this may overwrite
* cpu_smt_control's previous setting.
*/
void __init cpu_smt_check_topology(void)
{
if (!cpu_smt_available)
cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
}
static int __init smt_cmdline_disable(char *str)
{
cpu_smt_disable(str && !strcmp(str, "force"));
return 0;
}
early_param("nosmt", smt_cmdline_disable);
static inline bool cpu_smt_allowed(unsigned int cpu)
{
if (topology_is_primary_thread(cpu))
return true;
/*
* If the CPU is not a 'primary' thread and the booted_once bit is
* set then the processor has SMT support. Store this information
* for the late check of SMT support in cpu_smt_check_topology().
*/
if (per_cpu(cpuhp_state, cpu).booted_once)
cpu_smt_available = true;
if (cpu_smt_control == CPU_SMT_ENABLED)
return true;
/*
* On x86 it's required to boot all logical CPUs at least once so
* that the init code can get a chance to set CR4.MCE on each
* CPU. Otherwise, a broadacasted MCE observing CR4.MCE=0b on any
* core will shutdown the machine.
*/
return !per_cpu(cpuhp_state, cpu).booted_once;
}
#else
static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
#endif
static inline enum cpuhp_state
cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
@ -426,6 +506,16 @@ static int bringup_wait_for_ap(unsigned int cpu)
stop_machine_unpark(cpu);
kthread_unpark(st->thread);
/*
* SMT soft disabling on X86 requires to bring the CPU out of the
* BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The
* CPU marked itself as booted_once in cpu_notify_starting() so the
* cpu_smt_allowed() check will now return false if this is not the
* primary sibling.
*/
if (!cpu_smt_allowed(cpu))
return -ECANCELED;
if (st->target <= CPUHP_AP_ONLINE_IDLE)
return 0;
@ -758,7 +848,6 @@ static int takedown_cpu(unsigned int cpu)
/* Park the smpboot threads */
kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
smpboot_park_threads(cpu);
/*
* Prevent irq alloc/free while the dying cpu reorganizes the
@ -911,20 +1000,19 @@ out:
return ret;
}
static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
{
if (cpu_hotplug_disabled)
return -EBUSY;
return _cpu_down(cpu, 0, target);
}
static int do_cpu_down(unsigned int cpu, enum cpuhp_state target)
{
int err;
cpu_maps_update_begin();
if (cpu_hotplug_disabled) {
err = -EBUSY;
goto out;
}
err = _cpu_down(cpu, 0, target);
out:
err = cpu_down_maps_locked(cpu, target);
cpu_maps_update_done();
return err;
}
@ -953,6 +1041,7 @@ void notify_cpu_starting(unsigned int cpu)
int ret;
rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
st->booted_once = true;
while (st->state < target) {
st->state++;
ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
@ -1062,6 +1151,10 @@ static int do_cpu_up(unsigned int cpu, enum cpuhp_state target)
err = -EBUSY;
goto out;
}
if (!cpu_smt_allowed(cpu)) {
err = -EPERM;
goto out;
}
err = _cpu_up(cpu, 0, target);
out:
@ -1351,7 +1444,7 @@ static struct cpuhp_step cpuhp_ap_states[] = {
[CPUHP_AP_SMPBOOT_THREADS] = {
.name = "smpboot/threads:online",
.startup.single = smpboot_unpark_threads,
.teardown.single = NULL,
.teardown.single = smpboot_park_threads,
},
[CPUHP_AP_IRQ_AFFINITY_ONLINE] = {
.name = "irq/affinity:online",
@ -1925,10 +2018,172 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = {
NULL
};
#ifdef CONFIG_HOTPLUG_SMT
static const char *smt_states[] = {
[CPU_SMT_ENABLED] = "on",
[CPU_SMT_DISABLED] = "off",
[CPU_SMT_FORCE_DISABLED] = "forceoff",
[CPU_SMT_NOT_SUPPORTED] = "notsupported",
};
static ssize_t
show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
{
return snprintf(buf, PAGE_SIZE - 2, "%s\n", smt_states[cpu_smt_control]);
}
static void cpuhp_offline_cpu_device(unsigned int cpu)
{
struct device *dev = get_cpu_device(cpu);
dev->offline = true;
/* Tell user space about the state change */
kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
}
static void cpuhp_online_cpu_device(unsigned int cpu)
{
struct device *dev = get_cpu_device(cpu);
dev->offline = false;
/* Tell user space about the state change */
kobject_uevent(&dev->kobj, KOBJ_ONLINE);
}
static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
{
int cpu, ret = 0;
cpu_maps_update_begin();
for_each_online_cpu(cpu) {
if (topology_is_primary_thread(cpu))
continue;
ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
if (ret)
break;
/*
* As this needs to hold the cpu maps lock it's impossible
* to call device_offline() because that ends up calling
* cpu_down() which takes cpu maps lock. cpu maps lock
* needs to be held as this might race against in kernel
* abusers of the hotplug machinery (thermal management).
*
* So nothing would update device:offline state. That would
* leave the sysfs entry stale and prevent onlining after
* smt control has been changed to 'off' again. This is
* called under the sysfs hotplug lock, so it is properly
* serialized against the regular offline usage.
*/
cpuhp_offline_cpu_device(cpu);
}
if (!ret)
cpu_smt_control = ctrlval;
cpu_maps_update_done();
return ret;
}
static int cpuhp_smt_enable(void)
{
int cpu, ret = 0;
cpu_maps_update_begin();
cpu_smt_control = CPU_SMT_ENABLED;
for_each_present_cpu(cpu) {
/* Skip online CPUs and CPUs on offline nodes */
if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
continue;
ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
if (ret)
break;
/* See comment in cpuhp_smt_disable() */
cpuhp_online_cpu_device(cpu);
}
cpu_maps_update_done();
return ret;
}
static ssize_t
store_smt_control(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
int ctrlval, ret;
if (sysfs_streq(buf, "on"))
ctrlval = CPU_SMT_ENABLED;
else if (sysfs_streq(buf, "off"))
ctrlval = CPU_SMT_DISABLED;
else if (sysfs_streq(buf, "forceoff"))
ctrlval = CPU_SMT_FORCE_DISABLED;
else
return -EINVAL;
if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
return -EPERM;
if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
return -ENODEV;
ret = lock_device_hotplug_sysfs();
if (ret)
return ret;
if (ctrlval != cpu_smt_control) {
switch (ctrlval) {
case CPU_SMT_ENABLED:
ret = cpuhp_smt_enable();
break;
case CPU_SMT_DISABLED:
case CPU_SMT_FORCE_DISABLED:
ret = cpuhp_smt_disable(ctrlval);
break;
}
}
unlock_device_hotplug();
return ret ? ret : count;
}
static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control);
static ssize_t
show_smt_active(struct device *dev, struct device_attribute *attr, char *buf)
{
bool active = topology_max_smt_threads() > 1;
return snprintf(buf, PAGE_SIZE - 2, "%d\n", active);
}
static DEVICE_ATTR(active, 0444, show_smt_active, NULL);
static struct attribute *cpuhp_smt_attrs[] = {
&dev_attr_control.attr,
&dev_attr_active.attr,
NULL
};
static const struct attribute_group cpuhp_smt_attr_group = {
.attrs = cpuhp_smt_attrs,
.name = "smt",
NULL
};
static int __init cpu_smt_state_init(void)
{
return sysfs_create_group(&cpu_subsys.dev_root->kobj,
&cpuhp_smt_attr_group);
}
#else
static inline int cpu_smt_state_init(void) { return 0; }
#endif
static int __init cpuhp_sysfs_init(void)
{
int cpu, ret;
ret = cpu_smt_state_init();
if (ret)
return ret;
ret = sysfs_create_group(&cpu_subsys.dev_root->kobj,
&cpuhp_cpu_root_attr_group);
if (ret)
@ -2029,7 +2284,10 @@ void __init boot_cpu_init(void)
/*
* Must be called _AFTER_ setting up the per_cpu areas
*/
void __init boot_cpu_state_init(void)
void __init boot_cpu_hotplug_init(void)
{
per_cpu_ptr(&cpuhp_state, smp_processor_id())->state = CPUHP_ONLINE;
#ifdef CONFIG_SMP
this_cpu_write(cpuhp_state.booted_once, true);
#endif
this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
}

@ -5678,6 +5678,18 @@ int sched_cpu_activate(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
#ifdef CONFIG_SCHED_SMT
/*
* The sched_smt_present static key needs to be evaluated on every
* hotplug event because at boot time SMT might be disabled when
* the number of booted CPUs is limited.
*
* If then later a sibling gets hotplugged, then the key would stay
* off and SMT scheduling would never be functional.
*/
if (cpumask_weight(cpu_smt_mask(cpu)) > 1)
static_branch_enable_cpuslocked(&sched_smt_present);
#endif
set_cpu_active(cpu, true);
if (sched_smp_initialized) {
@ -5776,22 +5788,6 @@ int sched_cpu_dying(unsigned int cpu)
}
#endif
#ifdef CONFIG_SCHED_SMT
DEFINE_STATIC_KEY_FALSE(sched_smt_present);
static void sched_init_smt(void)
{
/*
* We've enumerated all CPUs and will assume that if any CPU
* has SMT siblings, CPU0 will too.
*/
if (cpumask_weight(cpu_smt_mask(0)) > 1)
static_branch_enable(&sched_smt_present);
}
#else
static inline void sched_init_smt(void) { }
#endif
void __init sched_init_smp(void)
{
cpumask_var_t non_isolated_cpus;
@ -5821,8 +5817,6 @@ void __init sched_init_smp(void)
init_sched_rt_class();
init_sched_dl_class();
sched_init_smt();
sched_smp_initialized = true;
}

@ -6905,6 +6905,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
}
#ifdef CONFIG_SCHED_SMT
DEFINE_STATIC_KEY_FALSE(sched_smt_present);
static inline void set_idle_cores(int cpu, int val)
{

@ -584,6 +584,8 @@ void __init smp_init(void)
num_nodes, (num_nodes > 1 ? "s" : ""),
num_cpus, (num_cpus > 1 ? "s" : ""));
/* Final decision about SMT support */
cpu_smt_check_topology();
/* Any cleanup work */
smp_cpus_done(setup_max_cpus);
}

@ -79,12 +79,16 @@ static void wakeup_softirqd(void)
/*
* If ksoftirqd is scheduled, we do not want to process pending softirqs
* right now. Let ksoftirqd handle this at its own rate, to get fairness.
* right now. Let ksoftirqd handle this at its own rate, to get fairness,
* unless we're doing some of the synchronous softirqs.
*/
static bool ksoftirqd_running(void)
#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ))
static bool ksoftirqd_running(unsigned long pending)
{
struct task_struct *tsk = __this_cpu_read(ksoftirqd);
if (pending & SOFTIRQ_NOW_MASK)
return false;
return tsk && (tsk->state == TASK_RUNNING);
}
@ -324,7 +328,7 @@ asmlinkage __visible void do_softirq(void)
pending = local_softirq_pending();
if (pending && !ksoftirqd_running())
if (pending && !ksoftirqd_running(pending))
do_softirq_own_stack();
local_irq_restore(flags);
@ -351,7 +355,7 @@ void irq_enter(void)
static inline void invoke_softirq(void)
{
if (ksoftirqd_running())
if (ksoftirqd_running(local_softirq_pending()))
return;
if (!force_irqthreads) {

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save