Skip to content

Commit 9e03b7c

Browse files
committed
Merge tag 'kvm-x86-misc-6.20' of https://github.com/kvm-x86/linux into HEAD
KVM x86 misc changes for 6.20 - Disallow changing the virtual CPU model if L2 is active, for all the same reasons KVM disallows changing the model after the first KVM_RUN. - Fix a bug where KVM would incorrectly reject host accesses to PV MSRs that were advertised as supported to userspace when running with KVM_CAP_ENFORCE_PV_FEATURE_CPUID enabled. - Fix a bug where KVM would attempt to read protected guest state (CR3) when configuring an async #PF entry. - Fail the build if EXPORT_SYMBOL_GPL or EXPORT_SYMBOL is used in KVM (for x86 only) to enforce usage of EXPORT_SYMBOL_FOR_KVM_INTERNAL. Explicitly allow the few exports that are intended for external usage. - Ignore -EBUSY when checking nested events after a vCPU exits blocking as the WARN is user-triggerable, and because exiting to userspace on -EBUSY does more harm than good in pretty much every situation. - Throw in the towel and drop the WARN on INIT/SIPI being blocked when vCPU is in Wait-For-SIPI, as playing whack-a-mole with syzkaller turned out to be an unwinnable game. - Add support for new Intel instructions that don't require anything beyond enumerating feature flags to userspace. - Grab SRCU when reading PDPTRs in KVM_GET_SREGS2. - Add WARNs to guard against modifying KVM's CPU caps outside of the intended setup flow, as nested VMX in particular is sensitive to unexpected changes in KVM's golden configuration. - Add a quirk to allow userspace to opt-in to actually suppress EOI broadcasts when the suppression feature is enabled by the guest (currently limited to split IRQCHIP, i.e. userspace I/O APIC). Sadly, simply fixing KVM to honor Suppress EOI Broadcasts isn't an option as some userspaces have come to rely on KVM's buggy behavior (KVM advertises Suppress EOI Broadcast irrespective of whether or not userspace I/O APIC supports Directed EOIs). - Minor cleanups.
2 parents 4215ee0 + 6517dfb commit 9e03b7c

File tree

17 files changed

+326
-69
lines changed

17 files changed

+326
-69
lines changed

Documentation/virt/kvm/api.rst

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7908,8 +7908,10 @@ Will return -EBUSY if a VCPU has already been created.
79087908

79097909
Valid feature flags in args[0] are::
79107910

7911-
#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
7912-
#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)
7911+
#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
7912+
#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)
7913+
#define KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST (1ULL << 2)
7914+
#define KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST (1ULL << 3)
79137915

79147916
Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of
79157917
KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC,
@@ -7922,6 +7924,28 @@ as a broadcast even in x2APIC mode in order to support physical x2APIC
79227924
without interrupt remapping. This is undesirable in logical mode,
79237925
where 0xff represents CPUs 0-7 in cluster 0.
79247926

7927+
Setting KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST instructs KVM to enable
7928+
Suppress EOI Broadcasts. KVM will advertise support for Suppress EOI
7929+
Broadcast to the guest and suppress LAPIC EOI broadcasts when the guest
7930+
sets the Suppress EOI Broadcast bit in the SPIV register. This flag is
7931+
supported only when using a split IRQCHIP.
7932+
7933+
Setting KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST disables support for
7934+
Suppress EOI Broadcasts entirely, i.e. instructs KVM to NOT advertise
7935+
support to the guest.
7936+
7937+
Modern VMMs should set either KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST
7938+
or KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST. If not, legacy quirky
7939+
behavior will be used by KVM: in split IRQCHIP mode, KVM will advertise
7940+
support for Suppress EOI Broadcasts but not actually suppress EOI
7941+
broadcasts; for in-kernel IRQCHIP mode, KVM will not advertise support for
7942+
Suppress EOI Broadcasts.
7943+
7944+
Setting both KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST and
7945+
KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST will fail with an EINVAL error,
7946+
as will setting KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST without a split
7947+
IRQCHIP.
7948+
79257949
7.8 KVM_CAP_S390_USER_INSTR0
79267950
----------------------------
79277951

arch/x86/include/asm/cpufeatures.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,7 @@
326326
#define X86_FEATURE_AMX_FP16 (12*32+21) /* AMX fp16 Support */
327327
#define X86_FEATURE_AVX_IFMA (12*32+23) /* Support for VPMADD52[H,L]UQ */
328328
#define X86_FEATURE_LAM (12*32+26) /* "lam" Linear Address Masking */
329+
#define X86_FEATURE_MOVRS (12*32+31) /* MOVRS instructions */
329330

330331
/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
331332
#define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */

arch/x86/include/asm/kvm_host.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -784,6 +784,8 @@ enum kvm_only_cpuid_leafs {
784784
CPUID_24_0_EBX,
785785
CPUID_8000_0021_ECX,
786786
CPUID_7_1_ECX,
787+
CPUID_1E_1_EAX,
788+
CPUID_24_1_ECX,
787789
NR_KVM_CPU_CAPS,
788790

789791
NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
@@ -1234,6 +1236,12 @@ enum kvm_irqchip_mode {
12341236
KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */
12351237
};
12361238

1239+
enum kvm_suppress_eoi_broadcast_mode {
1240+
KVM_SUPPRESS_EOI_BROADCAST_QUIRKED, /* Legacy behavior */
1241+
KVM_SUPPRESS_EOI_BROADCAST_ENABLED, /* Enable Suppress EOI broadcast */
1242+
KVM_SUPPRESS_EOI_BROADCAST_DISABLED /* Disable Suppress EOI broadcast */
1243+
};
1244+
12371245
struct kvm_x86_msr_filter {
12381246
u8 count;
12391247
bool default_allow:1;
@@ -1483,6 +1491,7 @@ struct kvm_arch {
14831491

14841492
bool x2apic_format;
14851493
bool x2apic_broadcast_quirk_disabled;
1494+
enum kvm_suppress_eoi_broadcast_mode suppress_eoi_broadcast_mode;
14861495

14871496
bool has_mapped_host_mmio;
14881497
bool guest_can_read_msr_platform_info;

arch/x86/include/uapi/asm/kvm.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -916,8 +916,10 @@ struct kvm_sev_snp_launch_finish {
916916
__u64 pad1[4];
917917
};
918918

919-
#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
920-
#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)
919+
#define KVM_X2APIC_API_USE_32BIT_IDS _BITULL(0)
920+
#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK _BITULL(1)
921+
#define KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST _BITULL(2)
922+
#define KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST _BITULL(3)
921923

922924
struct kvm_hyperv_eventfd {
923925
__u32 conn_id;

arch/x86/kvm/Makefile

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,52 @@ $(obj)/kvm-asm-offsets.h: $(obj)/kvm-asm-offsets.s FORCE
4747

4848
targets += kvm-asm-offsets.s
4949
clean-files += kvm-asm-offsets.h
50+
51+
52+
# Fail the build if there is unexpected EXPORT_SYMBOL_GPL (or EXPORT_SYMBOL)
53+
# usage. All KVM-internal exports should use EXPORT_SYMBOL_FOR_KVM_INTERNAL.
54+
# Only a handful of exports intended for other modules (VFIO, KVMGT) should
55+
# use EXPORT_SYMBOL_GPL, and EXPORT_SYMBOL should never be used.
56+
ifdef CONFIG_KVM_X86
57+
# Search recursively for whole words and print line numbers. Filter out the
58+
# allowed set of exports, i.e. those that are intended for external usage.
59+
exports_grep_trailer := --include='*.[ch]' -nrw $(srctree)/virt/kvm $(srctree)/arch/x86/kvm | \
60+
grep -v -e kvm_page_track_register_notifier \
61+
-e kvm_page_track_unregister_notifier \
62+
-e kvm_write_track_add_gfn \
63+
-e kvm_write_track_remove_gfn \
64+
-e kvm_get_kvm \
65+
-e kvm_get_kvm_safe \
66+
-e kvm_put_kvm
67+
68+
# Force grep to emit a goofy group separator that can in turn be replaced with
69+
# the above newline macro (newlines in Make are a nightmare). Note, grep only
70+
# prints the group separator when N lines of context are requested via -C,
71+
# a.k.a. --NUM. Simply request zero lines. Print the separator only after
72+
# filtering out expected exports to avoid extra newlines in the error message.
73+
define get_kvm_exports
74+
$(shell grep "$(1)" -C0 $(exports_grep_trailer) | grep "$(1)" -C0 --group-separator="!SEP!")
75+
endef
76+
77+
define check_kvm_exports
78+
nr_kvm_exports := $(shell grep "$(1)" $(exports_grep_trailer) | wc -l)
79+
80+
ifneq (0,$$(nr_kvm_exports))
81+
$$(error ERROR ***\
82+
$$(newline)found $$(nr_kvm_exports) unwanted occurrences of $(1):\
83+
$$(newline) $(subst !SEP!,$$(newline) ,$(call get_kvm_exports,$(1)))\
84+
$$(newline)in directories:\
85+
$$(newline) $(srctree)/arch/x86/kvm\
86+
$$(newline) $(srctree)/virt/kvm\
87+
$$(newline)Use EXPORT_SYMBOL_FOR_KVM_INTERNAL, not $(1))
88+
endif # nr_kvm_exports != 0
89+
undefine nr_kvm_exports
90+
endef # check_kvm_exports
91+
92+
$(eval $(call check_kvm_exports,EXPORT_SYMBOL_GPL))
93+
$(eval $(call check_kvm_exports,EXPORT_SYMBOL))
94+
95+
undefine check_kvm_exports
96+
undefine get_kvm_exports
97+
undefine exports_grep_trailer
98+
endif # CONFIG_KVM_X86

arch/x86/kvm/cpuid.c

Lines changed: 63 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@
3636
u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
3737
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_caps);
3838

39+
bool kvm_is_configuring_cpu_caps __read_mostly;
40+
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_configuring_cpu_caps);
41+
3942
struct cpuid_xstate_sizes {
4043
u32 eax;
4144
u32 ebx;
@@ -534,17 +537,20 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
534537
BUILD_BUG_ON(sizeof(vcpu_caps) != sizeof(vcpu->arch.cpu_caps));
535538

536539
/*
537-
* KVM does not correctly handle changing guest CPUID after KVM_RUN, as
538-
* MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
539-
* tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
540-
* faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
541-
* the core vCPU model on the fly. It would've been better to forbid any
542-
* KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately
543-
* some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do
540+
* KVM does not correctly handle changing guest CPUID after KVM_RUN or
541+
* while L2 is active, as MAXPHYADDR, GBPAGES support, AMD reserved bit
542+
* behavior, etc. aren't tracked in kvm_mmu_page_role, and L2 state
543+
* can't be adjusted (without breaking L2 in some way). As a result,
544+
* KVM may reuse SPs/SPTEs and/or run L2 with bad/misconfigured state.
545+
*
546+
* In practice, no sane VMM mucks with the core vCPU model on the fly.
547+
* It would've been better to forbid any KVM_SET_CPUID{,2} calls after
548+
* KVM_RUN or KVM_SET_NESTED_STATE altogether, but unfortunately some
549+
* VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do
544550
* KVM_SET_CPUID{,2} again. To support this legacy behavior, check
545551
* whether the supplied CPUID data is equal to what's already set.
546552
*/
547-
if (kvm_vcpu_has_run(vcpu)) {
553+
if (!kvm_can_set_cpuid_and_feature_msrs(vcpu)) {
548554
r = kvm_cpuid_check_equal(vcpu, e2, nent);
549555
if (r)
550556
goto err;
@@ -823,10 +829,13 @@ do { \
823829
/* DS is defined by ptrace-abi.h on 32-bit builds. */
824830
#undef DS
825831

826-
void kvm_set_cpu_caps(void)
832+
void kvm_initialize_cpu_caps(void)
827833
{
828834
memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps));
829835

836+
WARN_ON_ONCE(kvm_is_configuring_cpu_caps);
837+
kvm_is_configuring_cpu_caps = true;
838+
830839
BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) >
831840
sizeof(boot_cpu_data.x86_capability));
832841

@@ -1025,6 +1034,7 @@ void kvm_set_cpu_caps(void)
10251034
F(AMX_FP16),
10261035
F(AVX_IFMA),
10271036
F(LAM),
1037+
F(MOVRS),
10281038
);
10291039

10301040
kvm_cpu_cap_init(CPUID_7_1_ECX,
@@ -1063,12 +1073,27 @@ void kvm_set_cpu_caps(void)
10631073
SCATTERED_F(SGX_EDECCSSA),
10641074
);
10651075

1076+
kvm_cpu_cap_init(CPUID_1E_1_EAX,
1077+
F(AMX_INT8_ALIAS),
1078+
F(AMX_BF16_ALIAS),
1079+
F(AMX_COMPLEX_ALIAS),
1080+
F(AMX_FP16_ALIAS),
1081+
F(AMX_FP8),
1082+
F(AMX_TF32),
1083+
F(AMX_AVX512),
1084+
F(AMX_MOVRS),
1085+
);
1086+
10661087
kvm_cpu_cap_init(CPUID_24_0_EBX,
10671088
F(AVX10_128),
10681089
F(AVX10_256),
10691090
F(AVX10_512),
10701091
);
10711092

1093+
kvm_cpu_cap_init(CPUID_24_1_ECX,
1094+
F(AVX10_VNNI_INT),
1095+
);
1096+
10721097
kvm_cpu_cap_init(CPUID_8000_0001_ECX,
10731098
F(LAHF_LM),
10741099
F(CMP_LEGACY),
@@ -1270,7 +1295,7 @@ void kvm_set_cpu_caps(void)
12701295
kvm_cpu_cap_clear(X86_FEATURE_RDPID);
12711296
}
12721297
}
1273-
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cpu_caps);
1298+
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_initialize_cpu_caps);
12741299

12751300
#undef F
12761301
#undef SCATTERED_F
@@ -1624,6 +1649,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
16241649
entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
16251650
break;
16261651
}
1652+
1653+
max_idx = entry->eax = min(entry->eax, 1u);
1654+
1655+
/* KVM only supports up to 0x1e.0x1, capped above via min(). */
1656+
if (max_idx >= 1) {
1657+
entry = do_host_cpuid(array, function, 1);
1658+
if (!entry)
1659+
goto out;
1660+
1661+
cpuid_entry_override(entry, CPUID_1E_1_EAX);
1662+
entry->ebx = 0;
1663+
entry->ecx = 0;
1664+
entry->edx = 0;
1665+
}
16271666
break;
16281667
case 0x24: {
16291668
u8 avx10_version;
@@ -1633,18 +1672,30 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
16331672
break;
16341673
}
16351674

1675+
max_idx = entry->eax = min(entry->eax, 1u);
16361676
/*
16371677
* The AVX10 version is encoded in EBX[7:0]. Note, the version
16381678
* is guaranteed to be >=1 if AVX10 is supported. Note #2, the
16391679
* version needs to be captured before overriding EBX features!
16401680
*/
1641-
avx10_version = min_t(u8, entry->ebx & 0xff, 1);
1681+
avx10_version = min_t(u8, entry->ebx & 0xff, 2);
16421682
cpuid_entry_override(entry, CPUID_24_0_EBX);
16431683
entry->ebx |= avx10_version;
16441684

1645-
entry->eax = 0;
16461685
entry->ecx = 0;
16471686
entry->edx = 0;
1687+
1688+
/* KVM only supports up to 0x24.0x1, capped above via min(). */
1689+
if (max_idx >= 1) {
1690+
entry = do_host_cpuid(array, function, 1);
1691+
if (!entry)
1692+
goto out;
1693+
1694+
cpuid_entry_override(entry, CPUID_24_1_ECX);
1695+
entry->eax = 0;
1696+
entry->ebx = 0;
1697+
entry->edx = 0;
1698+
}
16481699
break;
16491700
}
16501701
case KVM_CPUID_SIGNATURE: {

arch/x86/kvm/cpuid.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,15 @@
88
#include <uapi/asm/kvm_para.h>
99

1010
extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
11-
void kvm_set_cpu_caps(void);
11+
extern bool kvm_is_configuring_cpu_caps __read_mostly;
12+
13+
void kvm_initialize_cpu_caps(void);
14+
15+
static inline void kvm_finalize_cpu_caps(void)
16+
{
17+
WARN_ON_ONCE(!kvm_is_configuring_cpu_caps);
18+
kvm_is_configuring_cpu_caps = false;
19+
}
1220

1321
void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
1422
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(struct kvm_cpuid_entry2 *entries,
@@ -188,13 +196,15 @@ static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
188196
{
189197
unsigned int x86_leaf = __feature_leaf(x86_feature);
190198

199+
WARN_ON_ONCE(!kvm_is_configuring_cpu_caps);
191200
kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature);
192201
}
193202

194203
static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
195204
{
196205
unsigned int x86_leaf = __feature_leaf(x86_feature);
197206

207+
WARN_ON_ONCE(!kvm_is_configuring_cpu_caps);
198208
kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature);
199209
}
200210

arch/x86/kvm/ioapic.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -561,7 +561,7 @@ static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu,
561561
spin_lock(&ioapic->lock);
562562

563563
if (trigger_mode != IOAPIC_LEVEL_TRIG ||
564-
kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)
564+
kvm_lapic_suppress_eoi_broadcast(apic))
565565
return;
566566

567567
ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);

0 commit comments

Comments
 (0)