Skip to content

Commit d1e7b46

Browse files
committed
Merge tag 'kvm-x86-vmx-6.19' of https://github.com/kvm-x86/linux into HEAD
KVM VMX changes for 6.19: - Use the root role from kvm_mmu_page to construct EPTPs instead of the current vCPU state, partly as worthwhile cleanup, but mostly to pave the way for tracking per-root TLB flushes so that KVM can elide EPT flushes on pCPU migration if KVM has flushed the root at least once. - Add a few missing nested consistency checks. - Rip out support for doing "early" consistency checks via hardware as the functionality hasn't been used in years and is no longer useful in general, and replace it with an off-by-default module param to detect missed consistency checks (i.e. WARN if hardware finds a check that KVM does not). - Fix a currently-benign bug where KVM would drop the guest's SPEC_CTRL[63:32] on VM-Enter. - Misc cleanups.
2 parents de8e8eb + dfd1572 commit d1e7b46

File tree

8 files changed

+134
-169
lines changed

8 files changed

+134
-169
lines changed

arch/x86/kvm/mmu/mmu_internal.h

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -39,16 +39,6 @@
3939
#define INVALID_PAE_ROOT 0
4040
#define IS_VALID_PAE_ROOT(x) (!!(x))
4141

42-
static inline hpa_t kvm_mmu_get_dummy_root(void)
43-
{
44-
return my_zero_pfn(0) << PAGE_SHIFT;
45-
}
46-
47-
static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page)
48-
{
49-
return is_zero_pfn(shadow_page >> PAGE_SHIFT);
50-
}
51-
5242
typedef u64 __rcu *tdp_ptep_t;
5343

5444
struct kvm_mmu_page {

arch/x86/kvm/mmu/spte.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,16 @@ static inline int spte_index(u64 *sptep)
246246
*/
247247
extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
248248

249+
static inline hpa_t kvm_mmu_get_dummy_root(void)
250+
{
251+
return my_zero_pfn(0) << PAGE_SHIFT;
252+
}
253+
254+
static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page)
255+
{
256+
return is_zero_pfn(shadow_page >> PAGE_SHIFT);
257+
}
258+
249259
static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
250260
{
251261
struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT);

arch/x86/kvm/vmx/nested.c

Lines changed: 60 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@
2323
static bool __read_mostly enable_shadow_vmcs = 1;
2424
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
2525

26-
static bool __read_mostly nested_early_check = 0;
27-
module_param(nested_early_check, bool, S_IRUGO);
26+
static bool __ro_after_init warn_on_missed_cc;
27+
module_param(warn_on_missed_cc, bool, 0444);
2828

2929
#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
3030

@@ -555,6 +555,9 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
555555
if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
556556
return -EINVAL;
557557

558+
if (CC(!nested_cpu_has_vid(vmcs12) && vmcs12->tpr_threshold >> 4))
559+
return -EINVAL;
560+
558561
return 0;
559562
}
560563

@@ -761,7 +764,7 @@ static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
761764
vmcs12->vmcs_link_pointer, VMCS12_SIZE))
762765
return;
763766

764-
kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
767+
kvm_read_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu),
765768
VMCS12_SIZE);
766769
}
767770

@@ -780,7 +783,7 @@ static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
780783
vmcs12->vmcs_link_pointer, VMCS12_SIZE))
781784
return;
782785

783-
kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
786+
kvm_write_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu),
784787
VMCS12_SIZE);
785788
}
786789

@@ -2296,15 +2299,6 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
22962299
return;
22972300
vmx->nested.vmcs02_initialized = true;
22982301

2299-
/*
2300-
* We don't care what the EPTP value is we just need to guarantee
2301-
* it's valid so we don't get a false positive when doing early
2302-
* consistency checks.
2303-
*/
2304-
if (enable_ept && nested_early_check)
2305-
vmcs_write64(EPT_POINTER,
2306-
construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
2307-
23082302
if (vmx->ve_info)
23092303
vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info));
23102304

@@ -2749,7 +2743,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
27492743
vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
27502744
vcpu->arch.pat = vmcs12->guest_ia32_pat;
27512745
} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2752-
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
2746+
vmcs_write64(GUEST_IA32_PAT, vcpu->arch.pat);
27532747
}
27542748

27552749
vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
@@ -2961,6 +2955,10 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
29612955
}
29622956
}
29632957

2958+
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING) &&
2959+
CC(!vmcs12->tsc_multiplier))
2960+
return -EINVAL;
2961+
29642962
return 0;
29652963
}
29662964

@@ -3078,6 +3076,38 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
30783076
return 0;
30793077
}
30803078

3079+
static int nested_vmx_check_controls_late(struct kvm_vcpu *vcpu,
3080+
struct vmcs12 *vmcs12)
3081+
{
3082+
void *vapic = to_vmx(vcpu)->nested.virtual_apic_map.hva;
3083+
u32 vtpr = vapic ? (*(u32 *)(vapic + APIC_TASKPRI)) >> 4 : 0;
3084+
3085+
/*
3086+
* Don't bother with the consistency checks if KVM isn't configured to
3087+
* WARN on missed consistency checks, as KVM needs to rely on hardware
3088+
to fully detect an illegal vTPR vs. TPR Threshold combination due to
3089+
* the vTPR being writable by L1 at all times (it's an in-memory value,
3090+
* not a VMCS field). I.e. even if the check passes now, it might fail
3091+
* at the actual VM-Enter.
3092+
*
3093+
* Keying off the module param also allows treating an invalid vAPIC
3094+
* mapping as a consistency check failure without increasing the risk
3095+
* of breaking a "real" VM.
3096+
*/
3097+
if (!warn_on_missed_cc)
3098+
return 0;
3099+
3100+
if ((exec_controls_get(to_vmx(vcpu)) & CPU_BASED_TPR_SHADOW) &&
3101+
nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW) &&
3102+
!nested_cpu_has_vid(vmcs12) &&
3103+
!nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
3104+
(CC(!vapic) ||
3105+
CC((vmcs12->tpr_threshold & GENMASK(3, 0)) > (vtpr & GENMASK(3, 0)))))
3106+
return -EINVAL;
3107+
3108+
return 0;
3109+
}
3110+
30813111
static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
30823112
struct vmcs12 *vmcs12)
30833113
{
@@ -3333,84 +3363,6 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
33333363
return 0;
33343364
}
33353365

3336-
static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
3337-
{
3338-
struct vcpu_vmx *vmx = to_vmx(vcpu);
3339-
unsigned long cr3, cr4;
3340-
bool vm_fail;
3341-
3342-
if (!nested_early_check)
3343-
return 0;
3344-
3345-
if (vmx->msr_autoload.host.nr)
3346-
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
3347-
if (vmx->msr_autoload.guest.nr)
3348-
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
3349-
3350-
preempt_disable();
3351-
3352-
vmx_prepare_switch_to_guest(vcpu);
3353-
3354-
/*
3355-
* Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
3356-
* which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
3357-
* be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
3358-
* there is no need to preserve other bits or save/restore the field.
3359-
*/
3360-
vmcs_writel(GUEST_RFLAGS, 0);
3361-
3362-
cr3 = __get_current_cr3_fast();
3363-
if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
3364-
vmcs_writel(HOST_CR3, cr3);
3365-
vmx->loaded_vmcs->host_state.cr3 = cr3;
3366-
}
3367-
3368-
cr4 = cr4_read_shadow();
3369-
if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
3370-
vmcs_writel(HOST_CR4, cr4);
3371-
vmx->loaded_vmcs->host_state.cr4 = cr4;
3372-
}
3373-
3374-
vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
3375-
__vmx_vcpu_run_flags(vmx));
3376-
3377-
if (vmx->msr_autoload.host.nr)
3378-
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
3379-
if (vmx->msr_autoload.guest.nr)
3380-
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
3381-
3382-
if (vm_fail) {
3383-
u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
3384-
3385-
preempt_enable();
3386-
3387-
trace_kvm_nested_vmenter_failed(
3388-
"early hardware check VM-instruction error: ", error);
3389-
WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3390-
return 1;
3391-
}
3392-
3393-
/*
3394-
* VMExit clears RFLAGS.IF and DR7, even on a consistency check.
3395-
*/
3396-
if (hw_breakpoint_active())
3397-
set_debugreg(__this_cpu_read(cpu_dr7), 7);
3398-
local_irq_enable();
3399-
preempt_enable();
3400-
3401-
/*
3402-
* A non-failing VMEntry means we somehow entered guest mode with
3403-
* an illegal RIP, and that's just the tip of the iceberg. There
3404-
* is no telling what memory has been modified or what state has
3405-
* been exposed to unknown code. Hitting this all but guarantees
3406-
* a (very critical) hardware issue.
3407-
*/
3408-
WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
3409-
VMX_EXIT_REASONS_FAILED_VMENTRY));
3410-
3411-
return 0;
3412-
}
3413-
34143366
#ifdef CONFIG_KVM_HYPERV
34153367
static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
34163368
{
@@ -3667,22 +3619,18 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
36673619
&vmx->nested.pre_vmenter_ssp_tbl);
36683620

36693621
/*
3670-
* Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
3671-
* nested early checks are disabled. In the event of a "late" VM-Fail,
3672-
* i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
3673-
* software model to the pre-VMEntry host state. When EPT is disabled,
3674-
* GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
3675-
* nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing
3676-
* vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
3677-
* the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested
3678-
* VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
3679-
* guaranteed to be overwritten with a shadow CR3 prior to re-entering
3680-
* L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
3681-
* KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
3682-
* pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
3683-
* path would need to manually save/restore vmcs01.GUEST_CR3.
3622+
* Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled. In the
3623+
* event of a "late" VM-Fail, i.e. a VM-Fail detected by hardware but
3624+
* not KVM, KVM must unwind its software model to the pre-VM-Entry host
3625+
* state. When EPT is disabled, GUEST_CR3 holds KVM's shadow CR3, not
3626+
* L1's "real" CR3, which causes nested_vmx_restore_host_state() to
3627+
* corrupt vcpu->arch.cr3. Stuffing vmcs01.GUEST_CR3 results in the
3628+
* unwind naturally setting arch.cr3 to the correct value. Smashing
3629+
* vmcs01.GUEST_CR3 is safe because nested VM-Exits, and the unwind,
3630+
* reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is guaranteed to be
3631+
* overwritten with a shadow CR3 prior to re-entering L1.
36843632
*/
3685-
if (!enable_ept && !nested_early_check)
3633+
if (!enable_ept)
36863634
vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
36873635

36883636
vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
@@ -3695,7 +3643,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
36953643
return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
36963644
}
36973645

3698-
if (nested_vmx_check_vmentry_hw(vcpu)) {
3646+
if (nested_vmx_check_controls_late(vcpu, vmcs12)) {
36993647
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
37003648
return NVMX_VMENTRY_VMFAIL;
37013649
}
@@ -5164,12 +5112,13 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
51645112
/*
51655113
* The only expected VM-instruction error is "VM entry with
51665114
* invalid control field(s)." Anything else indicates a
5167-
* problem with L0. And we should never get here with a
5168-
* VMFail of any type if early consistency checks are enabled.
5115+
* problem with L0.
51695116
*/
51705117
WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
51715118
VMXERR_ENTRY_INVALID_CONTROL_FIELD);
5172-
WARN_ON_ONCE(nested_early_check);
5119+
5120+
/* VM-Fail at VM-Entry means KVM missed a consistency check. */
5121+
WARN_ON_ONCE(warn_on_missed_cc);
51735122
}
51745123

51755124
/*

arch/x86/kvm/vmx/tdx.c

Lines changed: 10 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2706,9 +2706,11 @@ DEFINE_CLASS(tdx_vm_state_guard, tdx_vm_state_guard_t,
27062706

27072707
static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
27082708
{
2709+
struct kvm_tdx_init_vm __user *user_data = u64_to_user_ptr(cmd->data);
27092710
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
27102711
struct kvm_tdx_init_vm *init_vm;
27112712
struct td_params *td_params = NULL;
2713+
u32 nr_user_entries;
27122714
int ret;
27132715

27142716
BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
@@ -2720,28 +2722,16 @@ static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
27202722
if (cmd->flags)
27212723
return -EINVAL;
27222724

2723-
init_vm = kmalloc(sizeof(*init_vm) +
2724-
sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
2725-
GFP_KERNEL);
2726-
if (!init_vm)
2727-
return -ENOMEM;
2728-
2729-
if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
2730-
ret = -EFAULT;
2731-
goto out;
2732-
}
2725+
if (get_user(nr_user_entries, &user_data->cpuid.nent))
2726+
return -EFAULT;
27332727

2734-
if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
2735-
ret = -E2BIG;
2736-
goto out;
2737-
}
2728+
if (nr_user_entries > KVM_MAX_CPUID_ENTRIES)
2729+
return -E2BIG;
27382730

2739-
if (copy_from_user(init_vm->cpuid.entries,
2740-
u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
2741-
flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
2742-
ret = -EFAULT;
2743-
goto out;
2744-
}
2731+
init_vm = memdup_user(user_data,
2732+
struct_size(user_data, cpuid.entries, nr_user_entries));
2733+
if (IS_ERR(init_vm))
2734+
return PTR_ERR(init_vm);
27452735

27462736
if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
27472737
ret = -EINVAL;

arch/x86/kvm/vmx/vmenter.S

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -116,13 +116,23 @@ SYM_FUNC_START(__vmx_vcpu_run)
116116
* and vmentry.
117117
*/
118118
mov 2*WORD_SIZE(%_ASM_SP), %_ASM_DI
119-
movl VMX_spec_ctrl(%_ASM_DI), %edi
120-
movl PER_CPU_VAR(x86_spec_ctrl_current), %esi
121-
cmp %edi, %esi
119+
#ifdef CONFIG_X86_64
120+
mov VMX_spec_ctrl(%rdi), %rdx
121+
cmp PER_CPU_VAR(x86_spec_ctrl_current), %rdx
122+
je .Lspec_ctrl_done
123+
movl %edx, %eax
124+
shr $32, %rdx
125+
#else
126+
mov VMX_spec_ctrl(%edi), %eax
127+
mov PER_CPU_VAR(x86_spec_ctrl_current), %ecx
128+
xor %eax, %ecx
129+
mov VMX_spec_ctrl + 4(%edi), %edx
130+
mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %edi
131+
xor %edx, %edi
132+
or %edi, %ecx
122133
je .Lspec_ctrl_done
134+
#endif
123135
mov $MSR_IA32_SPEC_CTRL, %ecx
124-
xor %edx, %edx
125-
mov %edi, %eax
126136
wrmsr
127137

128138
.Lspec_ctrl_done:

0 commit comments

Comments
 (0)