/* Expose VMCS shadowing to L1; read-only (S_IRUGO) module parameter. */
2323static bool __read_mostly enable_shadow_vmcs = 1 ;
2424module_param_named (enable_shadow_vmcs , enable_shadow_vmcs , bool , S_IRUGO );
2525
/*
 * This patch drops the "nested_early_check" knob (which triggered a throwaway
 * hardware VM-Enter to catch missed consistency checks) and replaces it with
 * "warn_on_missed_cc": a read-only (0444 == S_IRUGO), __ro_after_init flag
 * that enables extra software checks / WARNs instead of the early VM-Enter.
 */
26- static bool __read_mostly nested_early_check = 0 ;
27- module_param (nested_early_check , bool , S_IRUGO );
26+ static bool __ro_after_init warn_on_missed_cc ;
27+ module_param (warn_on_missed_cc , bool , 0444 );
2828
/* Shorthand used below; presumably traces/flags a failed VM-Enter consistency check. */
2929#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
3030
@@ -555,6 +555,9 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
555555 if (CC (!page_address_valid (vcpu , vmcs12 -> virtual_apic_page_addr )))
556556 return - EINVAL ;
557557
558+ if (CC (!nested_cpu_has_vid (vmcs12 ) && vmcs12 -> tpr_threshold >> 4 ))
559+ return - EINVAL ;
560+
558561 return 0 ;
559562}
560563
@@ -761,7 +764,7 @@ static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
761764 vmcs12 -> vmcs_link_pointer , VMCS12_SIZE ))
762765 return ;
763766
764- kvm_read_guest_cached (vmx -> vcpu . kvm , ghc , get_shadow_vmcs12 (vcpu ),
767+ kvm_read_guest_cached (vcpu -> kvm , ghc , get_shadow_vmcs12 (vcpu ),
765768 VMCS12_SIZE );
766769}
767770
@@ -780,7 +783,7 @@ static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
780783 vmcs12 -> vmcs_link_pointer , VMCS12_SIZE ))
781784 return ;
782785
783- kvm_write_guest_cached (vmx -> vcpu . kvm , ghc , get_shadow_vmcs12 (vcpu ),
786+ kvm_write_guest_cached (vcpu -> kvm , ghc , get_shadow_vmcs12 (vcpu ),
784787 VMCS12_SIZE );
785788}
786789
@@ -2296,15 +2299,6 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
22962299 return ;
22972300 vmx -> nested .vmcs02_initialized = true;
22982301
2299- /*
2300- * We don't care what the EPTP value is we just need to guarantee
2301- * it's valid so we don't get a false positive when doing early
2302- * consistency checks.
2303- */
2304- if (enable_ept && nested_early_check )
2305- vmcs_write64 (EPT_POINTER ,
2306- construct_eptp (& vmx -> vcpu , 0 , PT64_ROOT_4LEVEL ));
2307-
23082302 if (vmx -> ve_info )
23092303 vmcs_write64 (VE_INFORMATION_ADDRESS , __pa (vmx -> ve_info ));
23102304
@@ -2749,7 +2743,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
27492743 vmcs_write64 (GUEST_IA32_PAT , vmcs12 -> guest_ia32_pat );
27502744 vcpu -> arch .pat = vmcs12 -> guest_ia32_pat ;
27512745 } else if (vmcs_config .vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT ) {
2752- vmcs_write64 (GUEST_IA32_PAT , vmx -> vcpu . arch .pat );
2746+ vmcs_write64 (GUEST_IA32_PAT , vcpu -> arch .pat );
27532747 }
27542748
27552749 vcpu -> arch .tsc_offset = kvm_calc_nested_tsc_offset (
@@ -2961,6 +2955,10 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
29612955 }
29622956 }
29632957
2958+ if (nested_cpu_has2 (vmcs12 , SECONDARY_EXEC_TSC_SCALING ) &&
2959+ CC (!vmcs12 -> tsc_multiplier ))
2960+ return - EINVAL ;
2961+
29642962 return 0 ;
29652963}
29662964
@@ -3078,6 +3076,38 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
30783076 return 0 ;
30793077}
30803078
/*
 * Added by this patch: "late" software consistency check, run immediately
 * before the real VM-Enter of vmcs02 (replacing the removed
 * nested_vmx_check_vmentry_hw()).  Validates the vTPR vs. TPR-threshold
 * combination that hardware checks at VM-Entry.  Returns 0 on success (or
 * when checking is disabled), -EINVAL on a detected inconsistency.
 * NOTE(review): relies on nested.virtual_apic_map.hva being populated by an
 * earlier mapping step — confirm against the caller.
 */
3079+ static int nested_vmx_check_controls_late (struct kvm_vcpu * vcpu ,
3080+ struct vmcs12 * vmcs12 )
3081+ {
3082+ void * vapic = to_vmx (vcpu )-> nested .virtual_apic_map .hva ;
/* vTPR[7:4] from the in-memory virtual-APIC page; 0 if the page isn't mapped. */
3083+ u32 vtpr = vapic ? (* (u32 * )(vapic + APIC_TASKPRI )) >> 4 : 0 ;
3084+
3085+ /*
3086+ * Don't bother with the consistency checks if KVM isn't configured to
3087+ * WARN on missed consistency checks, as KVM needs to rely on hardware
3088+ * to fully detect an illegal vTPR vs. TRP Threshold combination due to
3089+ * the vTPR being writable by L1 at all times (it's an in-memory value,
3090+ * not a VMCS field). I.e. even if the check passes now, it might fail
3091+ * at the actual VM-Enter.
3092+ *
3093+ * Keying off the module param also allows treating an invalid vAPIC
3094+ * mapping as a consistency check failure without increasing the risk
3095+ * of breaking a "real" VM.
3096+ */
3097+ if (!warn_on_missed_cc )
3098+ return 0 ;
3099+
/*
 * TPR shadow must be active in both vmcs01 and vmcs12, with neither
 * virtual-interrupt delivery nor APIC-access virtualization enabled,
 * for the TPR-threshold <= vTPR[7:4] check to apply.  A missing vAPIC
 * mapping is also treated as a failure here (see comment above).
 */
3100+ if ((exec_controls_get (to_vmx (vcpu )) & CPU_BASED_TPR_SHADOW ) &&
3101+ nested_cpu_has (vmcs12 , CPU_BASED_TPR_SHADOW ) &&
3102+ !nested_cpu_has_vid (vmcs12 ) &&
3103+ !nested_cpu_has2 (vmcs12 , SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES ) &&
3104+ (CC (!vapic ) ||
3105+ CC ((vmcs12 -> tpr_threshold & GENMASK (3 , 0 )) > (vtpr & GENMASK (3 , 0 )))))
3106+ return - EINVAL ;
3107+
3108+ return 0 ;
3109+ }
3110+
30813111static int nested_vmx_check_address_space_size (struct kvm_vcpu * vcpu ,
30823112 struct vmcs12 * vmcs12 )
30833113{
@@ -3333,84 +3363,6 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
33333363 return 0 ;
33343364}
33353365
/*
 * Removed by this patch (its caller now uses nested_vmx_check_controls_late()
 * instead).  This function performed an "early" throwaway VM-Enter of vmcs02
 * with GUEST_RFLAGS deliberately invalidated, so that hardware would report
 * any VM-Fail (invalid control field) before KVM committed its software state
 * to the real nested VM-Enter.  Returned 0 on success, 1 on hardware-detected
 * VM-Fail.
 */
3336- static int nested_vmx_check_vmentry_hw (struct kvm_vcpu * vcpu )
3337- {
3338- struct vcpu_vmx * vmx = to_vmx (vcpu );
3339- unsigned long cr3 , cr4 ;
3340- bool vm_fail ;
3341-
/* Opt-in only: skip entirely unless the (now removed) module param was set. */
3342- if (!nested_early_check )
3343- return 0 ;
3344-
/* Suppress MSR autoload lists for the throwaway entry; restored below. */
3345- if (vmx -> msr_autoload .host .nr )
3346- vmcs_write32 (VM_EXIT_MSR_LOAD_COUNT , 0 );
3347- if (vmx -> msr_autoload .guest .nr )
3348- vmcs_write32 (VM_ENTRY_MSR_LOAD_COUNT , 0 );
3349-
3350- preempt_disable ();
3351-
3352- vmx_prepare_switch_to_guest (vcpu );
3353-
3354- /*
3355- * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
3356- * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
3357- * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
3358- * there is no need to preserve other bits or save/restore the field.
3359- */
3360- vmcs_writel (GUEST_RFLAGS , 0 );
3361-
/* Sync HOST_CR3/HOST_CR4 with the current hardware values if stale. */
3362- cr3 = __get_current_cr3_fast ();
3363- if (unlikely (cr3 != vmx -> loaded_vmcs -> host_state .cr3 )) {
3364- vmcs_writel (HOST_CR3 , cr3 );
3365- vmx -> loaded_vmcs -> host_state .cr3 = cr3 ;
3366- }
3367-
3368- cr4 = cr4_read_shadow ();
3369- if (unlikely (cr4 != vmx -> loaded_vmcs -> host_state .cr4 )) {
3370- vmcs_writel (HOST_CR4 , cr4 );
3371- vmx -> loaded_vmcs -> host_state .cr4 = cr4 ;
3372- }
3373-
/* The throwaway VM-Enter itself; vm_fail is set on a hardware VM-Fail. */
3374- vm_fail = __vmx_vcpu_run (vmx , (unsigned long * )& vcpu -> arch .regs ,
3375- __vmx_vcpu_run_flags (vmx ));
3376-
/* Restore the MSR autoload counts suppressed above. */
3377- if (vmx -> msr_autoload .host .nr )
3378- vmcs_write32 (VM_EXIT_MSR_LOAD_COUNT , vmx -> msr_autoload .host .nr );
3379- if (vmx -> msr_autoload .guest .nr )
3380- vmcs_write32 (VM_ENTRY_MSR_LOAD_COUNT , vmx -> msr_autoload .guest .nr );
3381-
3382- if (vm_fail ) {
3383- u32 error = vmcs_read32 (VM_INSTRUCTION_ERROR );
3384-
3385- preempt_enable ();
3386-
3387- trace_kvm_nested_vmenter_failed (
3388- "early hardware check VM-instruction error: " , error );
/* The only expected failure mode is an invalid control field. */
3389- WARN_ON_ONCE (error != VMXERR_ENTRY_INVALID_CONTROL_FIELD );
3390- return 1 ;
3391- }
3392-
3393- /*
3394- * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
3395- */
3396- if (hw_breakpoint_active ())
3397- set_debugreg (__this_cpu_read (cpu_dr7 ), 7 );
3398- local_irq_enable ();
3399- preempt_enable ();
3400-
3401- /*
3402- * A non-failing VMEntry means we somehow entered guest mode with
3403- * an illegal RIP, and that's just the tip of the iceberg. There
3404- * is no telling what memory has been modified or what state has
3405- * been exposed to unknown code. Hitting this all but guarantees
3406- * a (very critical) hardware issue.
3407- */
3408- WARN_ON (!(vmcs_read32 (VM_EXIT_REASON ) &
3409- VMX_EXIT_REASONS_FAILED_VMENTRY ));
3410-
3411- return 0 ;
3412- }
3413-
34143366#ifdef CONFIG_KVM_HYPERV
34153367static bool nested_get_evmcs_page (struct kvm_vcpu * vcpu )
34163368{
@@ -3667,22 +3619,18 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
36673619 & vmx -> nested .pre_vmenter_ssp_tbl );
36683620
36693621 /*
3670- * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
3671- * nested early checks are disabled. In the event of a "late" VM-Fail,
3672- * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
3673- * software model to the pre-VMEntry host state. When EPT is disabled,
3674- * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
3675- * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing
3676- * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
3677- * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested
3678- * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
3679- * guaranteed to be overwritten with a shadow CR3 prior to re-entering
3680- * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
3681- * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
3682- * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
3683- * path would need to manually save/restore vmcs01.GUEST_CR3.
3622+ * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled. In the
3623+ * event of a "late" VM-Fail, i.e. a VM-Fail detected by hardware but
3624+ * not KVM, KVM must unwind its software model to the pre-VM-Entry host
3625+ * state. When EPT is disabled, GUEST_CR3 holds KVM's shadow CR3, not
3626+ * L1's "real" CR3, which causes nested_vmx_restore_host_state() to
3627+ * corrupt vcpu->arch.cr3. Stuffing vmcs01.GUEST_CR3 results in the
3628+ * unwind naturally setting arch.cr3 to the correct value. Smashing
3629+ * vmcs01.GUEST_CR3 is safe because nested VM-Exits, and the unwind,
3630+ * reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is guaranteed to be
3631+ * overwritten with a shadow CR3 prior to re-entering L1.
36843632 */
3685- if (!enable_ept && ! nested_early_check )
3633+ if (!enable_ept )
36863634 vmcs_writel (GUEST_CR3 , vcpu -> arch .cr3 );
36873635
36883636 vmx_switch_vmcs (vcpu , & vmx -> nested .vmcs02 );
@@ -3695,7 +3643,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
36953643 return NVMX_VMENTRY_KVM_INTERNAL_ERROR ;
36963644 }
36973645
3698- if (nested_vmx_check_vmentry_hw (vcpu )) {
3646+ if (nested_vmx_check_controls_late (vcpu , vmcs12 )) {
36993647 vmx_switch_vmcs (vcpu , & vmx -> vmcs01 );
37003648 return NVMX_VMENTRY_VMFAIL ;
37013649 }
@@ -5164,12 +5112,13 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
51645112 /*
51655113 * The only expected VM-instruction error is "VM entry with
51665114 * invalid control field(s)." Anything else indicates a
5167- * problem with L0. And we should never get here with a
5168- * VMFail of any type if early consistency checks are enabled.
5115+ * problem with L0.
51695116 */
51705117 WARN_ON_ONCE (vmcs_read32 (VM_INSTRUCTION_ERROR ) !=
51715118 VMXERR_ENTRY_INVALID_CONTROL_FIELD );
5172- WARN_ON_ONCE (nested_early_check );
5119+
5120+ /* VM-Fail at VM-Entry means KVM missed a consistency check. */
5121+ WARN_ON_ONCE (warn_on_missed_cc );
51735122 }
51745123
51755124 /*
0 commit comments