kvm: vmx: hook preemption timer support 65/17065/1
authorYunhong Jiang <yunhong.jiang@intel.com>
Mon, 13 Jun 2016 21:19:59 +0000 (14:19 -0700)
committerYunhong Jiang <yunhong.jiang@linux.intel.com>
Mon, 18 Jul 2016 15:07:29 +0000 (08:07 -0700)
Hook the VMX preemption timer to the "hv timer" functionality added
by the previous patch.  This includes: checking if the feature is
supported, if the feature is broken on the CPU, the hooks to
setup/clean the VMX preemption timer, arming the timer on vmentry
and handling the vmexit.

A module parameter states if the VMX preemption timer should be
utilized.

Signed-off-by: Yunhong Jiang <yunhong.jiang@intel.com>
[Move hv_deadline_tsc to struct vcpu_vmx, use -1 as the "unset" value.
 Put all VMX bits here.  Enable it by default #yolo. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Change-Id: Icb8e0b853eedce3d52c394e510fa14d2cdd432e9
upstream-status: backport
Signed-off-by: Yunhong Jiang <yunhong.jiang@linux.intel.com>
kernel/arch/x86/include/asm/kvm_host.h
kernel/arch/x86/kvm/vmx.c
kernel/arch/x86/kvm/x86.c

index 21637c9..fe68e83 100644 (file)
@@ -983,6 +983,8 @@ extern u32  kvm_max_guest_tsc_khz;
 extern u8   kvm_tsc_scaling_ratio_frac_bits;
 /* maximum allowed value of TSC scaling ratio */
 extern u64  kvm_max_tsc_scaling_ratio;
+/* 1ull << kvm_tsc_scaling_ratio_frac_bits */
+extern u64  kvm_default_tsc_scaling_ratio;
 
 enum emulation_result {
        EMULATE_DONE,         /* no further processing */
index 0732111..e378975 100644 (file)
@@ -109,6 +109,13 @@ module_param_named(pml, enable_pml, bool, S_IRUGO);
 
 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
 
+/* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
+static int __read_mostly cpu_preemption_timer_multi;
+static bool __read_mostly enable_preemption_timer = 1;
+#ifdef CONFIG_X86_64
+module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
+#endif
+
 #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
 #define KVM_VM_CR0_ALWAYS_ON                                           \
@@ -596,6 +603,9 @@ struct vcpu_vmx {
 #define PML_ENTITY_NUM         512
        struct page *pml_pg;
 
+       /* apic deadline value in host tsc */
+       u64 hv_deadline_tsc;
+
        u64 current_tsc_ratio;
 };
 
@@ -1043,6 +1053,61 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
 }
 
+/*
+ * Comment's format: document - errata name - stepping - processor name.
+ * Refer from
+ * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
+ */
+static u32 vmx_preemption_cpu_tfms[] = {
+/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
+0x000206E6,
+/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
+/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
+/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020652,
+/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020655,
+/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
+/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
+/*
+ * 320767.pdf - AAP86  - B1 -
+ * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
+ */
+0x000106E5,
+/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
+0x000106A0,
+/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
+0x000106A1,
+/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
+0x000106A4,
+ /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
+ /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
+ /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
+0x000106A5,
+};
+
+static inline bool cpu_has_broken_vmx_preemption_timer(void)
+{
+       u32 eax = cpuid_eax(0x00000001), i;
+
+       /* Clear the reserved bits */
+       eax &= ~(0x3U << 14 | 0xfU << 28);
+       for (i = 0; i < sizeof(vmx_preemption_cpu_tfms)/sizeof(u32); i++)
+               if (eax == vmx_preemption_cpu_tfms[i])
+                       return true;
+
+       return false;
+}
+
+static inline bool cpu_has_vmx_preemption_timer(void)
+{
+       if (cpu_has_broken_vmx_preemption_timer())
+               return false;
+
+       return vmcs_config.pin_based_exec_ctrl &
+               PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
 static inline bool cpu_has_vmx_posted_intr(void)
 {
        return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
@@ -3209,7 +3274,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                return -EIO;
 
        min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
-       opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
+       opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
+                PIN_BASED_VMX_PREEMPTION_TIMER;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
                                &_pin_based_exec_control) < 0)
                return -EIO;
@@ -4683,6 +4749,8 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
 
        if (!vmx_cpu_uses_apicv(&vmx->vcpu))
                pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+       /* Enable the preemption timer dynamically */
+       pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
        return pin_based_exec_ctrl;
 }
 
@@ -4781,6 +4849,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
        /* Control */
        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+       vmx->hv_deadline_tsc = -1;
 
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
 
@@ -6292,6 +6361,17 @@ static __init int hardware_setup(void)
                kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
        }
 
+       if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
+               u64 vmx_msr;
+
+               rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+               cpu_preemption_timer_multi =
+                        vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+       } else {
+               kvm_x86_ops->set_hv_timer = NULL;
+               kvm_x86_ops->cancel_hv_timer = NULL;
+       }
+
        kvm_set_posted_intr_wakeup_handler(wakeup_handler);
 
        return alloc_kvm_area();
@@ -7460,6 +7540,12 @@ static int handle_pcommit(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+       kvm_lapic_expired_hv_timer(vcpu);
+       return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -7511,6 +7597,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_XRSTORS]                 = handle_xrstors,
        [EXIT_REASON_PML_FULL]                = handle_pml_full,
        [EXIT_REASON_PCOMMIT]                 = handle_pcommit,
+       [EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -8510,6 +8597,26 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
                                        msrs[i].host);
 }
 
+void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u64 tscl;
+       u32 delta_tsc;
+
+       if (vmx->hv_deadline_tsc == -1)
+               return;
+
+       tscl = rdtsc();
+       if (vmx->hv_deadline_tsc > tscl)
+               /* sure to be 32 bit only because checked on set_hv_timer */
+               delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
+                       cpu_preemption_timer_multi);
+       else
+               delta_tsc = 0;
+
+       vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+}
+
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8556,6 +8663,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        atomic_switch_perf_msrs(vmx);
        debugctlmsr = get_debugctlmsr();
 
+       vmx_arm_hv_timer(vcpu);
+
        vmx->__launched = vmx->loaded_vmcs->launched;
        asm(
                /* Store host registers */
@@ -10532,6 +10641,64 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
        return X86EMUL_CONTINUE;
 }
 
+#ifdef CONFIG_X86_64
+/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
+static inline int u64_shl_div_u64(u64 a, unsigned int shift,
+                                 u64 divisor, u64 *result)
+{
+       u64 low = a << shift, high = a >> (64 - shift);
+
+       /* To avoid the overflow on divq */
+       if (high >= divisor)
+               return 1;
+
+       /* Low hold the result, high hold rem which is discarded */
+       asm("divq %2\n\t" : "=a" (low), "=d" (high) :
+           "rm" (divisor), "0" (low), "1" (high));
+       *result = low;
+
+       return 0;
+}
+
+static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u64 tscl = rdtsc(), delta_tsc;
+
+       delta_tsc = guest_deadline_tsc - kvm_read_l1_tsc(vcpu, tscl);
+
+       /* Convert to host delta tsc if tsc scaling is enabled */
+       if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
+                       u64_shl_div_u64(delta_tsc,
+                               kvm_tsc_scaling_ratio_frac_bits,
+                               vcpu->arch.tsc_scaling_ratio,
+                               &delta_tsc))
+               return -ERANGE;
+
+       /*
+        * If the delta tsc can't fit in the 32 bit after the multi shift,
+        * we can't use the preemption timer.
+        * It's possible that it fits on later vmentries, but checking
+        * on every vmentry is costly so we just use an hrtimer.
+        */
+       if (delta_tsc >> (cpu_preemption_timer_multi + 32))
+               return -ERANGE;
+
+       vmx->hv_deadline_tsc = tscl + delta_tsc;
+       vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+                       PIN_BASED_VMX_PREEMPTION_TIMER);
+       return 0;
+}
+
+static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       vmx->hv_deadline_tsc = -1;
+       vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+                       PIN_BASED_VMX_PREEMPTION_TIMER);
+}
+#endif
+
 static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
        if (ple_gap)
@@ -10647,6 +10814,9 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
        if (pi_pre_block(vcpu))
                return 1;
 
+       if (kvm_lapic_hv_timer_in_use(vcpu))
+               kvm_lapic_switch_to_sw_timer(vcpu);
+
        return 0;
 }
 
@@ -10693,6 +10863,9 @@ static void pi_post_block(struct kvm_vcpu *vcpu)
 
 static void vmx_post_block(struct kvm_vcpu *vcpu)
 {
+       if (kvm_x86_ops->set_hv_timer)
+               kvm_lapic_switch_to_hv_timer(vcpu);
+
        pi_post_block(vcpu);
 }
 
@@ -10891,6 +11064,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .pmu_ops = &intel_pmu_ops,
 
        .update_pi_irte = vmx_update_pi_irte,
+
+#ifdef CONFIG_X86_64
+       .set_hv_timer = vmx_set_hv_timer,
+       .cancel_hv_timer = vmx_cancel_hv_timer,
+#endif
 };
 
 static int __init vmx_init(void)
index e3e1a8c..c7695ce 100644 (file)
@@ -113,7 +113,8 @@ u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
 EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
 u64  __read_mostly kvm_max_tsc_scaling_ratio;
 EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
-static u64 __read_mostly kvm_default_tsc_scaling_ratio;
+u64 __read_mostly kvm_default_tsc_scaling_ratio;
+EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
 
 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
 static u32 __read_mostly tsc_tolerance_ppm = 250;