Merge "Restore the non_hz_full"
author Yunhong Jiang <yunhong.jiang@intel.com>
Fri, 22 Jul 2016 22:01:15 +0000 (22:01 +0000)
committer Gerrit Code Review <gerrit@172.30.200.206>
Fri, 22 Jul 2016 22:01:15 +0000 (22:01 +0000)
14 files changed:
ci/build_qemu_rpm_deb/mkcontrol.sh [new file with mode: 0755]
ci/build_qemu_rpm_deb/mkspec [new file with mode: 0755]
ci/build_qemu_rpm_deb/mkversion [new file with mode: 0755]
ci/build_qemu_rpm_deb/qemu_build.sh [new file with mode: 0644]
ci/build_qemu_rpm_deb/qemu_deb_build.sh [new file with mode: 0755]
ci/build_qemu_rpm_deb/qemu_rpm_build.sh [new file with mode: 0755]
fuel-plugin/vagrant/build_fuel_plugin.sh
kernel/arch/x86/configs/opnfv.config
kernel/arch/x86/include/asm/kvm_host.h
kernel/arch/x86/kvm/lapic.c
kernel/arch/x86/kvm/lapic.h
kernel/arch/x86/kvm/trace.h
kernel/arch/x86/kvm/vmx.c
kernel/arch/x86/kvm/x86.c

diff --git a/ci/build_qemu_rpm_deb/mkcontrol.sh b/ci/build_qemu_rpm_deb/mkcontrol.sh
new file mode 100755 (executable)
index 0000000..7eb504a
--- /dev/null
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Print a minimal Debian control file for the qemu package on stdout.
+# $1 - version string placed in the Version field.
+cat <<EOF
+Package: qemu
+Version: $1
+Section: base
+Priority: optional
+Architecture: all
+Maintainer: Intel
+Description: control file for qemu debian build on centos
+EOF
diff --git a/ci/build_qemu_rpm_deb/mkspec b/ci/build_qemu_rpm_deb/mkspec
new file mode 100755 (executable)
index 0000000..0b75a18
--- /dev/null
@@ -0,0 +1,43 @@
+#!/bin/sh
+#
+#      Output a simple RPM spec file for qemu on stdout.
+#
+#      $1 - qemu release string; every '-' is replaced by '_' before it is
+#           used in the Version and Source fields.
+#
+#      The Release field is produced by running the mkversion helper from
+#      the current directory through sudo.
+#
+
+QEMURELEASE=$1
+
+__QEMURELEASE=$(echo $QEMURELEASE | sed -e "s/-/_/g")
+
+# starting to output the spec
+# (no variable named srctree is set by this script, so it is not echoed
+# into the spec preamble)
+echo "Name: qemu"
+echo "Summary: The Linux qemu"
+echo "Version: $__QEMURELEASE"
+# we need to determine the NEXT version number
+# rpm -q will agree
+echo "Release: $(sudo sh mkversion)"
+echo "License: GPLv2+ and LGPLv2+ and BSD"
+echo "Group: Development/Tools"
+echo "Vendor: The Linux Community"
+echo "URL: http://www.qemu.org"
+echo "Source: qemu-$__QEMURELEASE.tar.gz"
+echo "%description"
+echo "%prep"
+echo "     "
+echo "%setup -q"
+echo "         "
+echo "%build"
+echo "%_configure"
+echo "make"
+echo "%install"
+echo "rm -rf %{buildroot}"
+echo "make install DESTDIR=%{buildroot}"
+echo "%clean"
+echo "rm -rf %{buildroot}"
+echo "%files"
+echo "%dir"
+echo "/usr/local/share/qemu"
+echo "%doc"
+echo "/usr/local/bin/ivshmem*"
+echo "/usr/local/bin/qemu*"
+echo "/usr/local/libexec/qemu-bridge-helper"
+echo "%changelog"
diff --git a/ci/build_qemu_rpm_deb/mkversion b/ci/build_qemu_rpm_deb/mkversion
new file mode 100755 (executable)
index 0000000..fa4e585
--- /dev/null
@@ -0,0 +1,10 @@
+# Print the next package release number and persist it in ./.version.
+#
+# When the counter file exists, the stored value plus one is printed and
+# written back.  Otherwise the file is created, made world-accessible,
+# seeded with 1, and 1 is printed.
+if [ -f .version ]
+then
+    expr 0$(cat .version) + 1
+    expr 0$(cat .version) + 1 > .version
+else
+    touch .version
+    sudo chmod 777 .version
+    echo 1 > .version
+    echo 1
+fi
diff --git a/ci/build_qemu_rpm_deb/qemu_build.sh b/ci/build_qemu_rpm_deb/qemu_build.sh
new file mode 100644 (file)
index 0000000..a8863c3
--- /dev/null
@@ -0,0 +1,33 @@
+#!/bin/bash
+# Common setup for the qemu rpm/deb build scripts, which source this file.
+#
+# Usage: <script> output_dir
+#
+# Defines the directory layout variables, validates the output directory
+# argument and the working directory, and then reads the qemu version from
+# ${qemu_src_dir}/config-host.mak into VERSION.
+qemu_src_dir=qemu
+workspace=/root
+debbuild_dir=$workspace/debbuild
+rpmbuild_dir=$workspace/rpmbuild
+artifact_rpms=$rpmbuild_dir/RPMS
+artifact_dir=$artifact_rpms/x86_64
+scripts_dir=ci/build_qemu_rpm_deb
+output_dir="$1"
+
+# Print the usage line and terminate the calling script.
+usage () {
+    echo "usage: ${0} output_dir"
+    exit 1
+}
+
+if [[ -z "${output_dir}" ]]; then
+    usage
+fi
+
+# Expansions are quoted so a path containing whitespace is tested as one word.
+if [ ! -d "${output_dir}" ] || [ ! -w "${output_dir}" ] ; then
+    echo "${0}: Output directory '${output_dir}' does not exist or cannot be written"
+    exit 1
+fi
+
+if [ ! -d "${qemu_src_dir}" ] ; then
+    echo "${0}: Directory '${qemu_src_dir}' does not exist, run this script from the root of kvmfornfv source tree"
+    exit 1
+fi
+
+# Read the version only after the source tree is known to exist, so a wrong
+# working directory produces the message above instead of a grep error.
+VERSION=`grep -m 1 "VERSION"  "${qemu_src_dir}/config-host.mak" | cut -d= -f2-`
+
+echo
+echo "Build"
+echo
diff --git a/ci/build_qemu_rpm_deb/qemu_deb_build.sh b/ci/build_qemu_rpm_deb/qemu_deb_build.sh
new file mode 100755 (executable)
index 0000000..7a83018
--- /dev/null
@@ -0,0 +1,30 @@
+#!/bin/bash
+#Build process for generating qemu debian file.
+
+source ci/build_qemu_rpm_deb/qemu_build.sh
+# Stage the qemu tree under $debbuild_dir/qemu-$VERSION, generate its
+# DEBIAN/control file with mkcontrol.sh and run dpkg-deb on the staged tree.
+# Terminates the script with status 1 when dpkg-deb fails.
+qemu_deb_build() {
+    sudo mkdir -p $debbuild_dir/qemu-$VERSION
+    sudo cp -r $qemu_src_dir $debbuild_dir/qemu-$VERSION
+    sudo mkdir -p $debbuild_dir/qemu-$VERSION/DEBIAN
+
+#creating control file for debian build.
+    # The redirection creates $scripts_dir/control itself; no file needs to be
+    # pre-created in the current directory.
+    (cd ${scripts_dir}; sudo ./mkcontrol.sh $VERSION > control)
+    sudo mv $scripts_dir/control $debbuild_dir/qemu-$VERSION/DEBIAN/control
+
+#building the qemu debian with control file developed.
+    sudo dpkg-deb --build $debbuild_dir/qemu-$VERSION
+    if [ ${?} -ne 0 ] ; then
+        echo "${0}: qemu build failed"
+        exit 1
+    fi
+}
+
+# Create the staging area on first use.
+[ -d ${debbuild_dir} ] || {
+    echo "creating debbuild directory"
+    sudo mkdir -p $debbuild_dir
+}
+
+# Build, then copy the most recently modified entry of the staging area
+# (as sorted by ls -rt) to build_output.
+qemu_deb_build
+latest_qemu_build=$(sudo ls -rt $debbuild_dir | tail -1)
+sudo cp $debbuild_dir/$latest_qemu_build build_output
diff --git a/ci/build_qemu_rpm_deb/qemu_rpm_build.sh b/ci/build_qemu_rpm_deb/qemu_rpm_build.sh
new file mode 100755 (executable)
index 0000000..a52ee0f
--- /dev/null
@@ -0,0 +1,32 @@
+#!/bin/bash
+#Build process for Generating qemu rpm.
+
+source ci/build_qemu_rpm_deb/qemu_build.sh
+# Pack the qemu tree as qemu-$VERSION.tar.gz into the rpmbuild SOURCES
+# directory, generate qemu.spec with mkspec and build the rpm from it.
+# Terminates the script with status 1 when rpmbuild fails; the temporary
+# source copy and the tarball are removed only after a successful build.
+qemu_rpm_build() {
+    local src_copy=${qemu_src_dir}-$VERSION
+    local tarball=${src_copy}.tar.gz
+
+    sudo cp  -r ${qemu_src_dir}  $src_copy
+    sudo tar -zcvf $tarball $src_copy
+    sudo mv $tarball ${rpmbuild_dir}/SOURCES/
+
+#create a spec file for rpm creation.
+    (cd ${scripts_dir}; ./mkspec $VERSION > qemu.spec)
+    sudo cp ${scripts_dir}/qemu.spec ${rpmbuild_dir}/SPECS/
+
+#build the qemu rpm with spec file developed
+    if ! sudo rpmbuild -ba ${rpmbuild_dir}/SPECS/qemu.spec ; then
+        echo "${0}: qemu build failed"
+        exit 1
+    fi
+
+    sudo rm -rf $src_copy
+    sudo rm -rf ${rpmbuild_dir}/SOURCES/$tarball
+}
+
+# First run: install rpmbuild and create its directory tree.  The tree is
+# created directly at $rpmbuild_dir (the path every later step uses) instead
+# of under ~ followed by a move that depended on the current directory.
+if [ ! -d ${rpmbuild_dir} ] ; then
+    sudo yum install rpm-build -y
+    sudo mkdir -p ${rpmbuild_dir}/{BUILD,RPMS,SOURCES,SPECS,SRPMS}
+fi
+
+qemu_rpm_build
+# The pattern is quoted so the shell cannot expand it against file names in
+# the current directory before grep sees it.
+latest_qemu_build=`ls -rt $artifact_dir | grep 'qemu-2.6*' | tail -1`
+sudo cp $artifact_dir/$latest_qemu_build build_output
index b18ae17..e315b11 100755 (executable)
@@ -1,13 +1,8 @@
 #!/bin/bash
 sudo apt-get update -y
-sudo apt-get install createrepo rpm dpkg-dev -y
-sudo apt-get install python-setuptools -y
-sudo apt-get install python-pip -y
-sudo easy_install pip
-sudo pip install fuel-plugin-builder
-sudo apt-get install ruby -y
-sudo gem install rubygems-update
+sudo apt-get install -y ruby-dev rubygems-integration python-pip rpm createrepo dpkg-dev
 sudo gem install fpm
+sudo pip install fuel-plugin-builder
 sudo apt-get install docker.io -y
 cd /home/vagrant
 # Will build fuel-plugin-kvm in guest VM local directory, not change host
@@ -15,5 +10,4 @@ cp -r /kvmfornfv .
 cd kvmfornfv/fuel-plugin
 fpb --debug --build .
 # Copy the built fuel-plugin-kvm back to the host
-rm /kvmfornfv/fuel-plugin/fuel-plugin-kvm*.rpm
-cp fuel-plugin-kvm*.rpm /kvmfornfv/fuel-plugin/.
+cp *.rpm /vagrant
index 2d6d1cc..b623b0c 100644 (file)
@@ -885,7 +885,7 @@ CONFIG_NETFILTER_XTABLES=m
 # Xtables combined modules
 #
 CONFIG_NETFILTER_XT_MARK=m
-# CONFIG_NETFILTER_XT_CONNMARK is not set
+CONFIG_NETFILTER_XT_CONNMARK=m
 CONFIG_NETFILTER_XT_SET=m
 
 #
@@ -894,7 +894,7 @@ CONFIG_NETFILTER_XT_SET=m
 CONFIG_NETFILTER_XT_TARGET_AUDIT=m
 CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
 # CONFIG_NETFILTER_XT_TARGET_CLASSIFY is not set
-# CONFIG_NETFILTER_XT_TARGET_CONNMARK is not set
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=m
 CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m
 CONFIG_NETFILTER_XT_TARGET_CT=m
 # CONFIG_NETFILTER_XT_TARGET_DSCP is not set
@@ -929,7 +929,7 @@ CONFIG_NETFILTER_XT_MATCH_COMMENT=m
 # CONFIG_NETFILTER_XT_MATCH_CONNBYTES is not set
 # CONFIG_NETFILTER_XT_MATCH_CONNLABEL is not set
 # CONFIG_NETFILTER_XT_MATCH_CONNLIMIT is not set
-# CONFIG_NETFILTER_XT_MATCH_CONNMARK is not set
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
 CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
 # CONFIG_NETFILTER_XT_MATCH_CPU is not set
 # CONFIG_NETFILTER_XT_MATCH_DCCP is not set
@@ -1227,6 +1227,7 @@ CONFIG_NET_CLS_ACT=y
 # CONFIG_NET_ACT_CSUM is not set
 # CONFIG_NET_ACT_VLAN is not set
 # CONFIG_NET_ACT_BPF is not set
+# CONFIG_NET_ACT_CONNMARK is not set
 CONFIG_NET_SCH_FIFO=y
 # CONFIG_DCB is not set
 CONFIG_DNS_RESOLVER=y
index 30cfd64..fe68e83 100644 (file)
@@ -911,6 +911,9 @@ struct kvm_x86_ops {
        void (*post_block)(struct kvm_vcpu *vcpu);
        int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
                              uint32_t guest_irq, bool set);
+
+       int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc);
+       void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_arch_async_pf {
@@ -980,6 +983,8 @@ extern u32  kvm_max_guest_tsc_khz;
 extern u8   kvm_tsc_scaling_ratio_frac_bits;
 /* maximum allowed value of TSC scaling ratio */
 extern u64  kvm_max_tsc_scaling_ratio;
+/* 1ull << kvm_tsc_scaling_ratio_frac_bits */
+extern u64  kvm_default_tsc_scaling_ratio;
 
 enum emulation_result {
        EMULATE_DONE,         /* no further processing */
index 20d9e9f..3d1b170 100644 (file)
@@ -1258,6 +1258,108 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
                __delay(tsc_deadline - guest_tsc);
 }
 
+/*
+ * Arm the software (hrtimer based) TSC-deadline timer for @apic.
+ *
+ * The guest TSC ticks remaining until the deadline are converted to
+ * nanoseconds using the vCPU's virtual TSC frequency, and the hrtimer is
+ * programmed lapic_timer_advance_ns earlier than that point.  If the
+ * deadline is not in the future, apic_timer_expired() is called right away.
+ * Nothing is done when no deadline is set or the virtual TSC frequency is 0.
+ */
+static void start_sw_tscdeadline(struct kvm_lapic *apic)
+{
+       u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
+       u64 ns = 0;
+       ktime_t expire;
+       struct kvm_vcpu *vcpu = apic->vcpu;
+       unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+       unsigned long flags;
+       ktime_t now;
+
+       if (unlikely(!tscdeadline || !this_tsc_khz))
+               return;
+
+       /*
+        * Interrupts are disabled across both clock reads and the timer
+        * start -- presumably so the two samples stay close together.
+        */
+       local_irq_save(flags);
+
+       now = apic->lapic_timer.timer.base->get_time();
+       guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+       if (likely(tscdeadline > guest_tsc)) {
+               /* ticks * 10^6 / kHz gives nanoseconds */
+               ns = (tscdeadline - guest_tsc) * 1000000ULL;
+               do_div(ns, this_tsc_khz);
+               expire = ktime_add_ns(now, ns);
+               expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
+               hrtimer_start(&apic->lapic_timer.timer,
+                               expire, HRTIMER_MODE_ABS_PINNED);
+       } else
+               apic_timer_expired(apic);
+
+       local_irq_restore(flags);
+}
+
+/*
+ * Report whether the hv timer (the timer armed through
+ * kvm_x86_ops->set_hv_timer) currently backs @vcpu's TSC-deadline timer.
+ *
+ * kvm_arch_vcpu_load() and vmx_pre_block() call this without checking for an
+ * in-kernel local APIC first, and vcpu->arch.apic may be NULL when the local
+ * APIC is not emulated in the kernel.  Such a vCPU never uses the hv timer,
+ * so return false instead of dereferencing the NULL pointer.
+ */
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       return apic && apic->lapic_timer.hv_timer_in_use;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
+
+/*
+ * Disarm the hv timer of @apic's vCPU through ->cancel_hv_timer() and record
+ * that it is no longer in use.  ->cancel_hv_timer is called unconditionally,
+ * so callers are expected to reach this only when the hook is set.
+ */
+static void cancel_hv_tscdeadline(struct kvm_lapic *apic)
+{
+       kvm_x86_ops->cancel_hv_timer(apic->vcpu);
+       apic->lapic_timer.hv_timer_in_use = false;
+}
+
+/*
+ * Handle expiry of the hv timer for @vcpu: disarm the hv timer and report
+ * the expiry through the common apic_timer_expired() path.
+ */
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       /* An expiry is only expected while the hv timer is marked in use ... */
+       WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+       /* ... and while nothing is waiting on the vCPU's wait queue */
+       WARN_ON(swait_active(&vcpu->wq));
+       cancel_hv_tscdeadline(apic);
+       apic_timer_expired(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
+
+/*
+ * Try to back the TSC-deadline timer of @apic with the hv timer.
+ *
+ * The hv timer is not used (and is cancelled if it was marked in use) when
+ * a timer interrupt is already pending or when ->set_hv_timer() returns
+ * non-zero.  On success the hrtimer is cancelled; if a timer interrupt
+ * became pending in the meantime the hv timer is cancelled again.
+ *
+ * ->set_hv_timer is called unconditionally, so callers are expected to have
+ * checked that the hook is set.
+ *
+ * Returns the resulting hv_timer_in_use state, which is also traced.
+ */
+static bool start_hv_tscdeadline(struct kvm_lapic *apic)
+{
+       u64 tscdeadline = apic->lapic_timer.tscdeadline;
+
+       if (atomic_read(&apic->lapic_timer.pending) ||
+               kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
+               if (apic->lapic_timer.hv_timer_in_use)
+                       cancel_hv_tscdeadline(apic);
+       } else {
+               apic->lapic_timer.hv_timer_in_use = true;
+               hrtimer_cancel(&apic->lapic_timer.timer);
+
+               /* In case the sw timer triggered in the window */
+               if (atomic_read(&apic->lapic_timer.pending))
+                       cancel_hv_tscdeadline(apic);
+       }
+       trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
+                       apic->lapic_timer.hv_timer_in_use);
+       return apic->lapic_timer.hv_timer_in_use;
+}
+
+/*
+ * Attempt to move @vcpu's timer onto the hv timer.  Only done when the LVT
+ * timer is in TSC-deadline mode; warns if the hv timer is already marked in
+ * use on entry.
+ */
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       WARN_ON(apic->lapic_timer.hv_timer_in_use);
+
+       if (apic_lvtt_tscdeadline(apic))
+               start_hv_tscdeadline(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
+
+/*
+ * Move @vcpu's TSC-deadline timer from the hv timer back to the hrtimer.
+ * Does nothing when the hv timer is not in use.  After cancelling the hv
+ * timer, the hrtimer is started only if no timer interrupt is pending.
+ */
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       /* Possibly the TSC deadline timer is not enabled yet */
+       if (!apic->lapic_timer.hv_timer_in_use)
+               return;
+
+       cancel_hv_tscdeadline(apic);
+
+       /* An already pending expiry needs no new timer */
+       if (atomic_read(&apic->lapic_timer.pending))
+               return;
+
+       start_sw_tscdeadline(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
+
 static void start_apic_timer(struct kvm_lapic *apic)
 {
        ktime_t now;
@@ -1304,32 +1406,8 @@ static void start_apic_timer(struct kvm_lapic *apic)
                           ktime_to_ns(ktime_add_ns(now,
                                        apic->lapic_timer.period)));
        } else if (apic_lvtt_tscdeadline(apic)) {
-               /* lapic timer in tsc deadline mode */
-               u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
-               u64 ns = 0;
-               ktime_t expire;
-               struct kvm_vcpu *vcpu = apic->vcpu;
-               unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
-               unsigned long flags;
-
-               if (unlikely(!tscdeadline || !this_tsc_khz))
-                       return;
-
-               local_irq_save(flags);
-
-               now = apic->lapic_timer.timer.base->get_time();
-               guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
-               if (likely(tscdeadline > guest_tsc)) {
-                       ns = (tscdeadline - guest_tsc) * 1000000ULL;
-                       do_div(ns, this_tsc_khz);
-                       expire = ktime_add_ns(now, ns);
-                       expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
-                       hrtimer_start(&apic->lapic_timer.timer,
-                                     expire, HRTIMER_MODE_ABS);
-               } else
-                       apic_timer_expired(apic);
-
-               local_irq_restore(flags);
+               if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic)))
+                       start_sw_tscdeadline(apic);
        }
 }
 
index fde8e35..640ad27 100644 (file)
@@ -16,6 +16,7 @@ struct kvm_timer {
        u64 tscdeadline;
        u64 expired_tscdeadline;
        atomic_t pending;                       /* accumulated triggered timers */
+       bool hv_timer_in_use;
 };
 
 struct kvm_lapic {
@@ -170,4 +171,8 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu);
 
 bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
                        struct kvm_vcpu **dest_vcpu);
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
 #endif
index ab9ae67..b41f7a0 100644 (file)
@@ -1025,6 +1025,21 @@ TRACE_EVENT(kvm_pi_irte_update,
                  __entry->pi_desc_addr)
 );
 
+/*
+ * Tracepoint recording, per vCPU id, whether the hv timer is in use.
+ * The format string carries no trailing newline: the trace output adds the
+ * line break itself, so an explicit one produces empty lines.
+ */
+TRACE_EVENT(kvm_hv_timer_state,
+               TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use),
+               TP_ARGS(vcpu_id, hv_timer_in_use),
+               TP_STRUCT__entry(
+                       __field(unsigned int, vcpu_id)
+                       __field(unsigned int, hv_timer_in_use)
+                       ),
+               TP_fast_assign(
+                       __entry->vcpu_id = vcpu_id;
+                       __entry->hv_timer_in_use = hv_timer_in_use;
+                       ),
+               TP_printk("vcpu_id %x hv_timer %x",
+                       __entry->vcpu_id,
+                       __entry->hv_timer_in_use)
+);
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
index 0958fa2..a722f72 100644 (file)
@@ -109,6 +109,13 @@ module_param_named(pml, enable_pml, bool, S_IRUGO);
 
 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
 
+/* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
+static int __read_mostly cpu_preemption_timer_multi;
+static bool __read_mostly enable_preemption_timer = 1;
+#ifdef CONFIG_X86_64
+module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
+#endif
+
 #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
 #define KVM_VM_CR0_ALWAYS_ON                                           \
@@ -596,6 +603,9 @@ struct vcpu_vmx {
 #define PML_ENTITY_NUM         512
        struct page *pml_pg;
 
+       /* apic deadline value in host tsc */
+       u64 hv_deadline_tsc;
+
        u64 current_tsc_ratio;
 };
 
@@ -1043,6 +1053,58 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
 }
 
+/*
+ * CPU signatures matched by cpu_has_broken_vmx_preemption_timer(): each
+ * value is compared against CPUID leaf 1 EAX with the reserved bits cleared.
+ *
+ * Format of the per-entry comments:
+ * document - erratum name - stepping - processor name.
+ * The list is taken from
+ * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
+ */
+static u32 vmx_preemption_cpu_tfms[] = {
+/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
+0x000206E6,
+/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
+/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
+/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020652,
+/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020655,
+/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
+/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
+/*
+ * 320767.pdf - AAP86  - B1 -
+ * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
+ */
+0x000106E5,
+/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
+0x000106A0,
+/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
+0x000106A1,
+/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
+0x000106A4,
+ /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
+ /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
+ /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
+0x000106A5,
+};
+
+/*
+ * Return true when this CPU's signature (CPUID leaf 1 EAX with bits 15:14
+ * and 31:28 cleared) appears in vmx_preemption_cpu_tfms[], false otherwise.
+ */
+static inline bool cpu_has_broken_vmx_preemption_timer(void)
+{
+       const size_t entries = sizeof(vmx_preemption_cpu_tfms) /
+                              sizeof(vmx_preemption_cpu_tfms[0]);
+       u32 signature = cpuid_eax(0x00000001);
+       u32 idx;
+
+       /* Drop the reserved bits before comparing */
+       signature &= ~(0x3U << 14 | 0xfU << 28);
+
+       for (idx = 0; idx < entries; idx++) {
+               if (vmx_preemption_cpu_tfms[idx] == signature)
+                       return true;
+       }
+
+       return false;
+}
+
+/*
+ * True when the preemption-timer bit is set in the pin-based execution
+ * controls recorded in vmcs_config.
+ */
+static inline bool cpu_has_vmx_preemption_timer(void)
+{
+       return !!(vmcs_config.pin_based_exec_ctrl &
+                 PIN_BASED_VMX_PREEMPTION_TIMER);
+}
+
 static inline bool cpu_has_vmx_posted_intr(void)
 {
        return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
@@ -3209,11 +3271,15 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                return -EIO;
 
        min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
-       opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
+       opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
+                PIN_BASED_VMX_PREEMPTION_TIMER;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
                                &_pin_based_exec_control) < 0)
                return -EIO;
 
+       if (cpu_has_broken_vmx_preemption_timer())
+               _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+
        if (!(_cpu_based_2nd_exec_control &
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
                !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
@@ -4683,6 +4749,8 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
 
        if (!vmx_cpu_uses_apicv(&vmx->vcpu))
                pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+       /* Enable the preemption timer dynamically */
+       pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
        return pin_based_exec_ctrl;
 }
 
@@ -4781,6 +4849,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
        /* Control */
        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+       vmx->hv_deadline_tsc = -1;
 
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
 
@@ -6292,6 +6361,17 @@ static __init int hardware_setup(void)
                kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
        }
 
+       if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
+               u64 vmx_msr;
+
+               rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+               cpu_preemption_timer_multi =
+                        vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+       } else {
+               kvm_x86_ops->set_hv_timer = NULL;
+               kvm_x86_ops->cancel_hv_timer = NULL;
+       }
+
        kvm_set_posted_intr_wakeup_handler(wakeup_handler);
 
        return alloc_kvm_area();
@@ -7460,6 +7540,12 @@ static int handle_pcommit(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+/*
+ * Exit handler for EXIT_REASON_PREEMPTION_TIMER: forward the expiry to the
+ * local APIC code.  Returns 1, which per the exit-handler convention means
+ * the exit was handled fully and guest execution may resume.
+ */
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+       kvm_lapic_expired_hv_timer(vcpu);
+       return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -7511,6 +7597,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_XRSTORS]                 = handle_xrstors,
        [EXIT_REASON_PML_FULL]                = handle_pml_full,
        [EXIT_REASON_PCOMMIT]                 = handle_pcommit,
+       [EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -7814,6 +7901,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
        case EXIT_REASON_PCOMMIT:
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT);
+       case EXIT_REASON_PREEMPTION_TIMER:
+               return false;
        default:
                return true;
        }
@@ -8510,6 +8599,26 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
                                        msrs[i].host);
 }
 
+/*
+ * Program the VMCS preemption-timer value for the upcoming VM entry.
+ *
+ * Does nothing when no deadline is set (hv_deadline_tsc == -1).  Otherwise
+ * the host TSC ticks left until the deadline are shifted right by
+ * cpu_preemption_timer_multi and written to VMX_PREEMPTION_TIMER_VALUE; a
+ * deadline that is not in the future is written as 0.
+ *
+ * Only called from within this file and declared in no header, hence static.
+ */
+static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u64 tscl;
+       u32 delta_tsc;
+
+       if (vmx->hv_deadline_tsc == -1)
+               return;
+
+       tscl = rdtsc();
+       if (vmx->hv_deadline_tsc > tscl)
+               /* sure to be 32 bit only because checked on set_hv_timer */
+               delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
+                       cpu_preemption_timer_multi);
+       else
+               delta_tsc = 0;
+
+       vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+}
+
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8556,6 +8665,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        atomic_switch_perf_msrs(vmx);
        debugctlmsr = get_debugctlmsr();
 
+       vmx_arm_hv_timer(vcpu);
+
        vmx->__launched = vmx->loaded_vmcs->launched;
        asm(
                /* Store host registers */
@@ -9520,9 +9631,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        vmcs_write64(VMCS_LINK_POINTER, -1ull);
 
        exec_control = vmcs12->pin_based_vm_exec_control;
-       exec_control |= vmcs_config.pin_based_exec_ctrl;
+
+       /* Preemption timer setting is only taken from vmcs01.  */
        exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+       exec_control |= vmcs_config.pin_based_exec_ctrl;
+       if (vmx->hv_deadline_tsc == -1)
+               exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
 
+       /* Posted interrupts setting is only taken from vmcs12.  */
        if (nested_cpu_has_posted_intr(vmcs12)) {
                /*
                 * Note that we use L0's vector here and in
@@ -10451,8 +10567,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
        load_vmcs12_host_state(vcpu, vmcs12);
 
-       /* Update TSC_OFFSET if TSC was changed while L2 ran */
+       /* Update any VMCS fields that might have changed while L2 ran */
        vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+       if (vmx->hv_deadline_tsc == -1)
+               vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+                               PIN_BASED_VMX_PREEMPTION_TIMER);
+       else
+               vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+                             PIN_BASED_VMX_PREEMPTION_TIMER);
 
        /* This is needed for same reason as it was needed in prepare_vmcs02 */
        vmx->host_rsp = 0;
@@ -10532,6 +10654,64 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
        return X86EMUL_CONTINUE;
 }
 
+#ifdef CONFIG_X86_64
+/*
+ * Compute (a << shift) / divisor using a 128-bit intermediate value.
+ *
+ * @a:       value to scale
+ * @shift:   left shift applied before dividing; must be less than 64
+ * @divisor: 64-bit divisor
+ * @result:  receives the quotient on success
+ *
+ * Returns 1 if the quotient does not fit in 64 bits (this includes a zero
+ * divisor), in which case *result is left untouched; returns 0 otherwise.
+ */
+static inline int u64_shl_div_u64(u64 a, unsigned int shift,
+                                 u64 divisor, u64 *result)
+{
+       u64 low = a << shift;
+       /* a >> 64 is undefined in C, so a zero shift has no high part */
+       u64 high = shift ? a >> (64 - shift) : 0;
+
+       /* To avoid the overflow on divq */
+       if (high >= divisor)
+               return 1;
+
+       /* Low hold the result, high hold rem which is discarded */
+       asm("divq %2\n\t" : "=a" (low), "=d" (high) :
+           "rm" (divisor), "0" (low), "1" (high));
+       *result = low;
+
+       return 0;
+}
+
+/*
+ * ->set_hv_timer implementation: arrange for the preemption timer to expire
+ * at @guest_deadline_tsc (a guest TSC value).
+ *
+ * The remaining guest ticks (0 if the deadline already passed) are converted
+ * to host ticks when TSC scaling is active, and the resulting host deadline
+ * is stored in vmx->hv_deadline_tsc before the preemption-timer pin-based
+ * control is set.
+ *
+ * Returns 0 on success, or -ERANGE when the conversion overflows or the
+ * delta does not fit the 32-bit timer after the rate shift.
+ */
+static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u64 tscl = rdtsc();
+       u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+       /* max() clamps an already expired deadline to a delta of 0 */
+       u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
+
+       /* Convert to host delta tsc if tsc scaling is enabled */
+       if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
+                       u64_shl_div_u64(delta_tsc,
+                               kvm_tsc_scaling_ratio_frac_bits,
+                               vcpu->arch.tsc_scaling_ratio,
+                               &delta_tsc))
+               return -ERANGE;
+
+       /*
+        * If the delta tsc can't fit in the 32 bit after the multi shift,
+        * we can't use the preemption timer.
+        * It's possible that it fits on later vmentries, but checking
+        * on every vmentry is costly so we just use an hrtimer.
+        */
+       if (delta_tsc >> (cpu_preemption_timer_multi + 32))
+               return -ERANGE;
+
+       vmx->hv_deadline_tsc = tscl + delta_tsc;
+       vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+                       PIN_BASED_VMX_PREEMPTION_TIMER);
+       return 0;
+}
+
+/*
+ * ->cancel_hv_timer implementation: mark the deadline as unset (-1) and
+ * clear the preemption-timer pin-based control in the VMCS.
+ */
+static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       vmx->hv_deadline_tsc = -1;
+       vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+                       PIN_BASED_VMX_PREEMPTION_TIMER);
+}
+#endif
+
 static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
        if (ple_gap)
@@ -10576,7 +10756,7 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
  *   this case, return 1, otherwise, return 0.
  *
  */
-static int vmx_pre_block(struct kvm_vcpu *vcpu)
+static int pi_pre_block(struct kvm_vcpu *vcpu)
 {
        unsigned long flags;
        unsigned int dest;
@@ -10642,7 +10822,18 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-static void vmx_post_block(struct kvm_vcpu *vcpu)
+/*
+ * Wrapper around pi_pre_block() that also handles the hv timer.
+ *
+ * Returns 1 as soon as pi_pre_block() returns non-zero.  Otherwise a timer
+ * currently backed by the hv timer is switched over to the hrtimer and 0 is
+ * returned.
+ */
+static int vmx_pre_block(struct kvm_vcpu *vcpu)
+{
+       if (pi_pre_block(vcpu))
+               return 1;
+
+       if (kvm_lapic_hv_timer_in_use(vcpu))
+               kvm_lapic_switch_to_sw_timer(vcpu);
+
+       return 0;
+}
+
+static void pi_post_block(struct kvm_vcpu *vcpu)
 {
        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
        struct pi_desc old, new;
@@ -10683,6 +10874,14 @@ static void vmx_post_block(struct kvm_vcpu *vcpu)
        }
 }
 
+/*
+ * Wrapper around pi_post_block(): when the hv timer hooks are available,
+ * first try to move the timer back onto the hv timer, then run the
+ * posted-interrupt post-block handling.
+ */
+static void vmx_post_block(struct kvm_vcpu *vcpu)
+{
+       /* hardware_setup() sets the hook to NULL when the timer is unusable */
+       if (kvm_x86_ops->set_hv_timer)
+               kvm_lapic_switch_to_hv_timer(vcpu);
+
+       pi_post_block(vcpu);
+}
+
 /*
  * vmx_update_pi_irte - set IRTE for Posted-Interrupts
  *
@@ -10878,6 +11077,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .pmu_ops = &intel_pmu_ops,
 
        .update_pi_irte = vmx_update_pi_irte,
+
+#ifdef CONFIG_X86_64
+       .set_hv_timer = vmx_set_hv_timer,
+       .cancel_hv_timer = vmx_cancel_hv_timer,
+#endif
 };
 
 static int __init vmx_init(void)
index 27419ba..c7695ce 100644 (file)
@@ -113,7 +113,8 @@ u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
 EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
 u64  __read_mostly kvm_max_tsc_scaling_ratio;
 EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
-static u64 __read_mostly kvm_default_tsc_scaling_ratio;
+u64 __read_mostly kvm_default_tsc_scaling_ratio;
+EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
 
 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
 static u32 __read_mostly tsc_tolerance_ppm = 250;
@@ -2718,6 +2719,11 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                                rdtsc() - vcpu->arch.last_host_tsc;
                if (tsc_delta < 0)
                        mark_tsc_unstable("KVM discovered backwards TSC");
+
+               if (kvm_lapic_hv_timer_in_use(vcpu) &&
+                               kvm_x86_ops->set_hv_timer(vcpu,
+                                       kvm_get_lapic_tscdeadline_msr(vcpu)))
+                       kvm_lapic_switch_to_sw_timer(vcpu);
                if (check_tsc_unstable()) {
                        u64 offset = kvm_compute_tsc_offset(vcpu,
                                                vcpu->arch.last_guest_tsc);