From ae06ae07a93149c53316798167f1fad4e3d9c3b3 Mon Sep 17 00:00:00 2001
From: davidjchou <david.j.chou@intel.com>
Date: Wed, 13 Jul 2016 15:55:04 -0700
Subject: [PATCH 01/16] Fix the failure of launching instances in Horizon

In the Mitaka version of OpenStack, Neutron agent uses connmark
module to add rules into iptable. Enable this module in
opnfv.config to fix the failure of launching instances in Horizon.

Upstream status - NA

Change-Id: I31f346198a5ba50e83d10210bb5b5e10baf825f3
Signed-off-by: davidjchou <david.j.chou@intel.com>
---
 kernel/arch/x86/configs/opnfv.config | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/arch/x86/configs/opnfv.config b/kernel/arch/x86/configs/opnfv.config
index 2d6d1cc4e..b623b0cd4 100644
--- a/kernel/arch/x86/configs/opnfv.config
+++ b/kernel/arch/x86/configs/opnfv.config
@@ -885,7 +885,7 @@ CONFIG_NETFILTER_XTABLES=m
 # Xtables combined modules
 #
 CONFIG_NETFILTER_XT_MARK=m
-# CONFIG_NETFILTER_XT_CONNMARK is not set
+CONFIG_NETFILTER_XT_CONNMARK=m
 CONFIG_NETFILTER_XT_SET=m
 
 #
@@ -894,7 +894,7 @@ CONFIG_NETFILTER_XT_SET=m
 CONFIG_NETFILTER_XT_TARGET_AUDIT=m
 CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
 # CONFIG_NETFILTER_XT_TARGET_CLASSIFY is not set
-# CONFIG_NETFILTER_XT_TARGET_CONNMARK is not set
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=m
 CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m
 CONFIG_NETFILTER_XT_TARGET_CT=m
 # CONFIG_NETFILTER_XT_TARGET_DSCP is not set
@@ -929,7 +929,7 @@ CONFIG_NETFILTER_XT_MATCH_COMMENT=m
 # CONFIG_NETFILTER_XT_MATCH_CONNBYTES is not set
 # CONFIG_NETFILTER_XT_MATCH_CONNLABEL is not set
 # CONFIG_NETFILTER_XT_MATCH_CONNLIMIT is not set
-# CONFIG_NETFILTER_XT_MATCH_CONNMARK is not set
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
 CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
 # CONFIG_NETFILTER_XT_MATCH_CPU is not set
 # CONFIG_NETFILTER_XT_MATCH_DCCP is not set
@@ -1227,6 +1227,7 @@ CONFIG_NET_CLS_ACT=y
 # CONFIG_NET_ACT_CSUM is not set
 # CONFIG_NET_ACT_VLAN is not set
 # CONFIG_NET_ACT_BPF is not set
+# CONFIG_NET_ACT_CONNMARK is not set
 CONFIG_NET_SCH_FIFO=y
 # CONFIG_DCB is not set
 CONFIG_DNS_RESOLVER=y
-- 
2.16.6


From ab9c81f473b3feff08f1e51428fbaf6a93d4896a Mon Sep 17 00:00:00 2001
From: Yunhong Jiang <yunhong.jiang@gmail.com>
Date: Mon, 13 Jun 2016 14:20:00 -0700
Subject: [PATCH 02/16] kvm: lapic: separate start_sw_tscdeadline from
 start_apic_timer

The function to start the tsc deadline timer virtualization will be used
also by the pre_block hook when we use the preemption timer; change it
to a separate function. No logic changes.

upstream-status: backport

Change-Id: Ie2fc19108c3252f8a299b17aba16c14aa8d31ae8
Signed-off-by: Yunhong Jiang <yunhong.jiang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Yunhong Jiang <yunhong.jiang@linux.intel.com>
---
 kernel/arch/x86/kvm/lapic.c | 57 ++++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 26 deletions(-)

diff --git a/kernel/arch/x86/kvm/lapic.c b/kernel/arch/x86/kvm/lapic.c
index 20d9e9fb3..1e1e7eb8e 100644
--- a/kernel/arch/x86/kvm/lapic.c
+++ b/kernel/arch/x86/kvm/lapic.c
@@ -1258,6 +1258,36 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
 		__delay(tsc_deadline - guest_tsc);
 }
 
+static void start_sw_tscdeadline(struct kvm_lapic *apic)
+{
+	u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
+	u64 ns = 0;
+	ktime_t expire;
+	struct kvm_vcpu *vcpu = apic->vcpu;
+	unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+	unsigned long flags;
+	ktime_t now;
+
+	if (unlikely(!tscdeadline || !this_tsc_khz))
+		return;
+
+	local_irq_save(flags);
+
+	now = apic->lapic_timer.timer.base->get_time();
+	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+	if (likely(tscdeadline > guest_tsc)) {
+		ns = (tscdeadline - guest_tsc) * 1000000ULL;
+		do_div(ns, this_tsc_khz);
+		expire = ktime_add_ns(now, ns);
+		expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
+		hrtimer_start(&apic->lapic_timer.timer,
+				expire, HRTIMER_MODE_ABS_PINNED);
+	} else
+		apic_timer_expired(apic);
+
+	local_irq_restore(flags);
+}
+
 static void start_apic_timer(struct kvm_lapic *apic)
 {
 	ktime_t now;
@@ -1304,32 +1334,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 			   ktime_to_ns(ktime_add_ns(now,
 					apic->lapic_timer.period)));
 	} else if (apic_lvtt_tscdeadline(apic)) {
-		/* lapic timer in tsc deadline mode */
-		u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
-		u64 ns = 0;
-		ktime_t expire;
-		struct kvm_vcpu *vcpu = apic->vcpu;
-		unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
-		unsigned long flags;
-
-		if (unlikely(!tscdeadline || !this_tsc_khz))
-			return;
-
-		local_irq_save(flags);
-
-		now = apic->lapic_timer.timer.base->get_time();
-		guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
-		if (likely(tscdeadline > guest_tsc)) {
-			ns = (tscdeadline - guest_tsc) * 1000000ULL;
-			do_div(ns, this_tsc_khz);
-			expire = ktime_add_ns(now, ns);
-			expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
-			hrtimer_start(&apic->lapic_timer.timer,
-				      expire, HRTIMER_MODE_ABS);
-		} else
-			apic_timer_expired(apic);
-
-		local_irq_restore(flags);
+		start_sw_tscdeadline(apic);
 	}
 }
 
-- 
2.16.6


From ebcdab040619575a47ad19b18e7ed5c38d287336 Mon Sep 17 00:00:00 2001
From: Yunhong Jiang <yunhong.jiang@gmail.com>
Date: Mon, 13 Jun 2016 14:20:01 -0700
Subject: [PATCH 03/16] KVM: x86: support using the vmx preemption timer for
 tsc deadline timer

The VMX preemption timer can be used to virtualize the TSC deadline timer.
The VMX preemption timer is armed when the vCPU is running, and a VMExit
will happen if the virtual TSC deadline timer expires.

When the vCPU thread is blocked because of HLT, KVM will switch to use
an hrtimer, and then go back to the VMX preemption timer when the vCPU
thread is unblocked.

This solution avoids the complex OS's hrtimer system, and the host
timer interrupt handling cost, replacing them with a little math
(for guest->host TSC and host TSC->preemption timer conversion)
and a cheaper VMexit.  This benefits latency for isolated pCPUs.

[A word about performance... Yunhong reported a 30% reduction in average
 latency from cyclictest.  I made a similar test with tscdeadline_latency
 from kvm-unit-tests, and measured

 - ~20 clock cycles loss (out of ~3200, so less than 1% but still
   statistically significant) in the worst case where the test halts
   just after programming the TSC deadline timer

 - ~800 clock cycles gain (25% reduction in latency) in the best case
   where the test busy waits.

 I removed the VMX bits from Yunhong's patch, to concentrate them in the
 next patch - Paolo]

Signed-off-by: Yunhong Jiang <yunhong.jiang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

Change-Id: I4aa1ecfa3463d1cbfb317511b45d2074b33d9b6f
upstream-status: backport
Signed-off-by: Yunhong Jiang <yunhong.jiang@linux.intel.com>
---
 kernel/arch/x86/include/asm/kvm_host.h |  3 ++
 kernel/arch/x86/kvm/lapic.c            | 73 +++++++++++++++++++++++++++++++++-
 kernel/arch/x86/kvm/lapic.h            |  5 +++
 kernel/arch/x86/kvm/trace.h            | 15 +++++++
 kernel/arch/x86/kvm/x86.c              |  5 +++
 5 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/kernel/arch/x86/include/asm/kvm_host.h b/kernel/arch/x86/include/asm/kvm_host.h
index 30cfd6429..21637c9e9 100644
--- a/kernel/arch/x86/include/asm/kvm_host.h
+++ b/kernel/arch/x86/include/asm/kvm_host.h
@@ -911,6 +911,9 @@ struct kvm_x86_ops {
 	void (*post_block)(struct kvm_vcpu *vcpu);
 	int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
 			      uint32_t guest_irq, bool set);
+
+	int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc);
+	void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_arch_async_pf {
diff --git a/kernel/arch/x86/kvm/lapic.c b/kernel/arch/x86/kvm/lapic.c
index 1e1e7eb8e..701f91234 100644
--- a/kernel/arch/x86/kvm/lapic.c
+++ b/kernel/arch/x86/kvm/lapic.c
@@ -1288,6 +1288,68 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
 	local_irq_restore(flags);
 }
 
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.apic->lapic_timer.hv_timer_in_use;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
+
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+	WARN_ON(swait_active(&vcpu->wq));
+	kvm_x86_ops->cancel_hv_timer(vcpu);
+	apic->lapic_timer.hv_timer_in_use = false;
+	apic_timer_expired(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
+
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	WARN_ON(apic->lapic_timer.hv_timer_in_use);
+
+	if (apic_lvtt_tscdeadline(apic) &&
+	    !atomic_read(&apic->lapic_timer.pending)) {
+		u64 tscdeadline = apic->lapic_timer.tscdeadline;
+
+		if (!kvm_x86_ops->set_hv_timer(vcpu, tscdeadline)) {
+			apic->lapic_timer.hv_timer_in_use = true;
+			hrtimer_cancel(&apic->lapic_timer.timer);
+
+			/* In case the sw timer triggered in the window */
+			if (atomic_read(&apic->lapic_timer.pending)) {
+				apic->lapic_timer.hv_timer_in_use = false;
+				kvm_x86_ops->cancel_hv_timer(apic->vcpu);
+			}
+		}
+		trace_kvm_hv_timer_state(vcpu->vcpu_id,
+				apic->lapic_timer.hv_timer_in_use);
+	}
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
+
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	/* Possibly the TSC deadline timer is not enabled yet */
+	if (!apic->lapic_timer.hv_timer_in_use)
+		return;
+
+	kvm_x86_ops->cancel_hv_timer(vcpu);
+	apic->lapic_timer.hv_timer_in_use = false;
+
+	if (atomic_read(&apic->lapic_timer.pending))
+		return;
+
+	start_sw_tscdeadline(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
+
 static void start_apic_timer(struct kvm_lapic *apic)
 {
 	ktime_t now;
@@ -1334,7 +1396,16 @@ static void start_apic_timer(struct kvm_lapic *apic)
 			   ktime_to_ns(ktime_add_ns(now,
 					apic->lapic_timer.period)));
 	} else if (apic_lvtt_tscdeadline(apic)) {
-		start_sw_tscdeadline(apic);
+		/* lapic timer in tsc deadline mode */
+		u64 tscdeadline = apic->lapic_timer.tscdeadline;
+
+		if (kvm_x86_ops->set_hv_timer &&
+		    !kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
+			apic->lapic_timer.hv_timer_in_use = true;
+			trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
+					apic->lapic_timer.hv_timer_in_use);
+		} else
+			start_sw_tscdeadline(apic);
 	}
 }
 
diff --git a/kernel/arch/x86/kvm/lapic.h b/kernel/arch/x86/kvm/lapic.h
index fde8e35d5..640ad275b 100644
--- a/kernel/arch/x86/kvm/lapic.h
+++ b/kernel/arch/x86/kvm/lapic.h
@@ -16,6 +16,7 @@ struct kvm_timer {
 	u64 tscdeadline;
 	u64 expired_tscdeadline;
 	atomic_t pending;			/* accumulated triggered timers */
+	bool hv_timer_in_use;
 };
 
 struct kvm_lapic {
@@ -170,4 +171,8 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu);
 
 bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
 			struct kvm_vcpu **dest_vcpu);
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
 #endif
diff --git a/kernel/arch/x86/kvm/trace.h b/kernel/arch/x86/kvm/trace.h
index ab9ae67a8..b41f7a013 100644
--- a/kernel/arch/x86/kvm/trace.h
+++ b/kernel/arch/x86/kvm/trace.h
@@ -1025,6 +1025,21 @@ TRACE_EVENT(kvm_pi_irte_update,
 		  __entry->pi_desc_addr)
 );
 
+TRACE_EVENT(kvm_hv_timer_state,
+		TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use),
+		TP_ARGS(vcpu_id, hv_timer_in_use),
+		TP_STRUCT__entry(
+			__field(unsigned int, vcpu_id)
+			__field(unsigned int, hv_timer_in_use)
+			),
+		TP_fast_assign(
+			__entry->vcpu_id = vcpu_id;
+			__entry->hv_timer_in_use = hv_timer_in_use;
+			),
+		TP_printk("vcpu_id %x hv_timer %x\n",
+			__entry->vcpu_id,
+			__entry->hv_timer_in_use)
+);
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/kernel/arch/x86/kvm/x86.c b/kernel/arch/x86/kvm/x86.c
index 27419ba5a..e3e1a8c1b 100644
--- a/kernel/arch/x86/kvm/x86.c
+++ b/kernel/arch/x86/kvm/x86.c
@@ -2718,6 +2718,11 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 				rdtsc() - vcpu->arch.last_host_tsc;
 		if (tsc_delta < 0)
 			mark_tsc_unstable("KVM discovered backwards TSC");
+
+		if (kvm_lapic_hv_timer_in_use(vcpu) &&
+				kvm_x86_ops->set_hv_timer(vcpu,
+					kvm_get_lapic_tscdeadline_msr(vcpu)))
+			kvm_lapic_switch_to_sw_timer(vcpu);
 		if (check_tsc_unstable()) {
 			u64 offset = kvm_compute_tsc_offset(vcpu,
 						vcpu->arch.last_guest_tsc);
-- 
2.16.6


From fd047d7c73bfddb06d19aa4ca182c5c0f72a3304 Mon Sep 17 00:00:00 2001
From: Yunhong Jiang <yunhong.jiang@gmail.com>
Date: Mon, 13 Jun 2016 14:19:58 -0700
Subject: [PATCH 04/16] kvm: vmx: rename vmx_pre/post_block to
 pi_pre/post_block

Prepare to switch from preemption timer to hrtimer in the
vmx_pre/post_block. Current functions are only for posted interrupt,
rename them accordingly.

upstream-status: backport

Change-Id: Ie1dde9be21deeb661de095e07d6c29bcba2e7d73
Signed-off-by: Yunhong Jiang <yunhong.jiang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Yunhong Jiang <yunhong.jiang@linux.intel.com>
---
 kernel/arch/x86/kvm/vmx.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/kernel/arch/x86/kvm/vmx.c b/kernel/arch/x86/kvm/vmx.c
index 0958fa2b7..073211105 100644
--- a/kernel/arch/x86/kvm/vmx.c
+++ b/kernel/arch/x86/kvm/vmx.c
@@ -10576,7 +10576,7 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
  *   this case, return 1, otherwise, return 0.
  *
  */
-static int vmx_pre_block(struct kvm_vcpu *vcpu)
+static int pi_pre_block(struct kvm_vcpu *vcpu)
 {
 	unsigned long flags;
 	unsigned int dest;
@@ -10642,7 +10642,15 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static void vmx_post_block(struct kvm_vcpu *vcpu)
+static int vmx_pre_block(struct kvm_vcpu *vcpu)
+{
+	if (pi_pre_block(vcpu))
+		return 1;
+
+	return 0;
+}
+
+static void pi_post_block(struct kvm_vcpu *vcpu)
 {
 	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 	struct pi_desc old, new;
@@ -10683,6 +10691,11 @@ static void vmx_post_block(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void vmx_post_block(struct kvm_vcpu *vcpu)
+{
+	pi_post_block(vcpu);
+}
+
 /*
  * vmx_update_pi_irte - set IRTE for Posted-Interrupts
  *
-- 
2.16.6


From 09c7dd2e0b05c5bc2def37e525a98e6b344d56f5 Mon Sep 17 00:00:00 2001
From: Yunhong Jiang <yunhong.jiang@intel.com>
Date: Mon, 13 Jun 2016 14:19:59 -0700
Subject: [PATCH 05/16] kvm: vmx: hook preemption timer support

Hook the VMX preemption timer to the "hv timer" functionality added
by the previous patch.  This includes: checking if the feature is
supported, if the feature is broken on the CPU, the hooks to
setup/clean the VMX preemption timer, arming the timer on vmentry
and handling the vmexit.

A module parameter states if the VMX preemption timer should be
utilized.

Signed-off-by: Yunhong Jiang <yunhong.jiang@intel.com>
[Move hv_deadline_tsc to struct vcpu_vmx, use -1 as the "unset" value.
 Put all VMX bits here.  Enable it by default #yolo. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

Change-Id: Icb8e0b853eedce3d52c394e510fa14d2cdd432e9
upstream-status: backport
Signed-off-by: Yunhong Jiang <yunhong.jiang@linux.intel.com>
---
 kernel/arch/x86/include/asm/kvm_host.h |   2 +
 kernel/arch/x86/kvm/vmx.c              | 180 ++++++++++++++++++++++++++++++++-
 kernel/arch/x86/kvm/x86.c              |   3 +-
 3 files changed, 183 insertions(+), 2 deletions(-)

diff --git a/kernel/arch/x86/include/asm/kvm_host.h b/kernel/arch/x86/include/asm/kvm_host.h
index 21637c9e9..fe68e836e 100644
--- a/kernel/arch/x86/include/asm/kvm_host.h
+++ b/kernel/arch/x86/include/asm/kvm_host.h
@@ -983,6 +983,8 @@ extern u32  kvm_max_guest_tsc_khz;
 extern u8   kvm_tsc_scaling_ratio_frac_bits;
 /* maximum allowed value of TSC scaling ratio */
 extern u64  kvm_max_tsc_scaling_ratio;
+/* 1ull << kvm_tsc_scaling_ratio_frac_bits */
+extern u64  kvm_default_tsc_scaling_ratio;
 
 enum emulation_result {
 	EMULATE_DONE,         /* no further processing */
diff --git a/kernel/arch/x86/kvm/vmx.c b/kernel/arch/x86/kvm/vmx.c
index 073211105..e378975f5 100644
--- a/kernel/arch/x86/kvm/vmx.c
+++ b/kernel/arch/x86/kvm/vmx.c
@@ -109,6 +109,13 @@ module_param_named(pml, enable_pml, bool, S_IRUGO);
 
 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
 
+/* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
+static int __read_mostly cpu_preemption_timer_multi;
+static bool __read_mostly enable_preemption_timer = 1;
+#ifdef CONFIG_X86_64
+module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
+#endif
+
 #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
 #define KVM_VM_CR0_ALWAYS_ON						\
@@ -596,6 +603,9 @@ struct vcpu_vmx {
 #define PML_ENTITY_NUM		512
 	struct page *pml_pg;
 
+	/* apic deadline value in host tsc */
+	u64 hv_deadline_tsc;
+
 	u64 current_tsc_ratio;
 };
 
@@ -1043,6 +1053,61 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
 		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
 }
 
+/*
+ * Comment's format: document - errata name - stepping - processor name.
+ * Refer from
+ * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
+ */
+static u32 vmx_preemption_cpu_tfms[] = {
+/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
+0x000206E6,
+/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
+/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
+/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020652,
+/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020655,
+/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
+/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
+/*
+ * 320767.pdf - AAP86  - B1 -
+ * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
+ */
+0x000106E5,
+/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
+0x000106A0,
+/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
+0x000106A1,
+/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
+0x000106A4,
+ /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
+ /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
+ /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
+0x000106A5,
+};
+
+static inline bool cpu_has_broken_vmx_preemption_timer(void)
+{
+	u32 eax = cpuid_eax(0x00000001), i;
+
+	/* Clear the reserved bits */
+	eax &= ~(0x3U << 14 | 0xfU << 28);
+	for (i = 0; i < sizeof(vmx_preemption_cpu_tfms)/sizeof(u32); i++)
+		if (eax == vmx_preemption_cpu_tfms[i])
+			return true;
+
+	return false;
+}
+
+static inline bool cpu_has_vmx_preemption_timer(void)
+{
+	if (cpu_has_broken_vmx_preemption_timer())
+		return false;
+
+	return vmcs_config.pin_based_exec_ctrl &
+		PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
 static inline bool cpu_has_vmx_posted_intr(void)
 {
 	return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
@@ -3209,7 +3274,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 		return -EIO;
 
 	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
-	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
+	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
+		 PIN_BASED_VMX_PREEMPTION_TIMER;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
 				&_pin_based_exec_control) < 0)
 		return -EIO;
@@ -4683,6 +4749,8 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
 
 	if (!vmx_cpu_uses_apicv(&vmx->vcpu))
 		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+	/* Enable the preemption timer dynamically */
+	pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
 	return pin_based_exec_ctrl;
 }
 
@@ -4781,6 +4849,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
 	/* Control */
 	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+	vmx->hv_deadline_tsc = -1;
 
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
 
@@ -6292,6 +6361,17 @@ static __init int hardware_setup(void)
 		kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
 	}
 
+	if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
+		u64 vmx_msr;
+
+		rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+		cpu_preemption_timer_multi =
+			 vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+	} else {
+		kvm_x86_ops->set_hv_timer = NULL;
+		kvm_x86_ops->cancel_hv_timer = NULL;
+	}
+
 	kvm_set_posted_intr_wakeup_handler(wakeup_handler);
 
 	return alloc_kvm_area();
@@ -7460,6 +7540,12 @@ static int handle_pcommit(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+	kvm_lapic_expired_hv_timer(vcpu);
+	return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -7511,6 +7597,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_XRSTORS]                 = handle_xrstors,
 	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
 	[EXIT_REASON_PCOMMIT]                 = handle_pcommit,
+	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -8510,6 +8597,26 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
 					msrs[i].host);
 }
 
+void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u64 tscl;
+	u32 delta_tsc;
+
+	if (vmx->hv_deadline_tsc == -1)
+		return;
+
+	tscl = rdtsc();
+	if (vmx->hv_deadline_tsc > tscl)
+		/* sure to be 32 bit only because checked on set_hv_timer */
+		delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
+			cpu_preemption_timer_multi);
+	else
+		delta_tsc = 0;
+
+	vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+}
+
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8556,6 +8663,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	atomic_switch_perf_msrs(vmx);
 	debugctlmsr = get_debugctlmsr();
 
+	vmx_arm_hv_timer(vcpu);
+
 	vmx->__launched = vmx->loaded_vmcs->launched;
 	asm(
 		/* Store host registers */
@@ -10532,6 +10641,64 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
 	return X86EMUL_CONTINUE;
 }
 
+#ifdef CONFIG_X86_64
+/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
+static inline int u64_shl_div_u64(u64 a, unsigned int shift,
+				  u64 divisor, u64 *result)
+{
+	u64 low = a << shift, high = a >> (64 - shift);
+
+	/* To avoid the overflow on divq */
+	if (high >= divisor)
+		return 1;
+
+	/* Low hold the result, high hold rem which is discarded */
+	asm("divq %2\n\t" : "=a" (low), "=d" (high) :
+	    "rm" (divisor), "0" (low), "1" (high));
+	*result = low;
+
+	return 0;
+}
+
+static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u64 tscl = rdtsc(), delta_tsc;
+
+	delta_tsc = guest_deadline_tsc - kvm_read_l1_tsc(vcpu, tscl);
+
+	/* Convert to host delta tsc if tsc scaling is enabled */
+	if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
+			u64_shl_div_u64(delta_tsc,
+				kvm_tsc_scaling_ratio_frac_bits,
+				vcpu->arch.tsc_scaling_ratio,
+				&delta_tsc))
+		return -ERANGE;
+
+	/*
+	 * If the delta tsc can't fit in the 32 bit after the multi shift,
+	 * we can't use the preemption timer.
+	 * It's possible that it fits on later vmentries, but checking
+	 * on every vmentry is costly so we just use an hrtimer.
+	 */
+	if (delta_tsc >> (cpu_preemption_timer_multi + 32))
+		return -ERANGE;
+
+	vmx->hv_deadline_tsc = tscl + delta_tsc;
+	vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+			PIN_BASED_VMX_PREEMPTION_TIMER);
+	return 0;
+}
+
+static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	vmx->hv_deadline_tsc = -1;
+	vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+			PIN_BASED_VMX_PREEMPTION_TIMER);
+}
+#endif
+
 static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
 	if (ple_gap)
@@ -10647,6 +10814,9 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
 	if (pi_pre_block(vcpu))
 		return 1;
 
+	if (kvm_lapic_hv_timer_in_use(vcpu))
+		kvm_lapic_switch_to_sw_timer(vcpu);
+
 	return 0;
 }
 
@@ -10693,6 +10863,9 @@ static void pi_post_block(struct kvm_vcpu *vcpu)
 
 static void vmx_post_block(struct kvm_vcpu *vcpu)
 {
+	if (kvm_x86_ops->set_hv_timer)
+		kvm_lapic_switch_to_hv_timer(vcpu);
+
 	pi_post_block(vcpu);
 }
 
@@ -10891,6 +11064,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.pmu_ops = &intel_pmu_ops,
 
 	.update_pi_irte = vmx_update_pi_irte,
+
+#ifdef CONFIG_X86_64
+	.set_hv_timer = vmx_set_hv_timer,
+	.cancel_hv_timer = vmx_cancel_hv_timer,
+#endif
 };
 
 static int __init vmx_init(void)
diff --git a/kernel/arch/x86/kvm/x86.c b/kernel/arch/x86/kvm/x86.c
index e3e1a8c1b..c7695cea4 100644
--- a/kernel/arch/x86/kvm/x86.c
+++ b/kernel/arch/x86/kvm/x86.c
@@ -113,7 +113,8 @@ u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
 EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
 u64  __read_mostly kvm_max_tsc_scaling_ratio;
 EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
-static u64 __read_mostly kvm_default_tsc_scaling_ratio;
+u64 __read_mostly kvm_default_tsc_scaling_ratio;
+EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
 
 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
 static u32 __read_mostly tsc_tolerance_ppm = 250;
-- 
2.16.6


From 6443c73ab06239ab4b1bc0511d9b78f7122f477d Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 27 Jun 2016 15:08:01 +0200
Subject: [PATCH 06/16] KVM: vmx: fix underflow in TSC deadline calculation

If the TSC deadline timer is programmed really close to the deadline or
even in the past, the computation in vmx_set_hv_timer can underflow and
cause delta_tsc to be set to a huge value.  This generally results
in vmx_set_hv_timer returning -ERANGE, but we can fix it by limiting
delta_tsc to be positive or zero.

Reported-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

Change-Id: I12eea18c3ec648dbf782d7754b7b574d7d6aa92c
upstream-status: backport
Signed-off-by: Yunhong Jiang <yunhong.jiang@linux.intel.com>
---
 kernel/arch/x86/kvm/vmx.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/arch/x86/kvm/vmx.c b/kernel/arch/x86/kvm/vmx.c
index e378975f5..5da019be6 100644
--- a/kernel/arch/x86/kvm/vmx.c
+++ b/kernel/arch/x86/kvm/vmx.c
@@ -10663,9 +10663,9 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift,
 static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u64 tscl = rdtsc(), delta_tsc;
-
-	delta_tsc = guest_deadline_tsc - kvm_read_l1_tsc(vcpu, tscl);
+	u64 tscl = rdtsc();
+	u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+	u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
 
 	/* Convert to host delta tsc if tsc scaling is enabled */
 	if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
-- 
2.16.6


From 9b19dab47067a65808e46008e9b25929e5bed737 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Thu, 30 Jun 2016 08:52:49 +0800
Subject: [PATCH 07/16] KVM: x86: introduce cancel_hv_tscdeadline
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Introduce cancel_hv_tscdeadline() to encapsulate preemption
timer cancel stuff.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim KrÄmÃ¡Å <rkrcmar@redhat.com>
Cc: Yunhong Jiang <yunhong.jiang@intel.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

Change-Id: Icc038176cbf361a9ecdf37ed3425108db57617f2
upstream-status: backport
Signed-off-by: Yunhong Jiang <yunhong.jiang@linux.intel.com>
---
 kernel/arch/x86/kvm/lapic.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/kernel/arch/x86/kvm/lapic.c b/kernel/arch/x86/kvm/lapic.c
index 701f91234..e19426843 100644
--- a/kernel/arch/x86/kvm/lapic.c
+++ b/kernel/arch/x86/kvm/lapic.c
@@ -1294,14 +1294,19 @@ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
 
+static void cancel_hv_tscdeadline(struct kvm_lapic *apic)
+{
+	kvm_x86_ops->cancel_hv_timer(apic->vcpu);
+	apic->lapic_timer.hv_timer_in_use = false;
+}
+
 void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	WARN_ON(!apic->lapic_timer.hv_timer_in_use);
 	WARN_ON(swait_active(&vcpu->wq));
-	kvm_x86_ops->cancel_hv_timer(vcpu);
-	apic->lapic_timer.hv_timer_in_use = false;
+	cancel_hv_tscdeadline(apic);
 	apic_timer_expired(apic);
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
@@ -1321,10 +1326,8 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
 			hrtimer_cancel(&apic->lapic_timer.timer);
 
 			/* In case the sw timer triggered in the window */
-			if (atomic_read(&apic->lapic_timer.pending)) {
-				apic->lapic_timer.hv_timer_in_use = false;
-				kvm_x86_ops->cancel_hv_timer(apic->vcpu);
-			}
+			if (atomic_read(&apic->lapic_timer.pending))
+				cancel_hv_tscdeadline(apic);
 		}
 		trace_kvm_hv_timer_state(vcpu->vcpu_id,
 				apic->lapic_timer.hv_timer_in_use);
@@ -1340,8 +1343,7 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
 	if (!apic->lapic_timer.hv_timer_in_use)
 		return;
 
-	kvm_x86_ops->cancel_hv_timer(vcpu);
-	apic->lapic_timer.hv_timer_in_use = false;
+	cancel_hv_tscdeadline(apic);
 
 	if (atomic_read(&apic->lapic_timer.pending))
 		return;
-- 
2.16.6


From 4304209a4fdfc9b1b8be8f0937ba6a7b7e3557a0 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <kernellwp@gmail.com>
Date: Tue, 28 Jun 2016 14:54:19 +0800
Subject: [PATCH 08/16] KVM: vmx: fix missed cancellation of TSC deadline timer
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

INFO: rcu_sched detected stalls on CPUs/tasks:
 1-...: (11800 GPs behind) idle=45d/140000000000000/0 softirq=0/0 fqs=21663
 (detected by 0, t=65016 jiffies, g=11500, c=11499, q=719)
Task dump for CPU 1:
qemu-system-x86 R  running task        0  3529   3525 0x00080808
 ffff8802021791a0 ffff880212895040 0000000000000001 00007f1c2c00db40
 ffff8801dd20fcd3 ffffc90002b98000 ffff8801dd20fc88 ffff8801dd20fcf8
 0000000000000286 ffff8801dd2ac538 ffff8801dd20fcc0 ffffffffc06949c9
Call Trace:
? kvm_write_guest_cached+0xb9/0x160 [kvm]
? __delay+0xf/0x20
? wait_lapic_expire+0x14a/0x200 [kvm]
? kvm_arch_vcpu_ioctl_run+0xcbe/0x1b00 [kvm]
? kvm_arch_vcpu_ioctl_run+0xe34/0x1b00 [kvm]
? kvm_vcpu_ioctl+0x2d3/0x7c0 [kvm]
? __fget+0x5/0x210
? do_vfs_ioctl+0x96/0x6a0
? __fget_light+0x2a/0x90
? SyS_ioctl+0x79/0x90
? do_syscall_64+0x7c/0x1e0
? entry_SYSCALL64_slow_path+0x25/0x25

This can be reproduced readily by running a full dynticks guest(since hrtimer
in guest is heavily used) w/ lapic_timer_advance disabled.

If fail to program hardware preemption timer, we will fallback to hrtimer based
method, however, a previous programmed preemption timer miss to cancel in this
scenario which results in one hardware preemption timer and one hrtimer emulated
tsc deadline timer run simultaneously. So sometimes the target guest deadline
tsc is earlier than guest tsc, which leads to the computation in vmx_set_hv_timer
can underflow and cause delta_tsc to be set a huge value, then host soft lockup
as above.

This patch fix it by cancelling the previous programmed preemption timer if there
is once we failed to program the new preemption timer and fallback to hrtimer
based method.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim KrÄmÃ¡Å <rkrcmar@redhat.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

Change-Id: I8a2decefab743aecdfab676fb9267324bf42b848
upstream-status: backport
Signed-off-by: Yunhong Jiang <yunhong.jiang@linux.intel.com>
---
 kernel/arch/x86/kvm/lapic.c | 48 ++++++++++++++++++++++-----------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/kernel/arch/x86/kvm/lapic.c b/kernel/arch/x86/kvm/lapic.c
index e19426843..3d1b17068 100644
--- a/kernel/arch/x86/kvm/lapic.c
+++ b/kernel/arch/x86/kvm/lapic.c
@@ -1311,27 +1311,35 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
 
+static bool start_hv_tscdeadline(struct kvm_lapic *apic)
+{
+	u64 tscdeadline = apic->lapic_timer.tscdeadline;
+
+	if (atomic_read(&apic->lapic_timer.pending) ||
+		kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
+		if (apic->lapic_timer.hv_timer_in_use)
+			cancel_hv_tscdeadline(apic);
+	} else {
+		apic->lapic_timer.hv_timer_in_use = true;
+		hrtimer_cancel(&apic->lapic_timer.timer);
+
+		/* In case the sw timer triggered in the window */
+		if (atomic_read(&apic->lapic_timer.pending))
+			cancel_hv_tscdeadline(apic);
+	}
+	trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
+			apic->lapic_timer.hv_timer_in_use);
+	return apic->lapic_timer.hv_timer_in_use;
+}
+
 void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	WARN_ON(apic->lapic_timer.hv_timer_in_use);
 
-	if (apic_lvtt_tscdeadline(apic) &&
-	    !atomic_read(&apic->lapic_timer.pending)) {
-		u64 tscdeadline = apic->lapic_timer.tscdeadline;
-
-		if (!kvm_x86_ops->set_hv_timer(vcpu, tscdeadline)) {
-			apic->lapic_timer.hv_timer_in_use = true;
-			hrtimer_cancel(&apic->lapic_timer.timer);
-
-			/* In case the sw timer triggered in the window */
-			if (atomic_read(&apic->lapic_timer.pending))
-				cancel_hv_tscdeadline(apic);
-		}
-		trace_kvm_hv_timer_state(vcpu->vcpu_id,
-				apic->lapic_timer.hv_timer_in_use);
-	}
+	if (apic_lvtt_tscdeadline(apic))
+		start_hv_tscdeadline(apic);
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
 
@@ -1398,15 +1406,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 			   ktime_to_ns(ktime_add_ns(now,
 					apic->lapic_timer.period)));
 	} else if (apic_lvtt_tscdeadline(apic)) {
-		/* lapic timer in tsc deadline mode */
-		u64 tscdeadline = apic->lapic_timer.tscdeadline;
-
-		if (kvm_x86_ops->set_hv_timer &&
-		    !kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
-			apic->lapic_timer.hv_timer_in_use = true;
-			trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
-					apic->lapic_timer.hv_timer_in_use);
-		} else
+		if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic)))
 			start_sw_tscdeadline(apic);
 	}
 }
-- 
2.16.6


From c81c5f9e9856d3149ec81c099dfddca212a50e54 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 8 Jul 2016 11:53:38 +0200
Subject: [PATCH 09/16] KVM: VMX: reflect broken preemption timer in
 vmcs_config

Simplify cpu_has_vmx_preemption_timer.  This is consistent with the
rest of setup_vmcs_config and preparatory for the next patch.

Tested-by: Wanpeng Li <kernellwp@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

Change-Id: I3b33a881c5e47d5d3046e28374d0b0ca363ffad7
upstream-status: backport
Signed-off-by: Yunhong Jiang <yunhong.jiang@linux.intel.com>
---
 kernel/arch/x86/kvm/vmx.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/arch/x86/kvm/vmx.c b/kernel/arch/x86/kvm/vmx.c
index 5da019be6..e55417710 100644
--- a/kernel/arch/x86/kvm/vmx.c
+++ b/kernel/arch/x86/kvm/vmx.c
@@ -1101,9 +1101,6 @@ static inline bool cpu_has_broken_vmx_preemption_timer(void)
 
 static inline bool cpu_has_vmx_preemption_timer(void)
 {
-	if (cpu_has_broken_vmx_preemption_timer())
-		return false;
-
 	return vmcs_config.pin_based_exec_ctrl &
 		PIN_BASED_VMX_PREEMPTION_TIMER;
 }
@@ -3280,6 +3277,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 				&_pin_based_exec_control) < 0)
 		return -EIO;
 
+	if (cpu_has_broken_vmx_preemption_timer())
+		_pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+
 	if (!(_cpu_based_2nd_exec_control &
 		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
 		!(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
-- 
2.16.6


From ae6912551e1bc31e4b846221b7cf5ec2aac76104 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Wed, 6 Jul 2016 18:29:58 +0800
Subject: [PATCH 10/16] KVM: nVMX: avoid incorrect preemption timer vmexit in
 nested guest
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

The preemption timer for nested VMX is emulated by hrtimer which is started on L2
entry, stopped on L2 exit and evaluated via the check_nested_events hook. However,
nested_vmx_exit_handled is always returning true for preemption timer vmexit.  Then,
the L1 preemption timer vmexit is captured and be treated as a L2 preemption
timer vmexit, causing NULL pointer dereferences or worse in the L1 guest's
vmexit handler:

    BUG: unable to handle kernel NULL pointer dereference at           (null)
    IP: [<          (null)>]           (null)
    PGD 0
    Oops: 0010 [#1] SMP
    Call Trace:
     ? kvm_lapic_expired_hv_timer+0x47/0x90 [kvm]
     handle_preemption_timer+0xe/0x20 [kvm_intel]
     vmx_handle_exit+0x169/0x15a0 [kvm_intel]
     ? kvm_arch_vcpu_ioctl_run+0xd5d/0x19d0 [kvm]
     kvm_arch_vcpu_ioctl_run+0xdee/0x19d0 [kvm]
     ? kvm_arch_vcpu_ioctl_run+0xd5d/0x19d0 [kvm]
     ? vcpu_load+0x1c/0x60 [kvm]
     ? kvm_arch_vcpu_load+0x57/0x260 [kvm]
     kvm_vcpu_ioctl+0x2d3/0x7c0 [kvm]
     do_vfs_ioctl+0x96/0x6a0
     ? __fget_light+0x2a/0x90
     SyS_ioctl+0x79/0x90
     do_syscall_64+0x68/0x180
     entry_SYSCALL64_slow_path+0x25/0x25
    Code:  Bad RIP value.
    RIP  [<          (null)>]           (null)
     RSP <ffff8800b5263c48>
    CR2: 0000000000000000
    ---[ end trace 9c70c48b1a2bc66e ]---

This can be reproduced readily by preemption timer enabled on L0 and disabled
on L1.

Return false since preemption timer vmexits must never be reflected to L2.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim KrÄmÃ¡Å <rkrcmar@redhat.com>
Cc: Yunhong Jiang <yunhong.jiang@intel.com>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
Cc: Haozhong Zhang <haozhong.zhang@intel.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

Change-Id: Iaffcd503666879e8157c8559876330110a66e5c4
upstream-status: backport
Signed-off-by: Yunhong Jiang <yunhong.jiang@linux.intel.com>
---
 kernel/arch/x86/kvm/vmx.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/arch/x86/kvm/vmx.c b/kernel/arch/x86/kvm/vmx.c
index e55417710..937898646 100644
--- a/kernel/arch/x86/kvm/vmx.c
+++ b/kernel/arch/x86/kvm/vmx.c
@@ -7901,6 +7901,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
 	case EXIT_REASON_PCOMMIT:
 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT);
+	case EXIT_REASON_PREEMPTION_TIMER:
+		return false;
 	default:
 		return true;
 	}
-- 
2.16.6


From 594e1a937a4a019862f6b6e14516bd4e4be9e16f Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 6 Jul 2016 13:23:51 +0200
Subject: [PATCH 11/16] KVM: nVMX: keep preemption timer enabled during L2
 execution

Because the vmcs12 preemption timer is emulated through a separate hrtimer,
we can keep on using the preemption timer in the vmcs02 to emulare L1's
TSC deadline timer.

However, the corresponding bit in the pin-based execution control field
must be kept consistent between vmcs01 and vmcs02.  On vmentry we copy
it into the vmcs02; on vmexit the preemption timer must be disabled in
the vmcs01 if a preemption timer vmexit happened while in guest mode.

The preemption timer value in the vmcs02 is set by vmx_vcpu_run, so it
need not be considered in prepare_vmcs02.

Cc: Yunhong Jiang <yunhong.jiang@intel.com>
Cc: Haozhong Zhang <haozhong.zhang@intel.com>
Tested-by: Wanpeng Li <kernellwp@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

Change-Id: Iffaea7689d4e653dc6224a6f05c6e5ba2fb5c8a8
upstream-status: backport
Signed-off-by: Yunhong Jiang <yunhong.jiang@linux.intel.com>
---
 kernel/arch/x86/kvm/vmx.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/kernel/arch/x86/kvm/vmx.c b/kernel/arch/x86/kvm/vmx.c
index 937898646..a722f724c 100644
--- a/kernel/arch/x86/kvm/vmx.c
+++ b/kernel/arch/x86/kvm/vmx.c
@@ -9631,9 +9631,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmcs_write64(VMCS_LINK_POINTER, -1ull);
 
 	exec_control = vmcs12->pin_based_vm_exec_control;
-	exec_control |= vmcs_config.pin_based_exec_ctrl;
+
+	/* Preemption timer setting is only taken from vmcs01.  */
 	exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+	exec_control |= vmcs_config.pin_based_exec_ctrl;
+	if (vmx->hv_deadline_tsc == -1)
+		exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
 
+	/* Posted interrupts setting is only taken from vmcs12.  */
 	if (nested_cpu_has_posted_intr(vmcs12)) {
 		/*
 		 * Note that we use L0's vector here and in
@@ -10562,8 +10567,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
 	load_vmcs12_host_state(vcpu, vmcs12);
 
-	/* Update TSC_OFFSET if TSC was changed while L2 ran */
+	/* Update any VMCS fields that might have changed while L2 ran */
 	vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+	if (vmx->hv_deadline_tsc == -1)
+		vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+				PIN_BASED_VMX_PREEMPTION_TIMER);
+	else
+		vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+			      PIN_BASED_VMX_PREEMPTION_TIMER);
 
 	/* This is needed for same reason as it was needed in prepare_vmcs02 */
 	vmx->host_rsp = 0;
-- 
2.16.6


From 853d63d099ec2f098f1f36350ec0c5095c3f4fd1 Mon Sep 17 00:00:00 2001
From: Swati Sharma <swatix.sharma@intel.com>
Date: Wed, 29 Jun 2016 02:34:43 +0900
Subject: [PATCH 12/16] Creation of QEMU(rpm and debian builds)

This includes the scripts that are added to create qemu-rpm
and qemu-debian builds for KVM4NFV.

Co-Authored-By: Gundarapu Reddy <reddyx.gundarapu@intel.com>
Signed-off-by: Swati Sharma <swatix.sharma@intel.com>
---
 ci/build_qemu_rpm_deb/mkcontrol.sh      |  8 ++++++
 ci/build_qemu_rpm_deb/mkspec            | 43 +++++++++++++++++++++++++++++++++
 ci/build_qemu_rpm_deb/mkversion         | 10 ++++++++
 ci/build_qemu_rpm_deb/qemu_build.sh     | 33 +++++++++++++++++++++++++
 ci/build_qemu_rpm_deb/qemu_deb_build.sh | 30 +++++++++++++++++++++++
 ci/build_qemu_rpm_deb/qemu_rpm_build.sh | 32 ++++++++++++++++++++++++
 6 files changed, 156 insertions(+)
 create mode 100755 ci/build_qemu_rpm_deb/mkcontrol.sh
 create mode 100755 ci/build_qemu_rpm_deb/mkspec
 create mode 100755 ci/build_qemu_rpm_deb/mkversion
 create mode 100644 ci/build_qemu_rpm_deb/qemu_build.sh
 create mode 100755 ci/build_qemu_rpm_deb/qemu_deb_build.sh
 create mode 100755 ci/build_qemu_rpm_deb/qemu_rpm_build.sh

diff --git a/ci/build_qemu_rpm_deb/mkcontrol.sh b/ci/build_qemu_rpm_deb/mkcontrol.sh
new file mode 100755
index 000000000..7eb504a0e
--- /dev/null
+++ b/ci/build_qemu_rpm_deb/mkcontrol.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+echo "Package: qemu"
+echo "Version: $1"
+echo "Section: base"
+echo "Priority: optional"
+echo "Architecture: all"
+echo "Maintainer: Intel"
+echo "Description: control file for qemu debian build on centos"
diff --git a/ci/build_qemu_rpm_deb/mkspec b/ci/build_qemu_rpm_deb/mkspec
new file mode 100755
index 000000000..0b75a181b
--- /dev/null
+++ b/ci/build_qemu_rpm_deb/mkspec
@@ -0,0 +1,43 @@
+#!/bin/sh
+#
+#	Output a simple RPM spec file.
+#
+# starting to output the spec
+
+QEMURELEASE=$1
+
+__QEMURELEASE=`echo $QEMURELEASE | sed -e "s/-/_/g"`
+
+echo $srctree
+echo "Name: qemu"
+echo "Summary: The Linux qemu"
+echo "Version: $__QEMURELEASE"
+# we need to determine the NEXT version number
+# rpm -q will agree
+echo "Release: `sudo sh mkversion`"
+echo "License: GPLv2+ and LGPLv2+ and BSD"
+echo "Group: Development/Tools"
+echo "Vendor: The Linux Community"
+echo "URL: http://www.qemu.org"
+echo "Source: qemu-$__QEMURELEASE.tar.gz"
+echo "%description"
+echo "%prep"
+echo "     "
+echo "%setup -q"
+echo "         "
+echo "%build"
+echo "%_configure"
+echo "make"
+echo "%install"
+echo "rm -rf %{buildroot}"
+echo "make install DESTDIR=%{buildroot}"
+echo "%clean"
+echo "rm -rf %{buildroot}"
+echo "%files"
+echo "%dir"
+echo "/usr/local/share/qemu"
+echo "%doc"
+echo "/usr/local/bin/ivshmem*"
+echo "/usr/local/bin/qemu*"
+echo "/usr/local/libexec/qemu-bridge-helper"
+echo "%changelog"
diff --git a/ci/build_qemu_rpm_deb/mkversion b/ci/build_qemu_rpm_deb/mkversion
new file mode 100755
index 000000000..fa4e585b9
--- /dev/null
+++ b/ci/build_qemu_rpm_deb/mkversion
@@ -0,0 +1,10 @@
+if [ ! -f .version ]
+then
+    touch .version
+    sudo chmod 777 .version
+    echo 1 > .version
+    echo 1
+else
+    expr 0`cat .version` + 1
+    expr 0`cat .version` + 1 > .version
+fi
diff --git a/ci/build_qemu_rpm_deb/qemu_build.sh b/ci/build_qemu_rpm_deb/qemu_build.sh
new file mode 100644
index 000000000..a8863c3ca
--- /dev/null
+++ b/ci/build_qemu_rpm_deb/qemu_build.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+qemu_src_dir=qemu
+workspace=/root
+debbuild_dir=$workspace/debbuild
+rpmbuild_dir=$workspace/rpmbuild
+artifact_rpms=$rpmbuild_dir/RPMS
+artifact_dir=$artifact_rpms/x86_64
+scripts_dir=ci/build_qemu_rpm_deb
+output_dir="$1"
+VERSION=`grep -m 1 "VERSION"  ${qemu_src_dir}/config-host.mak | cut -d= -f2-`
+
+usage () {
+    echo "usage: ${0} output_dir"
+    exit 1
+}
+
+if [[ -z "$@" ]]; then
+    usage
+fi
+
+if [ ! -d ${output_dir} -o ! -w ${output_dir} ] ; then
+    echo "${0}: Output directory '${output_dir}' does not exist or cannot be written"
+    exit 1
+fi
+
+if [ ! -d ${qemu_src_dir} ] ; then
+    echo "${0}: Directory '${qemu_src_dir}' does not exist, run this script from the root of kvmfornfv source tree"
+    exit 1
+fi
+
+echo
+echo "Build"
+echo
diff --git a/ci/build_qemu_rpm_deb/qemu_deb_build.sh b/ci/build_qemu_rpm_deb/qemu_deb_build.sh
new file mode 100755
index 000000000..7a830183d
--- /dev/null
+++ b/ci/build_qemu_rpm_deb/qemu_deb_build.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+#Build process for generating qemu debain file.
+
+source ci/build_qemu_rpm_deb/qemu_build.sh
+qemu_deb_build() {
+    sudo mkdir -p $debbuild_dir/qemu-$VERSION
+    sudo cp -r $qemu_src_dir $debbuild_dir/qemu-$VERSION
+    sudo mkdir -p $debbuild_dir/qemu-$VERSION/DEBIAN
+    sudo touch control
+
+#creating control file for debian build.
+    (cd ${scripts_dir}; sudo ./mkcontrol.sh $VERSION > control)
+    sudo mv $scripts_dir/control $debbuild_dir/qemu-$VERSION/DEBIAN/control
+
+#building the qemu debian with control file developed.
+    sudo dpkg-deb --build $debbuild_dir/qemu-$VERSION
+    if [ ${?} -ne 0 ] ; then
+        echo "${0}: qemu build failed"
+        exit 1
+    fi
+}
+
+if [ ! -d ${debbuild_dir} ] ; then
+    echo "creating debbuild directory"
+    sudo mkdir -p $debbuild_dir
+fi
+
+qemu_deb_build
+latest_qemu_build=`sudo ls -rt $debbuild_dir | tail -1`
+sudo cp $debbuild_dir/$latest_qemu_build build_output
diff --git a/ci/build_qemu_rpm_deb/qemu_rpm_build.sh b/ci/build_qemu_rpm_deb/qemu_rpm_build.sh
new file mode 100755
index 000000000..a52ee0f4a
--- /dev/null
+++ b/ci/build_qemu_rpm_deb/qemu_rpm_build.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+#Build process for Generating qemu rpm.
+
+source ci/build_qemu_rpm_deb/qemu_build.sh
+qemu_rpm_build() {
+    sudo cp  -r ${qemu_src_dir}  ${qemu_src_dir}-$VERSION
+    sudo tar -zcvf ${qemu_src_dir}-$VERSION.tar.gz ${qemu_src_dir}-$VERSION
+    sudo mv ${qemu_src_dir}-$VERSION.tar.gz ${rpmbuild_dir}/SOURCES/
+
+#create a spec file for rpm creation.
+    (cd ${scripts_dir}; ./mkspec $VERSION > qemu.spec)
+    sudo cp ${scripts_dir}/qemu.spec ${rpmbuild_dir}/SPECS/
+
+#build the qemu rpm with spec file developed
+    sudo rpmbuild -ba ${rpmbuild_dir}/SPECS/qemu.spec
+    if [ ${?} -ne 0 ] ; then
+        echo "${0}: qemu build failed"
+        exit 1
+    fi
+    sudo rm -rf ${qemu_src_dir}-$VERSION
+    sudo rm -rf ${rpmbuild_dir}/SOURCES/${qemu_src_dir}-$VERSION.tar.gz
+}
+
+if [ ! -d ${rpmbuild_dir} ] ; then
+    sudo yum install rpm-build -y
+    mkdir -p ~/rpmbuild/{BUILD,RPMS,SOURCES,SPECS,SRPMS}
+    sudo mv rpmbuild $workspace
+fi
+
+qemu_rpm_build
+latest_qemu_build=`ls -rt $artifact_dir | grep qemu-2.6* | tail -1`
+sudo cp $artifact_dir/$latest_qemu_build build_output
-- 
2.16.6


From ef4e798bc8761c401451649ed17a52e3e1c638e8 Mon Sep 17 00:00:00 2001
From: Yunhong Jiang <yunhong.jiang@intel.com>
Date: Tue, 18 Aug 2015 11:07:48 -0700
Subject: [PATCH 13/16] Add the "timers: do not raise softirq unconditionally"
 temporarily

This patch enable the nohz_full and is important for RT. Bring it back
temporarily, while waiting for more work on RT community.

Please refer to https://lkml.org/lkml/2015/3/17/783 for more information
of the revert.

A little rebase needed because it's reverted on old code base.

Please notice that we change the rt_mutex_trylock() so that we can get
the tvec_base lock there. This is sure to be wrong, and should be fixed
cleanly. And that's the major reason the original patch are reverted on the
upstream RT linux. Will discuss with upstream on how to achieve the
solution.

Upstream status: pending

Change-Id: I2747e087faf4145b69b800a60b8d9414bc71e206
Signed-off-by: Yunhong Jiang <yunhong.jiang@linux.intel.com>
---
 kernel/kernel/locking/rtmutex.c |  7 -------
 kernel/kernel/time/timer.c      | 30 +++++++++++++++++++++++++++++-
 2 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/kernel/kernel/locking/rtmutex.c b/kernel/kernel/locking/rtmutex.c
index 66971005c..30777e813 100644
--- a/kernel/kernel/locking/rtmutex.c
+++ b/kernel/kernel/locking/rtmutex.c
@@ -2058,13 +2058,6 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
  */
 int __sched rt_mutex_trylock(struct rt_mutex *lock)
 {
-#ifdef CONFIG_PREEMPT_RT_FULL
-	if (WARN_ON(in_irq() || in_nmi()))
-#else
-	if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq()))
-#endif
-		return 0;
-
 	return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
 }
 EXPORT_SYMBOL_GPL(rt_mutex_trylock);
diff --git a/kernel/kernel/time/timer.c b/kernel/kernel/time/timer.c
index fee8682c2..76a301b24 100644
--- a/kernel/kernel/time/timer.c
+++ b/kernel/kernel/time/timer.c
@@ -1509,8 +1509,36 @@ static void run_timer_softirq(struct softirq_action *h)
  */
 void run_local_timers(void)
 {
+	struct tvec_base *base = this_cpu_ptr(&tvec_bases);
+
 	hrtimer_run_queues();
-	raise_softirq(TIMER_SOFTIRQ);
+	/*
+	 * We can access this lockless as we are in the timer
+	 * interrupt. If there are no timers queued, nothing to do in
+	 * the timer softirq.
+	 */
+#ifdef CONFIG_PREEMPT_RT_FULL
+	if (irq_work_needs_cpu()) {
+		raise_softirq(TIMER_SOFTIRQ);
+		return;
+	}
+	if (!spin_do_trylock(&base->lock)) {
+		raise_softirq(TIMER_SOFTIRQ);
+		return;
+	}
+#endif
+	if (!base->active_timers)
+		goto out;
+
+	/* Check whether the next pending timer has expired */
+	if (time_before_eq(base->next_timer, jiffies))
+		raise_softirq(TIMER_SOFTIRQ);
+out:
+#ifdef CONFIG_PREEMPT_RT_FULL
+	rt_spin_unlock(&base->lock);
+#endif
+	/* The ; ensures that gcc won't complain in the !RT case */
+	;
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
-- 
2.16.6


From fafab8722263f4ab08894e35f4385febbce09dd1 Mon Sep 17 00:00:00 2001
From: Yunhong Jiang <yunhong.jiang@intel.com>
Date: Tue, 18 Aug 2015 11:23:10 -0700
Subject: [PATCH 14/16] Restore the non_hz_full

Please refer to http://comments.gmane.org/gmane.linux.kernel/1891417, the
last two mails on this discussion.

The non_hz_full is important for RT, so take it back temporarily.

Unluckily I didn't find the corresponding reverting patch on the RT tree.

Upstream status: pending

Change-Id: I2748a8f9b7a98ef2185a1da60089984432393eff

Signed-off-by: Yunhong Jiang <yunhong.jiang@linux.intel.com>
---
 kernel/kernel/time/timer.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/kernel/kernel/time/timer.c b/kernel/kernel/time/timer.c
index 76a301b24..016a0bf52 100644
--- a/kernel/kernel/time/timer.c
+++ b/kernel/kernel/time/timer.c
@@ -1453,9 +1453,11 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	 * the base lock to check when the next timer is pending and so
 	 * we assume the next jiffy.
 	 */
-	return basem + TICK_NSEC;
-#endif
+	if (!spin_do_trylock(&base->lock))
+		return basem + TICK_NSEC;
+#else
 	spin_lock(&base->lock);
+#endif
 	if (base->active_timers) {
 		if (time_before_eq(base->next_timer, base->timer_jiffies))
 			base->next_timer = __next_timer_interrupt(base);
@@ -1465,7 +1467,11 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 		else
 			expires = basem + (nextevt - basej) * TICK_NSEC;
 	}
+#ifdef CONFIG_PREEMPT_RT_FULL
+	rt_spin_unlock(&base->lock);
+#else
 	spin_unlock(&base->lock);
+#endif
 
 	return cmp_next_hrtimer_event(basem, expires);
 }
-- 
2.16.6


From 6ec128a4e6a1819881f8db6659d4024aac35ce38 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Sat, 14 May 2016 20:09:52 -0700
Subject: [PATCH 15/16] scripts/package/Makefile: rpmbuild add support of
 RPMOPTS

After commit 21a59991ce0c ("scripts/package/Makefile: rpmbuild is needed
for rpm targets"), it is no longer possible to specify RPMOPTS.
For example, we can no longer able to control _topdir using the following
make command.
make RPMOPTS="--define '_topdir /home/xyz/workspace/'" binrpm-pkg

Fixes: 21a59991ce0c ("scripts/package/Makefile: rpmbuild is needed for rpm targets")
Cc: <stable@vger.kernel.org> # 4.3+

Notice: Have to remove the original signed-off-by line because gerrit
will notify the original author, which is not good. Keep the name and
the email address still.

Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Michal Marek <mmarek@suse.com>

upstream-status: backport

Change-Id: If7ce5d9d006ee9d49b2d7d200ebdc278395eb3f7
backport-by: Yunhong Jiang <yunhong.jiang@intel.com>
Signed-off-by: Yunhong Jiang <yunhong.jiang@linux.intel.com>
---
 kernel/scripts/package/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/scripts/package/Makefile b/kernel/scripts/package/Makefile
index 1aca224e8..493e22635 100644
--- a/kernel/scripts/package/Makefile
+++ b/kernel/scripts/package/Makefile
@@ -52,7 +52,7 @@ rpm-pkg rpm: FORCE
 	$(call cmd,src_tar,$(KERNELPATH),kernel.spec)
 	$(CONFIG_SHELL) $(srctree)/scripts/mkversion > $(objtree)/.tmp_version
 	mv -f $(objtree)/.tmp_version $(objtree)/.version
-	rpmbuild --target $(UTS_MACHINE) -ta $(KERNELPATH).tar.gz
+	rpmbuild $(RPMOPTS) --target $(UTS_MACHINE) -ta $(KERNELPATH).tar.gz
 	rm $(KERNELPATH).tar.gz kernel.spec
 
 # binrpm-pkg
@@ -63,7 +63,7 @@ binrpm-pkg: FORCE
 	$(CONFIG_SHELL) $(srctree)/scripts/mkversion > $(objtree)/.tmp_version
 	mv -f $(objtree)/.tmp_version $(objtree)/.version
 
-	rpmbuild --define "_builddir $(objtree)" --target \
+	rpmbuild $(RPMOPTS) --define "_builddir $(objtree)" --target \
 		$(UTS_MACHINE) -bb $(objtree)/binkernel.spec
 	rm binkernel.spec
 
-- 
2.16.6


From 0d6af5dda5ef996f7a32a4ef4af84a3a67bd96e0 Mon Sep 17 00:00:00 2001
From: swatisharma <swatix.sharma@intel.com>
Date: Tue, 9 Aug 2016 19:43:46 +0530
Subject: [PATCH 16/16] CI Integration for KVMforNFV

This patch contains scripts for running cyclictest through yardstick
as part of CI integration for KVMforNFV. The scripts will be
triggered to create a docker ubuntu container for running yardstick,
configuring the host, launching a guest VM and executing cyclictest
through yardstick. The verification process gets completed after
running the cyclictest.

Co-Authored-By: Gundarapu Reddy <reddyx.gundarapu@intel.com>
Signed-off-by: Swati Sharma <swatix.sharma@intel.com>
---
 ci/envs/cyclictest.sh | 23 +++++++++++++++++++++++
 ci/test_kvmfornfv.sh  | 44 ++++++++++++++++++++++++++++++++++++++++++++
 tests/pod.yaml        | 18 ++++++++++++++++++
 3 files changed, 85 insertions(+)
 create mode 100644 ci/envs/cyclictest.sh
 create mode 100644 ci/test_kvmfornfv.sh
 create mode 100644 tests/pod.yaml

diff --git a/ci/envs/cyclictest.sh b/ci/envs/cyclictest.sh
new file mode 100644
index 000000000..e4dd9931f
--- /dev/null
+++ b/ci/envs/cyclictest.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+###########################################################
+## Invoking this script from ubuntu docker container runs
+## cyclictest through yardstick
+###########################################################
+
+pod_config='/opt/pod.yaml'
+cyclictest_context_file='/opt/cyclictest-node-context.yaml'
+
+if [ ! -f ${pod_config} ] ; then
+    echo "file ${pod_config} not found"
+    exit 1
+fi
+
+if [ ! -f ${cyclictest_context_file} ] ; then
+    echo "file ${cyclictest_context_file} not found"
+    exit 1
+fi
+
+#Running cyclictest through yardstick
+yardstick task start ${cyclictest_context_file}
+mv /tmp/yardstick.out  /opt/
diff --git a/ci/test_kvmfornfv.sh b/ci/test_kvmfornfv.sh
new file mode 100644
index 000000000..a33814907
--- /dev/null
+++ b/ci/test_kvmfornfv.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+#############################################################
+## This script  will launch ubuntu docker container
+## runs cyclictest through yardstick
+## and verifies the test results.
+############################################################
+
+volume=/tmp/kvmtest-*
+if [ -d ${volume} ] ; then
+    echo "Directory '${volume}' exists"
+    sudo rm -rf ${volume}
+fi
+
+time_stamp=$(date +%Y%m%d%H%M%S)
+mkdir -p /tmp/kvmtest-${time_stamp}/{image,rpm,scripts}
+
+#copying required files to run yardstick cyclic testcase
+mv ${WORKSPACE}/build_output/* $volume/rpm
+cp ${WORKSPACE}/ci/envs/* $volume/scripts
+cp ${WORKSPACE}/tests/cyclictest-node-context.yaml $volume
+cp ${WORKSPACE}/tests/pod.yaml $volume
+
+#Launching ubuntu docker container to run yardstick
+sudo docker run -t -i -v $volume:/opt --net=host --name kvmfornfv \
+kvmfornfv:latest  /bin/bash /opt/scripts/cyclictest.sh
+container_id=`sudo docker ps -a | grep kvmfornfv |awk '{print $1}'`
+sudo docker rm $container_id
+
+#Verifying the results of cyclictest
+result=`grep -o '"errors":[^,]*' $volume/yardstick.out | awk -F '"' \
+'{print $4}'| awk '{if (NF=0) print "SUCCESS" }'`
+if [ "$result" = "SUCCESS" ]; then
+    echo "####################################################"
+    echo ""
+    echo `grep -o '"data":[^}]*' $volume/yardstick.out | awk -F '{' '{print $2}'`
+    echo ""
+    echo "####################################################"
+    exit 0
+else
+    echo "Testcase failed"
+    echo `grep -o '"errors":[^,]*' $volume/yardstick.out | awk -F '"' '{print $4}'`
+    exit 1
+fi
diff --git a/tests/pod.yaml b/tests/pod.yaml
new file mode 100644
index 000000000..fa66ebf7f
--- /dev/null
+++ b/tests/pod.yaml
@@ -0,0 +1,18 @@
+---
+# sample config file about the POD information, including the
+# name/IP/user/ssh key
+#
+# The options of this config file include:
+# name: the name of this node
+# role: node's role, support role: Master/Controller/Comupte/BareMetal
+# ip: the node's IP address
+# user: the username for login
+# key_filename:the path of the private key file for login
+
+nodes:
+-
+    name: kvm
+    role: Controller
+    ip: 10.2.117.23
+    user: root
+    key_filename: /root/.ssh/id_rsa
-- 
2.16.6