These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
diff --git a/kernel/drivers/gpu/drm/i915/intel_ringbuffer.c b/kernel/drivers/gpu/drm/i915/intel_ringbuffer.c
index 005b5e0..f6b2a81 100644
@@ -81,7 +81,7 @@ bool intel_ring_stopped(struct intel_engine_cs *ring)
        return dev_priv->gpu_error.stop_rings & intel_ring_flag(ring);
 }
 
-void __intel_ring_advance(struct intel_engine_cs *ring)
+static void __intel_ring_advance(struct intel_engine_cs *ring)
 {
        struct intel_ringbuffer *ringbuf = ring->buffer;
        ringbuf->tail &= ringbuf->size - 1;
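        /* the ring size is a power of two, so this masks tail back into range */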
@@ -91,10 +91,11 @@ void __intel_ring_advance(struct intel_engine_cs *ring)
 }
 
 static int
-gen2_render_ring_flush(struct intel_engine_cs *ring,
+gen2_render_ring_flush(struct drm_i915_gem_request *req,
                       u32      invalidate_domains,
                       u32      flush_domains)
 {
+       struct intel_engine_cs *ring = req->ring;
        u32 cmd;
        int ret;
 
@@ -105,7 +106,7 @@ gen2_render_ring_flush(struct intel_engine_cs *ring,
        if (invalidate_domains & I915_GEM_DOMAIN_SAMPLER)
                cmd |= MI_READ_FLUSH;
 
-       ret = intel_ring_begin(ring, 2);
+       ret = intel_ring_begin(req, 2);
        if (ret)
                return ret;
 
@@ -117,10 +118,11 @@ gen2_render_ring_flush(struct intel_engine_cs *ring,
 }
 
 static int
-gen4_render_ring_flush(struct intel_engine_cs *ring,
+gen4_render_ring_flush(struct drm_i915_gem_request *req,
                       u32      invalidate_domains,
                       u32      flush_domains)
 {
+       struct intel_engine_cs *ring = req->ring;
        struct drm_device *dev = ring->dev;
        u32 cmd;
        int ret;
@@ -163,7 +165,7 @@ gen4_render_ring_flush(struct intel_engine_cs *ring,
            (IS_G4X(dev) || IS_GEN5(dev)))
                cmd |= MI_INVALIDATE_ISP;
 
-       ret = intel_ring_begin(ring, 2);
+       ret = intel_ring_begin(req, 2);
        if (ret)
                return ret;
 
@@ -212,13 +214,13 @@ gen4_render_ring_flush(struct intel_engine_cs *ring,
  * really our business.  That leaves only stall at scoreboard.
  */
 static int
-intel_emit_post_sync_nonzero_flush(struct intel_engine_cs *ring)
+intel_emit_post_sync_nonzero_flush(struct drm_i915_gem_request *req)
 {
+       struct intel_engine_cs *ring = req->ring;
        u32 scratch_addr = ring->scratch.gtt_offset + 2 * CACHELINE_BYTES;
        int ret;
 
-
-       ret = intel_ring_begin(ring, 6);
+       ret = intel_ring_begin(req, 6);
        if (ret)
                return ret;
 
@@ -231,7 +233,7 @@ intel_emit_post_sync_nonzero_flush(struct intel_engine_cs *ring)
        intel_ring_emit(ring, MI_NOOP);
        intel_ring_advance(ring);
 
-       ret = intel_ring_begin(ring, 6);
+       ret = intel_ring_begin(req, 6);
        if (ret)
                return ret;
 
@@ -247,15 +249,16 @@ intel_emit_post_sync_nonzero_flush(struct intel_engine_cs *ring)
 }
 
 static int
-gen6_render_ring_flush(struct intel_engine_cs *ring,
-                         u32 invalidate_domains, u32 flush_domains)
+gen6_render_ring_flush(struct drm_i915_gem_request *req,
+                      u32 invalidate_domains, u32 flush_domains)
 {
+       struct intel_engine_cs *ring = req->ring;
        u32 flags = 0;
        u32 scratch_addr = ring->scratch.gtt_offset + 2 * CACHELINE_BYTES;
        int ret;
 
        /* Force SNB workarounds for PIPE_CONTROL flushes */
-       ret = intel_emit_post_sync_nonzero_flush(ring);
+       ret = intel_emit_post_sync_nonzero_flush(req);
        if (ret)
                return ret;
 
@@ -285,7 +288,7 @@ gen6_render_ring_flush(struct intel_engine_cs *ring,
                flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
        }
 
-       ret = intel_ring_begin(ring, 4);
+       ret = intel_ring_begin(req, 4);
        if (ret)
                return ret;
 
@@ -299,11 +302,12 @@ gen6_render_ring_flush(struct intel_engine_cs *ring,
 }
 
 static int
-gen7_render_ring_cs_stall_wa(struct intel_engine_cs *ring)
+gen7_render_ring_cs_stall_wa(struct drm_i915_gem_request *req)
 {
+       struct intel_engine_cs *ring = req->ring;
        int ret;
 
-       ret = intel_ring_begin(ring, 4);
+       ret = intel_ring_begin(req, 4);
        if (ret)
                return ret;
 
@@ -318,9 +322,10 @@ gen7_render_ring_cs_stall_wa(struct intel_engine_cs *ring)
 }
 
 static int
-gen7_render_ring_flush(struct intel_engine_cs *ring,
+gen7_render_ring_flush(struct drm_i915_gem_request *req,
                       u32 invalidate_domains, u32 flush_domains)
 {
+       struct intel_engine_cs *ring = req->ring;
        u32 flags = 0;
        u32 scratch_addr = ring->scratch.gtt_offset + 2 * CACHELINE_BYTES;
        int ret;
@@ -342,6 +347,8 @@ gen7_render_ring_flush(struct intel_engine_cs *ring,
        if (flush_domains) {
                flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
+               flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
+               flags |= PIPE_CONTROL_FLUSH_ENABLE;
        }
        if (invalidate_domains) {
                flags |= PIPE_CONTROL_TLB_INVALIDATE;
@@ -362,10 +369,10 @@ gen7_render_ring_flush(struct intel_engine_cs *ring,
                /* Workaround: we must issue a pipe_control with CS-stall bit
                 * set before a pipe_control command that has the state cache
                 * invalidate bit set. */
-               gen7_render_ring_cs_stall_wa(ring);
+               gen7_render_ring_cs_stall_wa(req);
        }
 
-       ret = intel_ring_begin(ring, 4);
+       ret = intel_ring_begin(req, 4);
        if (ret)
                return ret;
 
@@ -379,12 +386,13 @@ gen7_render_ring_flush(struct intel_engine_cs *ring,
 }
 
 static int
-gen8_emit_pipe_control(struct intel_engine_cs *ring,
+gen8_emit_pipe_control(struct drm_i915_gem_request *req,
                       u32 flags, u32 scratch_addr)
 {
+       struct intel_engine_cs *ring = req->ring;
        int ret;
 
-       ret = intel_ring_begin(ring, 6);
+       ret = intel_ring_begin(req, 6);
        if (ret)
                return ret;
 
@@ -400,11 +408,11 @@ gen8_emit_pipe_control(struct intel_engine_cs *ring,
 }
 
 static int
-gen8_render_ring_flush(struct intel_engine_cs *ring,
+gen8_render_ring_flush(struct drm_i915_gem_request *req,
                       u32 invalidate_domains, u32 flush_domains)
 {
        u32 flags = 0;
-       u32 scratch_addr = ring->scratch.gtt_offset + 2 * CACHELINE_BYTES;
+       u32 scratch_addr = req->ring->scratch.gtt_offset + 2 * CACHELINE_BYTES;
        int ret;
 
        flags |= PIPE_CONTROL_CS_STALL;
@@ -412,6 +420,8 @@ gen8_render_ring_flush(struct intel_engine_cs *ring,
        if (flush_domains) {
                flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
+               flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
+               flags |= PIPE_CONTROL_FLUSH_ENABLE;
        }
        if (invalidate_domains) {
                flags |= PIPE_CONTROL_TLB_INVALIDATE;
@@ -424,7 +434,7 @@ gen8_render_ring_flush(struct intel_engine_cs *ring,
                flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
 
                /* WaCsStallBeforeStateCacheInvalidate:bdw,chv */
-               ret = gen8_emit_pipe_control(ring,
+               ret = gen8_emit_pipe_control(req,
                                             PIPE_CONTROL_CS_STALL |
                                             PIPE_CONTROL_STALL_AT_SCOREBOARD,
                                             0);
@@ -432,7 +442,7 @@ gen8_render_ring_flush(struct intel_engine_cs *ring,
                        return ret;
        }
 
-       return gen8_emit_pipe_control(ring, flags, scratch_addr);
+       return gen8_emit_pipe_control(req, flags, scratch_addr);
 }
 
 static void ring_write_tail(struct intel_engine_cs *ring,
@@ -703,23 +713,23 @@ err:
        return ret;
 }
 
-static int intel_ring_workarounds_emit(struct intel_engine_cs *ring,
-                                      struct intel_context *ctx)
+static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req)
 {
        int ret, i;
+       struct intel_engine_cs *ring = req->ring;
        struct drm_device *dev = ring->dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
        struct i915_workarounds *w = &dev_priv->workarounds;
 
-       if (WARN_ON_ONCE(w->count == 0))
+       if (w->count == 0)
                return 0;
 
        ring->gpu_caches_dirty = true;
-       ret = intel_ring_flush_all_caches(ring);
+       ret = intel_ring_flush_all_caches(req);
        if (ret)
                return ret;
 
-       ret = intel_ring_begin(ring, (w->count * 2 + 2));
+       ret = intel_ring_begin(req, (w->count * 2 + 2));
        if (ret)
                return ret;
 
@@ -733,7 +743,7 @@ static int intel_ring_workarounds_emit(struct intel_engine_cs *ring,
        intel_ring_advance(ring);
 
        ring->gpu_caches_dirty = true;
-       ret = intel_ring_flush_all_caches(ring);
+       ret = intel_ring_flush_all_caches(req);
        if (ret)
                return ret;
 
@@ -742,16 +752,15 @@ static int intel_ring_workarounds_emit(struct intel_engine_cs *ring,
        return 0;
 }
 
-static int intel_rcs_ctx_init(struct intel_engine_cs *ring,
-                             struct intel_context *ctx)
+static int intel_rcs_ctx_init(struct drm_i915_gem_request *req)
 {
        int ret;
 
-       ret = intel_ring_workarounds_emit(ring, ctx);
+       ret = intel_ring_workarounds_emit(req);
        if (ret != 0)
                return ret;
 
-       ret = i915_gem_render_state_init(ring);
+       ret = i915_gem_render_state_init(req);
        if (ret)
                DRM_ERROR("init render state: %d\n", ret);
 
@@ -775,11 +784,11 @@ static int wa_add(struct drm_i915_private *dev_priv,
        return 0;
 }
 
-#define WA_REG(addr, mask, val) { \
+#define WA_REG(addr, mask, val) do { \
                const int r = wa_add(dev_priv, (addr), (mask), (val)); \
                if (r) \
                        return r; \
-       }
+       } while (0)
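+/* The do/while(0) wrapper makes WA_REG expand to a single statement,
+ * so the embedded 'if (r) return r;' cannot break an unbraced
+ * if/else at a call site.
+ */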
 
 #define WA_SET_BIT_MASKED(addr, mask) \
        WA_REG(addr, (mask), _MASKED_BIT_ENABLE(mask))
@@ -795,37 +804,29 @@ static int wa_add(struct drm_i915_private *dev_priv,
 
 #define WA_WRITE(addr, val) WA_REG(addr, 0xffffffff, val)
 
-static int bdw_init_workarounds(struct intel_engine_cs *ring)
+static int gen8_init_workarounds(struct intel_engine_cs *ring)
 {
        struct drm_device *dev = ring->dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
 
-       /* WaDisablePartialInstShootdown:bdw */
-       /* WaDisableThreadStallDopClockGating:bdw (pre-production) */
-       WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
-                         PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE |
-                         STALL_DOP_GATING_DISABLE);
+       WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING);
 
-       /* WaDisableDopClockGating:bdw */
-       WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2,
-                         DOP_CLOCK_GATING_DISABLE);
+       /* WaDisableAsyncFlipPerfMode:bdw,chv */
+       WA_SET_BIT_MASKED(MI_MODE, ASYNC_FLIP_PERF_DISABLE);
 
-       WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
-                         GEN8_SAMPLER_POWER_BYPASS_DIS);
+       /* WaDisablePartialInstShootdown:bdw,chv */
+       WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
+                         PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
 
        /* Use Force Non-Coherent whenever executing a 3D context. This is a
         * workaround for a possible hang in the unlikely event a TLB
         * invalidation occurs during a PSD flush.
         */
+       /* WaForceEnableNonCoherent:bdw,chv */
+       /* WaHdcDisableFetchWhenMasked:bdw,chv */
        WA_SET_BIT_MASKED(HDC_CHICKEN0,
-                         /* WaForceEnableNonCoherent:bdw */
-                         HDC_FORCE_NON_COHERENT |
-                         /* WaForceContextSaveRestoreNonCoherent:bdw */
-                         HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
-                         /* WaHdcDisableFetchWhenMasked:bdw */
                          HDC_DONOT_FETCH_MEM_WHEN_MASKED |
-                         /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
-                         (IS_BDW_GT3(dev) ? HDC_FENCE_DEST_SLM_DISABLE : 0));
+                         HDC_FORCE_NON_COHERENT);
 
        /* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
         * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
@@ -833,13 +834,12 @@ static int bdw_init_workarounds(struct intel_engine_cs *ring)
         *  stalling waiting for the earlier ones to write to Hierarchical Z
         *  buffer."
         *
-        * This optimization is off by default for Broadwell; turn it on.
+        * This optimization is off by default for BDW and CHV; turn it on.
         */
        WA_CLR_BIT_MASKED(CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
 
-       /* Wa4x4STCOptimizationDisable:bdw */
-       WA_SET_BIT_MASKED(CACHE_MODE_1,
-                         GEN8_4x4_STC_OPTIMIZATION_DISABLE);
+       /* Wa4x4STCOptimizationDisable:bdw,chv */
+       WA_SET_BIT_MASKED(CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);
 
        /*
         * BSpec recommends 8x4 when MSAA is used,
@@ -856,51 +856,51 @@ static int bdw_init_workarounds(struct intel_engine_cs *ring)
        return 0;
 }
 
-static int chv_init_workarounds(struct intel_engine_cs *ring)
+static int bdw_init_workarounds(struct intel_engine_cs *ring)
 {
+       int ret;
        struct drm_device *dev = ring->dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
 
-       /* WaDisablePartialInstShootdown:chv */
-       /* WaDisableThreadStallDopClockGating:chv */
-       WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
-                         PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE |
-                         STALL_DOP_GATING_DISABLE);
+       ret = gen8_init_workarounds(ring);
+       if (ret)
+               return ret;
+
+       /* WaDisableThreadStallDopClockGating:bdw (pre-production) */
+       WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
+
+       /* WaDisableDopClockGating:bdw */
+       WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2,
+                         DOP_CLOCK_GATING_DISABLE);
+
+       WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
+                         GEN8_SAMPLER_POWER_BYPASS_DIS);
 
-       /* Use Force Non-Coherent whenever executing a 3D context. This is a
-        * workaround for a possible hang in the unlikely event a TLB
-        * invalidation occurs during a PSD flush.
-        */
-       /* WaForceEnableNonCoherent:chv */
-       /* WaHdcDisableFetchWhenMasked:chv */
        WA_SET_BIT_MASKED(HDC_CHICKEN0,
-                         HDC_FORCE_NON_COHERENT |
-                         HDC_DONOT_FETCH_MEM_WHEN_MASKED);
+                         /* WaForceContextSaveRestoreNonCoherent:bdw */
+                         HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
+                         /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
+                         (IS_BDW_GT3(dev) ? HDC_FENCE_DEST_SLM_DISABLE : 0));
 
-       /* According to the CACHE_MODE_0 default value documentation, some
-        * CHV platforms disable this optimization by default.  Turn it on.
-        */
-       WA_CLR_BIT_MASKED(CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
+       return 0;
+}
+
+static int chv_init_workarounds(struct intel_engine_cs *ring)
+{
+       int ret;
+       struct drm_device *dev = ring->dev;
+       struct drm_i915_private *dev_priv = dev->dev_private;
 
-       /* Wa4x4STCOptimizationDisable:chv */
-       WA_SET_BIT_MASKED(CACHE_MODE_1,
-                         GEN8_4x4_STC_OPTIMIZATION_DISABLE);
+       ret = gen8_init_workarounds(ring);
+       if (ret)
+               return ret;
+
+       /* WaDisableThreadStallDopClockGating:chv */
+       WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
 
        /* Improve HiZ throughput on CHV. */
        WA_SET_BIT_MASKED(HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);
 
-       /*
-        * BSpec recommends 8x4 when MSAA is used,
-        * however in practice 16x4 seems fastest.
-        *
-        * Note that PS/WM thread counts depend on the WIZ hashing
-        * disable bit, which we don't touch here, but it's good
-        * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
-        */
-       WA_SET_FIELD_MASKED(GEN7_GT_MODE,
-                           GEN6_WIZ_HASHING_MASK,
-                           GEN6_WIZ_HASHING_16x4);
-
        return 0;
 }
 
@@ -908,57 +908,83 @@ static int gen9_init_workarounds(struct intel_engine_cs *ring)
 {
        struct drm_device *dev = ring->dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
+       uint32_t tmp;
+
+       /* WaEnableLbsSlaRetryTimerDecrement:skl */
+       I915_WRITE(BDW_SCRATCH1, I915_READ(BDW_SCRATCH1) |
+                  GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE);
+
+       /* WaDisableKillLogic:bxt,skl */
+       I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) |
+                  ECOCHK_DIS_TLB);
 
-       /* WaDisablePartialInstShootdown:skl */
+       /* WaDisablePartialInstShootdown:skl,bxt */
        WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
                          PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
 
-       /* Syncing dependencies between camera and graphics */
+       /* Syncing dependencies between camera and graphics:skl,bxt */
        WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
                          GEN9_DISABLE_OCL_OOB_SUPPRESS_LOGIC);
 
-       if (INTEL_REVID(dev) == SKL_REVID_A0 ||
-           INTEL_REVID(dev) == SKL_REVID_B0) {
-               /* WaDisableDgMirrorFixInHalfSliceChicken5:skl */
+       if ((IS_SKYLAKE(dev) && (INTEL_REVID(dev) == SKL_REVID_A0 ||
+           INTEL_REVID(dev) == SKL_REVID_B0)) ||
+           (IS_BROXTON(dev) && INTEL_REVID(dev) < BXT_REVID_B0)) {
+               /* WaDisableDgMirrorFixInHalfSliceChicken5:skl,bxt */
                WA_CLR_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN5,
                                  GEN9_DG_MIRROR_FIX_ENABLE);
        }
 
-       if (IS_SKYLAKE(dev) && INTEL_REVID(dev) <= SKL_REVID_B0) {
-               /* WaSetDisablePixMaskCammingAndRhwoInCommonSliceChicken:skl */
+       if ((IS_SKYLAKE(dev) && INTEL_REVID(dev) <= SKL_REVID_B0) ||
+           (IS_BROXTON(dev) && INTEL_REVID(dev) < BXT_REVID_B0)) {
+               /* WaSetDisablePixMaskCammingAndRhwoInCommonSliceChicken:skl,bxt */
                WA_SET_BIT_MASKED(GEN7_COMMON_SLICE_CHICKEN1,
                                  GEN9_RHWO_OPTIMIZATION_DISABLE);
-               WA_SET_BIT_MASKED(GEN9_SLICE_COMMON_ECO_CHICKEN0,
-                                 DISABLE_PIXEL_MASK_CAMMING);
+               /*
+                * WA also requires GEN9_SLICE_COMMON_ECO_CHICKEN0[14:14] to be set
+                * but we do that in the per-ctx batchbuffer as there is an issue
+                * with this register not getting restored on ctx restore
+                */
        }
 
-       if (INTEL_REVID(dev) >= SKL_REVID_C0) {
-               /* WaEnableYV12BugFixInHalfSliceChicken7:skl */
+       if ((IS_SKYLAKE(dev) && INTEL_REVID(dev) >= SKL_REVID_C0) ||
+           IS_BROXTON(dev)) {
+               /* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt */
                WA_SET_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN7,
                                  GEN9_ENABLE_YV12_BUGFIX);
        }
 
-       if (INTEL_REVID(dev) <= SKL_REVID_D0) {
-               /*
-                *Use Force Non-Coherent whenever executing a 3D context. This
-                * is a workaround for a possible hang in the unlikely event
-                * a TLB invalidation occurs during a PSD flush.
-                */
-               /* WaForceEnableNonCoherent:skl */
-               WA_SET_BIT_MASKED(HDC_CHICKEN0,
-                                 HDC_FORCE_NON_COHERENT);
-       }
-
-       /* Wa4x4STCOptimizationDisable:skl */
-       WA_SET_BIT_MASKED(CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);
-
-       /* WaDisablePartialResolveInVc:skl */
-       WA_SET_BIT_MASKED(CACHE_MODE_1, GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE);
+       /* Wa4x4STCOptimizationDisable:skl,bxt */
+       /* WaDisablePartialResolveInVc:skl,bxt */
+       WA_SET_BIT_MASKED(CACHE_MODE_1, (GEN8_4x4_STC_OPTIMIZATION_DISABLE |
+                                        GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE));
 
-       /* WaCcsTlbPrefetchDisable:skl */
+       /* WaCcsTlbPrefetchDisable:skl,bxt */
        WA_CLR_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN5,
                          GEN9_CCS_TLB_PREFETCH_ENABLE);
 
+       /* WaDisableMaskBasedCammingInRCC:skl,bxt */
+       if ((IS_SKYLAKE(dev) && INTEL_REVID(dev) == SKL_REVID_C0) ||
+           (IS_BROXTON(dev) && INTEL_REVID(dev) < BXT_REVID_B0))
+               WA_SET_BIT_MASKED(SLICE_ECO_CHICKEN0,
+                                 PIXEL_MASK_CAMMING_DISABLE);
+
+       /* WaForceContextSaveRestoreNonCoherent:skl,bxt */
+       tmp = HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT;
+       if ((IS_SKYLAKE(dev) && INTEL_REVID(dev) == SKL_REVID_F0) ||
+           (IS_BROXTON(dev) && INTEL_REVID(dev) >= BXT_REVID_B0))
+               tmp |= HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE;
+       WA_SET_BIT_MASKED(HDC_CHICKEN0, tmp);
+
+       /* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt */
+       if (IS_SKYLAKE(dev) ||
+           (IS_BROXTON(dev) && INTEL_REVID(dev) <= BXT_REVID_B0)) {
+               WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
+                                 GEN8_SAMPLER_POWER_BYPASS_DIS);
+       }
+
+       /* WaDisableSTUnitPowerOptimization:skl,bxt */
+       WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);
+
        return 0;
 }
 
@@ -1004,19 +1030,56 @@ static int skl_tune_iz_hashing(struct intel_engine_cs *ring)
        return 0;
 }
 
-
 static int skl_init_workarounds(struct intel_engine_cs *ring)
 {
+       int ret;
        struct drm_device *dev = ring->dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
 
-       gen9_init_workarounds(ring);
+       ret = gen9_init_workarounds(ring);
+       if (ret)
+               return ret;
+
+       if (INTEL_REVID(dev) <= SKL_REVID_D0) {
+               /* WaDisableHDCInvalidation:skl */
+               I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) |
+                          BDW_DISABLE_HDC_INVALIDATION);
+
+               /* WaDisableChickenBitTSGBarrierAckForFFSliceCS:skl */
+               I915_WRITE(FF_SLICE_CS_CHICKEN2,
+                          _MASKED_BIT_ENABLE(GEN9_TSG_BARRIER_ACK_DISABLE));
+       }
+
+       /* GEN8_L3SQCREG4 has a dependency with WA batch so any new changes
+        * involving this register should also be added to WA batch as required.
+        */
+       if (INTEL_REVID(dev) <= SKL_REVID_E0)
+               /* WaDisableLSQCROPERFforOCL:skl */
+               I915_WRITE(GEN8_L3SQCREG4, I915_READ(GEN8_L3SQCREG4) |
+                          GEN8_LQSC_RO_PERF_DIS);
+
+       /* WaEnableGapsTsvCreditFix:skl */
+       if (IS_SKYLAKE(dev) && (INTEL_REVID(dev) >= SKL_REVID_C0)) {
+               I915_WRITE(GEN8_GARBCNTL, (I915_READ(GEN8_GARBCNTL) |
+                                          GEN9_GAPS_TSV_CREDIT_DISABLE));
+       }
 
        /* WaDisablePowerCompilerClockGating:skl */
        if (INTEL_REVID(dev) == SKL_REVID_B0)
                WA_SET_BIT_MASKED(HIZ_CHICKEN,
                                  BDW_HIZ_POWER_COMPILER_CLOCK_GATING_DISABLE);
 
+       if (INTEL_REVID(dev) <= SKL_REVID_D0) {
+               /*
+                * Use Force Non-Coherent whenever executing a 3D context. This
+                * is a workaround for a possible hang in the unlikely event
+                * a TLB invalidation occurs during a PSD flush.
+                */
+               /* WaForceEnableNonCoherent:skl */
+               WA_SET_BIT_MASKED(HDC_CHICKEN0,
+                                 HDC_FORCE_NON_COHERENT);
+       }
+
        if (INTEL_REVID(dev) == SKL_REVID_C0 ||
            INTEL_REVID(dev) == SKL_REVID_D0)
                /* WaBarrierPerformanceFixDisable:skl */
@@ -1024,9 +1087,51 @@ static int skl_init_workarounds(struct intel_engine_cs *ring)
                                  HDC_FENCE_DEST_SLM_DISABLE |
                                  HDC_BARRIER_PERFORMANCE_DISABLE);
 
+       /* WaDisableSbeCacheDispatchPortSharing:skl */
+       if (INTEL_REVID(dev) <= SKL_REVID_F0) {
+               WA_SET_BIT_MASKED(
+                       GEN7_HALF_SLICE_CHICKEN1,
+                       GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
+       }
+
        return skl_tune_iz_hashing(ring);
 }
 
+static int bxt_init_workarounds(struct intel_engine_cs *ring)
+{
+       int ret;
+       struct drm_device *dev = ring->dev;
+       struct drm_i915_private *dev_priv = dev->dev_private;
+
+       ret = gen9_init_workarounds(ring);
+       if (ret)
+               return ret;
+
+       /* WaStoreMultiplePTEenable:bxt */
+       /* This is a requirement according to Hardware specification */
+       if (INTEL_REVID(dev) == BXT_REVID_A0)
+               I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_TLBPF);
+
+       /* WaSetClckGatingDisableMedia:bxt */
+       if (INTEL_REVID(dev) == BXT_REVID_A0) {
+               I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) &
+                                           ~GEN8_DOP_CLOCK_GATE_MEDIA_ENABLE));
+       }
+
+       /* WaDisableThreadStallDopClockGating:bxt */
+       WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
+                         STALL_DOP_GATING_DISABLE);
+
+       /* WaDisableSbeCacheDispatchPortSharing:bxt */
+       if (INTEL_REVID(dev) <= BXT_REVID_B0) {
+               WA_SET_BIT_MASKED(
+                       GEN7_HALF_SLICE_CHICKEN1,
+                       GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
+       }
+
+       return 0;
+}
+
 int init_workarounds_ring(struct intel_engine_cs *ring)
 {
        struct drm_device *dev = ring->dev;
@@ -1044,8 +1149,9 @@ int init_workarounds_ring(struct intel_engine_cs *ring)
 
        if (IS_SKYLAKE(dev))
                return skl_init_workarounds(ring);
-       else if (IS_GEN9(dev))
-               return gen9_init_workarounds(ring);
+
+       if (IS_BROXTON(dev))
+               return bxt_init_workarounds(ring);
 
        return 0;
 }
@@ -1066,9 +1172,9 @@ static int init_render_ring(struct intel_engine_cs *ring)
         * to use MI_WAIT_FOR_EVENT within the CS. It should already be
         * programmed to '1' on all products.
         *
-        * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv,bdw,chv
+        * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
         */
-       if (INTEL_INFO(dev)->gen >= 6 && INTEL_INFO(dev)->gen < 9)
+       if (INTEL_INFO(dev)->gen >= 6 && INTEL_INFO(dev)->gen < 8)
                I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE));
 
        /* Required for the hardware to program scanline values for waiting */
@@ -1093,7 +1199,7 @@ static int init_render_ring(struct intel_engine_cs *ring)
                           _MASKED_BIT_DISABLE(CM0_STC_EVICT_DISABLE_LRA_SNB));
        }
 
-       if (INTEL_INFO(dev)->gen >= 6)
+       if (INTEL_INFO(dev)->gen >= 6 && INTEL_INFO(dev)->gen < 8)
                I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
 
        if (HAS_L3_DPF(dev))
@@ -1116,10 +1222,11 @@ static void render_ring_cleanup(struct intel_engine_cs *ring)
        intel_fini_pipe_control(ring);
 }
 
-static int gen8_rcs_signal(struct intel_engine_cs *signaller,
+static int gen8_rcs_signal(struct drm_i915_gem_request *signaller_req,
                           unsigned int num_dwords)
 {
 #define MBOX_UPDATE_DWORDS 8
+       struct intel_engine_cs *signaller = signaller_req->ring;
        struct drm_device *dev = signaller->dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
        struct intel_engine_cs *waiter;
@@ -1129,7 +1236,7 @@ static int gen8_rcs_signal(struct intel_engine_cs *signaller,
        num_dwords += (num_rings-1) * MBOX_UPDATE_DWORDS;
 #undef MBOX_UPDATE_DWORDS
 
-       ret = intel_ring_begin(signaller, num_dwords);
+       ret = intel_ring_begin(signaller_req, num_dwords);
        if (ret)
                return ret;
 
@@ -1139,8 +1246,7 @@ static int gen8_rcs_signal(struct intel_engine_cs *signaller,
                if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
                        continue;
 
-               seqno = i915_gem_request_get_seqno(
-                                          signaller->outstanding_lazy_request);
+               seqno = i915_gem_request_get_seqno(signaller_req);
                intel_ring_emit(signaller, GFX_OP_PIPE_CONTROL(6));
                intel_ring_emit(signaller, PIPE_CONTROL_GLOBAL_GTT_IVB |
                                           PIPE_CONTROL_QW_WRITE |
@@ -1157,10 +1263,11 @@ static int gen8_rcs_signal(struct intel_engine_cs *signaller,
        return 0;
 }
 
-static int gen8_xcs_signal(struct intel_engine_cs *signaller,
+static int gen8_xcs_signal(struct drm_i915_gem_request *signaller_req,
                           unsigned int num_dwords)
 {
 #define MBOX_UPDATE_DWORDS 6
+       struct intel_engine_cs *signaller = signaller_req->ring;
        struct drm_device *dev = signaller->dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
        struct intel_engine_cs *waiter;
@@ -1170,7 +1277,7 @@ static int gen8_xcs_signal(struct intel_engine_cs *signaller,
        num_dwords += (num_rings-1) * MBOX_UPDATE_DWORDS;
 #undef MBOX_UPDATE_DWORDS
 
-       ret = intel_ring_begin(signaller, num_dwords);
+       ret = intel_ring_begin(signaller_req, num_dwords);
        if (ret)
                return ret;
 
@@ -1180,8 +1287,7 @@ static int gen8_xcs_signal(struct intel_engine_cs *signaller,
                if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
                        continue;
 
-               seqno = i915_gem_request_get_seqno(
-                                          signaller->outstanding_lazy_request);
+               seqno = i915_gem_request_get_seqno(signaller_req);
                intel_ring_emit(signaller, (MI_FLUSH_DW + 1) |
                                           MI_FLUSH_DW_OP_STOREDW);
                intel_ring_emit(signaller, lower_32_bits(gtt_offset) |
@@ -1196,9 +1302,10 @@ static int gen8_xcs_signal(struct intel_engine_cs *signaller,
        return 0;
 }
 
-static int gen6_signal(struct intel_engine_cs *signaller,
+static int gen6_signal(struct drm_i915_gem_request *signaller_req,
                       unsigned int num_dwords)
 {
+       struct intel_engine_cs *signaller = signaller_req->ring;
        struct drm_device *dev = signaller->dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
        struct intel_engine_cs *useless;
@@ -1209,15 +1316,14 @@ static int gen6_signal(struct intel_engine_cs *signaller,
        num_dwords += round_up((num_rings-1) * MBOX_UPDATE_DWORDS, 2);
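        /* rounded up to an even dword count so the ring tail stays qword-aligned */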
 #undef MBOX_UPDATE_DWORDS
 
-       ret = intel_ring_begin(signaller, num_dwords);
+       ret = intel_ring_begin(signaller_req, num_dwords);
        if (ret)
                return ret;
 
        for_each_ring(useless, dev_priv, i) {
                u32 mbox_reg = signaller->semaphore.mbox.signal[i];
                if (mbox_reg != GEN6_NOSYNC) {
-                       u32 seqno = i915_gem_request_get_seqno(
-                                          signaller->outstanding_lazy_request);
+                       u32 seqno = i915_gem_request_get_seqno(signaller_req);
                        intel_ring_emit(signaller, MI_LOAD_REGISTER_IMM(1));
                        intel_ring_emit(signaller, mbox_reg);
                        intel_ring_emit(signaller, seqno);
@@ -1233,30 +1339,29 @@ static int gen6_signal(struct intel_engine_cs *signaller,
 
 /**
  * gen6_add_request - Update the semaphore mailbox registers
- * 
- * @ring - ring that is adding a request
- * @seqno - return seqno stuck into the ring
+ *
+ * @request - request to write to the ring
  *
  * Update the mailbox registers in the *other* rings with the current seqno.
  * This acts like a signal in the canonical semaphore.
  */
 static int
-gen6_add_request(struct intel_engine_cs *ring)
+gen6_add_request(struct drm_i915_gem_request *req)
 {
+       struct intel_engine_cs *ring = req->ring;
        int ret;
 
        if (ring->semaphore.signal)
-               ret = ring->semaphore.signal(ring, 4);
+               ret = ring->semaphore.signal(req, 4);
        else
-               ret = intel_ring_begin(ring, 4);
+               ret = intel_ring_begin(req, 4);
 
        if (ret)
                return ret;
 
        intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
        intel_ring_emit(ring, I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
-       intel_ring_emit(ring,
-                   i915_gem_request_get_seqno(ring->outstanding_lazy_request));
+       intel_ring_emit(ring, i915_gem_request_get_seqno(req));
        intel_ring_emit(ring, MI_USER_INTERRUPT);
        __intel_ring_advance(ring);
 
@@ -1279,14 +1384,15 @@ static inline bool i915_gem_has_seqno_wrapped(struct drm_device *dev,
  */
 
 static int
-gen8_ring_sync(struct intel_engine_cs *waiter,
+gen8_ring_sync(struct drm_i915_gem_request *waiter_req,
               struct intel_engine_cs *signaller,
               u32 seqno)
 {
+       struct intel_engine_cs *waiter = waiter_req->ring;
        struct drm_i915_private *dev_priv = waiter->dev->dev_private;
        int ret;
 
-       ret = intel_ring_begin(waiter, 4);
+       ret = intel_ring_begin(waiter_req, 4);
        if (ret)
                return ret;
 
@@ -1304,10 +1410,11 @@ gen8_ring_sync(struct intel_engine_cs *waiter,
 }
 
 static int
-gen6_ring_sync(struct intel_engine_cs *waiter,
+gen6_ring_sync(struct drm_i915_gem_request *waiter_req,
               struct intel_engine_cs *signaller,
               u32 seqno)
 {
+       struct intel_engine_cs *waiter = waiter_req->ring;
        u32 dw1 = MI_SEMAPHORE_MBOX |
                  MI_SEMAPHORE_COMPARE |
                  MI_SEMAPHORE_REGISTER;
@@ -1322,7 +1429,7 @@ gen6_ring_sync(struct intel_engine_cs *waiter,
 
        WARN_ON(wait_mbox == MI_SEMAPHORE_SYNC_INVALID);
 
-       ret = intel_ring_begin(waiter, 4);
+       ret = intel_ring_begin(waiter_req, 4);
        if (ret)
                return ret;
 
@@ -1353,8 +1460,9 @@ do {                                                                      \
 } while (0)
 
 static int
-pc_render_add_request(struct intel_engine_cs *ring)
+pc_render_add_request(struct drm_i915_gem_request *req)
 {
+       struct intel_engine_cs *ring = req->ring;
        u32 scratch_addr = ring->scratch.gtt_offset + 2 * CACHELINE_BYTES;
        int ret;
 
@@ -1366,7 +1474,7 @@ pc_render_add_request(struct intel_engine_cs *ring)
         * incoherence by flushing the 6 PIPE_NOTIFY buffers out to
         * memory before requesting an interrupt.
         */
-       ret = intel_ring_begin(ring, 32);
+       ret = intel_ring_begin(req, 32);
        if (ret)
                return ret;
 
@@ -1374,8 +1482,7 @@ pc_render_add_request(struct intel_engine_cs *ring)
                        PIPE_CONTROL_WRITE_FLUSH |
                        PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
        intel_ring_emit(ring, ring->scratch.gtt_offset | PIPE_CONTROL_GLOBAL_GTT);
-       intel_ring_emit(ring,
-                   i915_gem_request_get_seqno(ring->outstanding_lazy_request));
+       intel_ring_emit(ring, i915_gem_request_get_seqno(req));
        intel_ring_emit(ring, 0);
        PIPE_CONTROL_FLUSH(ring, scratch_addr);
        scratch_addr += 2 * CACHELINE_BYTES; /* write to separate cachelines */
@@ -1394,8 +1501,7 @@ pc_render_add_request(struct intel_engine_cs *ring)
                        PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                        PIPE_CONTROL_NOTIFY);
        intel_ring_emit(ring, ring->scratch.gtt_offset | PIPE_CONTROL_GLOBAL_GTT);
-       intel_ring_emit(ring,
-                   i915_gem_request_get_seqno(ring->outstanding_lazy_request));
+       intel_ring_emit(ring, i915_gem_request_get_seqno(req));
        intel_ring_emit(ring, 0);
        __intel_ring_advance(ring);
 
@@ -1546,13 +1652,14 @@ i8xx_ring_put_irq(struct intel_engine_cs *ring)
 }
 
 static int
-bsd_ring_flush(struct intel_engine_cs *ring,
+bsd_ring_flush(struct drm_i915_gem_request *req,
               u32     invalidate_domains,
               u32     flush_domains)
 {
+       struct intel_engine_cs *ring = req->ring;
        int ret;
 
-       ret = intel_ring_begin(ring, 2);
+       ret = intel_ring_begin(req, 2);
        if (ret)
                return ret;
 
@@ -1563,18 +1670,18 @@ bsd_ring_flush(struct intel_engine_cs *ring,
 }
 
 static int
-i9xx_add_request(struct intel_engine_cs *ring)
+i9xx_add_request(struct drm_i915_gem_request *req)
 {
+       struct intel_engine_cs *ring = req->ring;
        int ret;
 
-       ret = intel_ring_begin(ring, 4);
+       ret = intel_ring_begin(req, 4);
        if (ret)
                return ret;
 
        intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
        intel_ring_emit(ring, I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
-       intel_ring_emit(ring,
-                   i915_gem_request_get_seqno(ring->outstanding_lazy_request));
+       intel_ring_emit(ring, i915_gem_request_get_seqno(req));
        intel_ring_emit(ring, MI_USER_INTERRUPT);
        __intel_ring_advance(ring);
 
@@ -1706,13 +1813,14 @@ gen8_ring_put_irq(struct intel_engine_cs *ring)
 }
 
 static int
-i965_dispatch_execbuffer(struct intel_engine_cs *ring,
+i965_dispatch_execbuffer(struct drm_i915_gem_request *req,
                         u64 offset, u32 length,
                         unsigned dispatch_flags)
 {
+       struct intel_engine_cs *ring = req->ring;
        int ret;
 
-       ret = intel_ring_begin(ring, 2);
+       ret = intel_ring_begin(req, 2);
        if (ret)
                return ret;
 
@@ -1732,14 +1840,15 @@ i965_dispatch_execbuffer(struct intel_engine_cs *ring,
 #define I830_TLB_ENTRIES (2)
 #define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT)
 static int
-i830_dispatch_execbuffer(struct intel_engine_cs *ring,
+i830_dispatch_execbuffer(struct drm_i915_gem_request *req,
                         u64 offset, u32 len,
                         unsigned dispatch_flags)
 {
+       struct intel_engine_cs *ring = req->ring;
        u32 cs_offset = ring->scratch.gtt_offset;
        int ret;
 
-       ret = intel_ring_begin(ring, 6);
+       ret = intel_ring_begin(req, 6);
        if (ret)
                return ret;
 
@@ -1756,7 +1865,7 @@ i830_dispatch_execbuffer(struct intel_engine_cs *ring,
                if (len > I830_BATCH_LIMIT)
                        return -ENOSPC;
 
-               ret = intel_ring_begin(ring, 6 + 2);
+               ret = intel_ring_begin(req, 6 + 2);
                if (ret)
                        return ret;
 
@@ -1779,7 +1888,7 @@ i830_dispatch_execbuffer(struct intel_engine_cs *ring,
                offset = cs_offset;
        }
 
-       ret = intel_ring_begin(ring, 4);
+       ret = intel_ring_begin(req, 4);
        if (ret)
                return ret;
 
@@ -1794,13 +1903,14 @@ i830_dispatch_execbuffer(struct intel_engine_cs *ring,
 }
 
 static int
-i915_dispatch_execbuffer(struct intel_engine_cs *ring,
+i915_dispatch_execbuffer(struct drm_i915_gem_request *req,
                         u64 offset, u32 len,
                         unsigned dispatch_flags)
 {
+       struct intel_engine_cs *ring = req->ring;
        int ret;
 
-       ret = intel_ring_begin(ring, 2);
+       ret = intel_ring_begin(req, 2);
        if (ret)
                return ret;
 
@@ -1928,14 +2038,14 @@ int intel_pin_and_map_ringbuffer_obj(struct drm_device *dev,
        return 0;
 }
 
-void intel_destroy_ringbuffer_obj(struct intel_ringbuffer *ringbuf)
+static void intel_destroy_ringbuffer_obj(struct intel_ringbuffer *ringbuf)
 {
        drm_gem_object_unreference(&ringbuf->obj->base);
        ringbuf->obj = NULL;
 }
 
-int intel_alloc_ringbuffer_obj(struct drm_device *dev,
-                              struct intel_ringbuffer *ringbuf)
+static int intel_alloc_ringbuffer_obj(struct drm_device *dev,
+                                     struct intel_ringbuffer *ringbuf)
 {
        struct drm_i915_gem_object *obj;
 
@@ -1955,6 +2065,48 @@ int intel_alloc_ringbuffer_obj(struct drm_device *dev,
        return 0;
 }
 
+struct intel_ringbuffer *
+intel_engine_create_ringbuffer(struct intel_engine_cs *engine, int size)
+{
+       struct intel_ringbuffer *ring;
+       int ret;
+
+       ring = kzalloc(sizeof(*ring), GFP_KERNEL);
+       if (ring == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       ring->ring = engine;
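+       /* NB: the ringbuffer's 'ring' member points back at the owning engine */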
+
+       ring->size = size;
+       /* Workaround an erratum on the i830 which causes a hang if
+        * the TAIL pointer points to within the last 2 cachelines
+        * of the buffer.
+        */
+       ring->effective_size = size;
+       if (IS_I830(engine->dev) || IS_845G(engine->dev))
+               ring->effective_size -= 2 * CACHELINE_BYTES;
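+               /* i.e. the final 2 * 64 = 128 bytes are left unused */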
+
+       ring->last_retired_head = -1;
+       intel_ring_update_space(ring);
+
+       ret = intel_alloc_ringbuffer_obj(engine->dev, ring);
+       if (ret) {
+               DRM_ERROR("Failed to allocate ringbuffer %s: %d\n",
+                         engine->name, ret);
+               kfree(ring);
+               return ERR_PTR(ret);
+       }
+
+       return ring;
+}
+
+void
+intel_ringbuffer_free(struct intel_ringbuffer *ring)
+{
+       intel_destroy_ringbuffer_obj(ring);
+       kfree(ring);
+}
+
 static int intel_init_ring_buffer(struct drm_device *dev,
                                  struct intel_engine_cs *ring)
 {
@@ -1963,21 +2115,20 @@ static int intel_init_ring_buffer(struct drm_device *dev,
 
        WARN_ON(ring->buffer);
 
-       ringbuf = kzalloc(sizeof(*ringbuf), GFP_KERNEL);
-       if (!ringbuf)
-               return -ENOMEM;
-       ring->buffer = ringbuf;
-
        ring->dev = dev;
        INIT_LIST_HEAD(&ring->active_list);
        INIT_LIST_HEAD(&ring->request_list);
        INIT_LIST_HEAD(&ring->execlist_queue);
-       ringbuf->size = 32 * PAGE_SIZE;
-       ringbuf->ring = ring;
+       i915_gem_batch_pool_init(dev, &ring->batch_pool);
        memset(ring->semaphore.sync_seqno, 0, sizeof(ring->semaphore.sync_seqno));
 
        init_waitqueue_head(&ring->irq_queue);
 
+       ringbuf = intel_engine_create_ringbuffer(ring, 32 * PAGE_SIZE);
+       if (IS_ERR(ringbuf))
+               return PTR_ERR(ringbuf);
+       ring->buffer = ringbuf;
+
        if (I915_NEED_GFX_HWS(dev)) {
                ret = init_status_page(ring);
                if (ret)
@@ -1989,15 +2140,6 @@ static int intel_init_ring_buffer(struct drm_device *dev,
                        goto error;
        }
 
-       WARN_ON(ringbuf->obj);
-
-       ret = intel_alloc_ringbuffer_obj(dev, ringbuf);
-       if (ret) {
-               DRM_ERROR("Failed to allocate ringbuffer %s: %d\n",
-                               ring->name, ret);
-               goto error;
-       }
-
        ret = intel_pin_and_map_ringbuffer_obj(dev, ringbuf);
        if (ret) {
                DRM_ERROR("Failed to pin and map ringbuffer %s: %d\n",
@@ -2006,14 +2148,6 @@ static int intel_init_ring_buffer(struct drm_device *dev,
                goto error;
        }
 
-       /* Workaround an erratum on the i830 which causes a hang if
-        * the TAIL pointer points to within the last 2 cachelines
-        * of the buffer.
-        */
-       ringbuf->effective_size = ringbuf->size;
-       if (IS_I830(dev) || IS_845G(dev))
-               ringbuf->effective_size -= 2 * CACHELINE_BYTES;
-
        ret = i915_cmd_parser_init_ring(ring);
        if (ret)
                goto error;
@@ -2021,7 +2155,7 @@ static int intel_init_ring_buffer(struct drm_device *dev,
        return 0;
 
 error:
-       kfree(ringbuf);
+       intel_ringbuffer_free(ringbuf);
        ring->buffer = NULL;
        return ret;
 }
@@ -2029,20 +2163,18 @@ error:
 void intel_cleanup_ring_buffer(struct intel_engine_cs *ring)
 {
        struct drm_i915_private *dev_priv;
-       struct intel_ringbuffer *ringbuf;
 
        if (!intel_ring_initialized(ring))
                return;
 
        dev_priv = to_i915(ring->dev);
-       ringbuf = ring->buffer;
 
        intel_stop_ring_buffer(ring);
        WARN_ON(!IS_GEN2(ring->dev) && (I915_READ_MODE(ring) & MODE_IDLE) == 0);
 
-       intel_unpin_ringbuffer_obj(ringbuf);
-       intel_destroy_ringbuffer_obj(ringbuf);
-       i915_gem_request_assign(&ring->outstanding_lazy_request, NULL);
+       intel_unpin_ringbuffer_obj(ring->buffer);
+       intel_ringbuffer_free(ring->buffer);
+       ring->buffer = NULL;
 
        if (ring->cleanup)
                ring->cleanup(ring);
@@ -2050,103 +2182,45 @@ void intel_cleanup_ring_buffer(struct intel_engine_cs *ring)
        cleanup_status_page(ring);
 
        i915_cmd_parser_fini_ring(ring);
-
-       kfree(ringbuf);
-       ring->buffer = NULL;
+       i915_gem_batch_pool_fini(&ring->batch_pool);
 }
 
-static int intel_ring_wait_request(struct intel_engine_cs *ring, int n)
+static int ring_wait_for_space(struct intel_engine_cs *ring, int n)
 {
        struct intel_ringbuffer *ringbuf = ring->buffer;
        struct drm_i915_gem_request *request;
+       unsigned space;
        int ret;
 
        if (intel_ring_space(ringbuf) >= n)
                return 0;
 
+       /* The whole point of reserving space is to not wait! */
+       WARN_ON(ringbuf->reserved_in_use);
+
        list_for_each_entry(request, &ring->request_list, list) {
-               if (__intel_ring_space(request->postfix, ringbuf->tail,
-                                      ringbuf->size) >= n) {
+               space = __intel_ring_space(request->postfix, ringbuf->tail,
+                                          ringbuf->size);
+               if (space >= n)
                        break;
-               }
        }
 
-       if (&request->list == &ring->request_list)
+       if (WARN_ON(&request->list == &ring->request_list))
                return -ENOSPC;
 
        ret = i915_wait_request(request);
        if (ret)
                return ret;
 
-       i915_gem_retire_requests_ring(ring);
-
+       ringbuf->space = space;
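+       /* that request has completed, so the space up to its postfix is now free */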
        return 0;
 }
 
-static int ring_wait_for_space(struct intel_engine_cs *ring, int n)
-{
-       struct drm_device *dev = ring->dev;
-       struct drm_i915_private *dev_priv = dev->dev_private;
-       struct intel_ringbuffer *ringbuf = ring->buffer;
-       unsigned long end;
-       int ret;
-
-       ret = intel_ring_wait_request(ring, n);
-       if (ret != -ENOSPC)
-               return ret;
-
-       /* force the tail write in case we have been skipping them */
-       __intel_ring_advance(ring);
-
-       /* With GEM the hangcheck timer should kick us out of the loop,
-        * leaving it early runs the risk of corrupting GEM state (due
-        * to running on almost untested codepaths). But on resume
-        * timers don't work yet, so prevent a complete hang in that
-        * case by choosing an insanely large timeout. */
-       end = jiffies + 60 * HZ;
-
-       ret = 0;
-       trace_i915_ring_wait_begin(ring);
-       do {
-               if (intel_ring_space(ringbuf) >= n)
-                       break;
-               ringbuf->head = I915_READ_HEAD(ring);
-               if (intel_ring_space(ringbuf) >= n)
-                       break;
-
-               msleep(1);
-
-               if (dev_priv->mm.interruptible && signal_pending(current)) {
-                       ret = -ERESTARTSYS;
-                       break;
-               }
-
-               ret = i915_gem_check_wedge(&dev_priv->gpu_error,
-                                          dev_priv->mm.interruptible);
-               if (ret)
-                       break;
-
-               if (time_after(jiffies, end)) {
-                       ret = -EBUSY;
-                       break;
-               }
-       } while (1);
-       trace_i915_ring_wait_end(ring);
-       return ret;
-}
-
-static int intel_wrap_ring_buffer(struct intel_engine_cs *ring)
+static void __wrap_ring_buffer(struct intel_ringbuffer *ringbuf)
 {
        uint32_t __iomem *virt;
-       struct intel_ringbuffer *ringbuf = ring->buffer;
        int rem = ringbuf->size - ringbuf->tail;
 
-       if (ringbuf->space < rem) {
-               int ret = ring_wait_for_space(ring, rem);
-               if (ret)
-                       return ret;
-       }
-
        virt = ringbuf->virtual_start + ringbuf->tail;
        rem /= 4;
        while (rem--)
@@ -2154,89 +2228,153 @@ static int intel_wrap_ring_buffer(struct intel_engine_cs *ring)
 
        ringbuf->tail = 0;
        intel_ring_update_space(ringbuf);
-
-       return 0;
 }
 
 int intel_ring_idle(struct intel_engine_cs *ring)
 {
        struct drm_i915_gem_request *req;
-       int ret;
-
-       /* We need to add any requests required to flush the objects and ring */
-       if (ring->outstanding_lazy_request) {
-               ret = i915_add_request(ring);
-               if (ret)
-                       return ret;
-       }
 
        /* Wait upon the last request to be completed */
        if (list_empty(&ring->request_list))
                return 0;
 
        req = list_entry(ring->request_list.prev,
-                          struct drm_i915_gem_request,
-                          list);
+                       struct drm_i915_gem_request,
+                       list);
 
-       return i915_wait_request(req);
+       /* Make sure we do not trigger any retires */
+       return __i915_wait_request(req,
+                                  atomic_read(&to_i915(ring->dev)->gpu_error.reset_counter),
+                                  to_i915(ring->dev)->mm.interruptible,
+                                  NULL, NULL);
 }
 
-static int
-intel_ring_alloc_request(struct intel_engine_cs *ring)
+int intel_ring_alloc_request_extras(struct drm_i915_gem_request *request)
 {
-       int ret;
-       struct drm_i915_gem_request *request;
-       struct drm_i915_private *dev_private = ring->dev->dev_private;
+       request->ringbuf = request->ring->buffer;
+       return 0;
+}
 
-       if (ring->outstanding_lazy_request)
-               return 0;
+int intel_ring_reserve_space(struct drm_i915_gem_request *request)
+{
+       /*
+        * The first call merely notes the reserve request and is common for
+        * all back ends. The subsequent localised _begin() call actually
+        * ensures that the reservation is available. Without the begin, if
+        * the request creator immediately submitted the request without
+        * adding any commands to it then there might not actually be
+        * sufficient room for the submission commands.
+        */
+       intel_ring_reserved_space_reserve(request->ringbuf, MIN_SPACE_FOR_ADD_REQUEST);
 
-       request = kzalloc(sizeof(*request), GFP_KERNEL);
-       if (request == NULL)
-               return -ENOMEM;
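+       /*
+        * A zero-dword begin emits nothing, but it still runs the
+        * wait/wrap logic with reserved_size included, so the
+        * reservation is guaranteed to fit afterwards.
+        */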
+       return intel_ring_begin(request, 0);
+}
 
-       kref_init(&request->ref);
-       request->ring = ring;
-       request->ringbuf = ring->buffer;
-       request->uniq = dev_private->request_uniq++;
+void intel_ring_reserved_space_reserve(struct intel_ringbuffer *ringbuf, int size)
+{
+       WARN_ON(ringbuf->reserved_size);
+       WARN_ON(ringbuf->reserved_in_use);
 
-       ret = i915_gem_get_seqno(ring->dev, &request->seqno);
-       if (ret) {
-               kfree(request);
-               return ret;
+       ringbuf->reserved_size = size;
+}
+
+void intel_ring_reserved_space_cancel(struct intel_ringbuffer *ringbuf)
+{
+       WARN_ON(ringbuf->reserved_in_use);
+
+       ringbuf->reserved_size   = 0;
+       ringbuf->reserved_in_use = false;
+}
+
+void intel_ring_reserved_space_use(struct intel_ringbuffer *ringbuf)
+{
+       WARN_ON(ringbuf->reserved_in_use);
+
+       ringbuf->reserved_in_use = true;
+       ringbuf->reserved_tail   = ringbuf->tail;
+}
+
+void intel_ring_reserved_space_end(struct intel_ringbuffer *ringbuf)
+{
+       WARN_ON(!ringbuf->reserved_in_use);
+       if (ringbuf->tail > ringbuf->reserved_tail) {
+               WARN(ringbuf->tail > ringbuf->reserved_tail + ringbuf->reserved_size,
+                    "request reserved size too small: %d vs %d!\n",
+                    ringbuf->tail - ringbuf->reserved_tail, ringbuf->reserved_size);
+       } else {
+               /*
+                * The ring was wrapped while the reserved space was in use.
+                * That means that some unknown amount of the ring tail was
+                * no-op filled and skipped. Thus simply adding the ring size
+                * to the tail and doing the above space check will not work.
+                * Rather than attempt to track how much tail was skipped,
+                * it is much simpler to say that also skipping the sanity
+                * check every once in a while is not a big issue.
+                */
        }
 
-       ring->outstanding_lazy_request = request;
-       return 0;
+       ringbuf->reserved_size   = 0;
+       ringbuf->reserved_in_use = false;
 }
 
-static int __intel_ring_prepare(struct intel_engine_cs *ring,
-                               int bytes)
+static int __intel_ring_prepare(struct intel_engine_cs *ring, int bytes)
 {
        struct intel_ringbuffer *ringbuf = ring->buffer;
-       int ret;
+       int remain_usable = ringbuf->effective_size - ringbuf->tail;
+       int remain_actual = ringbuf->size - ringbuf->tail;
+       int ret, total_bytes, wait_bytes = 0;
+       bool need_wrap = false;
 
-       if (unlikely(ringbuf->tail + bytes > ringbuf->effective_size)) {
-               ret = intel_wrap_ring_buffer(ring);
-               if (unlikely(ret))
-                       return ret;
+       if (ringbuf->reserved_in_use)
+               total_bytes = bytes;
+       else
+               total_bytes = bytes + ringbuf->reserved_size;
+
+       if (unlikely(bytes > remain_usable)) {
+               /*
+                * Not enough space for the basic request. So need to flush
+                * out the remainder and then wait for base + reserved.
+                */
+               wait_bytes = remain_actual + total_bytes;
+               need_wrap = true;
+       } else {
+               if (unlikely(total_bytes > remain_usable)) {
+                       /*
+                        * The base request will fit but the reserved space
+                        * falls off the end. So only need to wait for the
+                        * reserved size after flushing out the remainder.
+                        */
+                       wait_bytes = remain_actual + ringbuf->reserved_size;
+                       need_wrap = true;
+               } else if (total_bytes > ringbuf->space) {
+                       /* No wrapping required, just waiting. */
+                       wait_bytes = total_bytes;
+               }
        }
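+       /*
+        * Worked example (hypothetical numbers): with size = 4096,
+        * effective_size = 3968, tail = 3950 and bytes = 64, we get
+        * remain_usable = 18 and remain_actual = 146. The request does
+        * not fit before the wrap point, so wait_bytes becomes
+        * 146 + total_bytes and need_wrap is set.
+        */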
 
-       if (unlikely(ringbuf->space < bytes)) {
-               ret = ring_wait_for_space(ring, bytes);
+       if (wait_bytes) {
+               ret = ring_wait_for_space(ring, wait_bytes);
                if (unlikely(ret))
                        return ret;
+
+               if (need_wrap)
+                       __wrap_ring_buffer(ringbuf);
        }
 
        return 0;
 }
 
-int intel_ring_begin(struct intel_engine_cs *ring,
+int intel_ring_begin(struct drm_i915_gem_request *req,
                     int num_dwords)
 {
-       struct drm_i915_private *dev_priv = ring->dev->dev_private;
+       struct intel_engine_cs *ring;
+       struct drm_i915_private *dev_priv;
        int ret;
 
+       WARN_ON(req == NULL);
+       ring = req->ring;
+       dev_priv = ring->dev->dev_private;
+
        ret = i915_gem_check_wedge(&dev_priv->gpu_error,
                                   dev_priv->mm.interruptible);
        if (ret)
@@ -2246,18 +2384,14 @@ int intel_ring_begin(struct intel_engine_cs *ring,
        if (ret)
                return ret;
 
-       /* Preallocate the olr before touching the ring */
-       ret = intel_ring_alloc_request(ring);
-       if (ret)
-               return ret;
-
        ring->buffer->space -= num_dwords * sizeof(uint32_t);
        return 0;
 }
 
 /* Align the ring tail to a cacheline boundary */
-int intel_ring_cacheline_align(struct intel_engine_cs *ring)
+int intel_ring_cacheline_align(struct drm_i915_gem_request *req)
 {
+       struct intel_engine_cs *ring = req->ring;
        int num_dwords = (ring->buffer->tail & (CACHELINE_BYTES - 1)) / sizeof(uint32_t);
        int ret;
 
@@ -2265,7 +2399,7 @@ int intel_ring_cacheline_align(struct intel_engine_cs *ring)
                return 0;
 
        num_dwords = CACHELINE_BYTES / sizeof(uint32_t) - num_dwords;
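        /* CACHELINE_BYTES / 4 = 16 dwords per cacheline; the gap is padded with MI_NOOPs */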
-       ret = intel_ring_begin(ring, num_dwords);
+       ret = intel_ring_begin(req, num_dwords);
        if (ret)
                return ret;
 
@@ -2282,8 +2416,6 @@ void intel_ring_init_seqno(struct intel_engine_cs *ring, u32 seqno)
        struct drm_device *dev = ring->dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
 
-       BUG_ON(ring->outstanding_lazy_request);
-
        if (INTEL_INFO(dev)->gen == 6 || INTEL_INFO(dev)->gen == 7) {
                I915_WRITE(RING_SYNC_0(ring->mmio_base), 0);
                I915_WRITE(RING_SYNC_1(ring->mmio_base), 0);
@@ -2328,13 +2460,14 @@ static void gen6_bsd_ring_write_tail(struct intel_engine_cs *ring,
                   _MASKED_BIT_DISABLE(GEN6_BSD_SLEEP_MSG_DISABLE));
 }
 
-static int gen6_bsd_ring_flush(struct intel_engine_cs *ring,
+static int gen6_bsd_ring_flush(struct drm_i915_gem_request *req,
                               u32 invalidate, u32 flush)
 {
+       struct intel_engine_cs *ring = req->ring;
        uint32_t cmd;
        int ret;
 
-       ret = intel_ring_begin(ring, 4);
+       ret = intel_ring_begin(req, 4);
        if (ret)
                return ret;
 
@@ -2372,20 +2505,23 @@ static int gen6_bsd_ring_flush(struct intel_engine_cs *ring,
 }
 
 static int
-gen8_ring_dispatch_execbuffer(struct intel_engine_cs *ring,
+gen8_ring_dispatch_execbuffer(struct drm_i915_gem_request *req,
                              u64 offset, u32 len,
                              unsigned dispatch_flags)
 {
+       struct intel_engine_cs *ring = req->ring;
        bool ppgtt = USES_PPGTT(ring->dev) &&
                        !(dispatch_flags & I915_DISPATCH_SECURE);
        int ret;
 
-       ret = intel_ring_begin(ring, 4);
+       ret = intel_ring_begin(req, 4);
        if (ret)
                return ret;
 
        /* FIXME(BDW): Address space and security selectors. */
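        /* on gen8, bit 8 of MI_BATCH_BUFFER_START selects PPGTT (1) vs GGTT (0) addressing */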
-       intel_ring_emit(ring, MI_BATCH_BUFFER_START_GEN8 | (ppgtt<<8));
+       intel_ring_emit(ring, MI_BATCH_BUFFER_START_GEN8 | (ppgtt<<8) |
+                       (dispatch_flags & I915_DISPATCH_RS ?
+                        MI_BATCH_RESOURCE_STREAMER : 0));
        intel_ring_emit(ring, lower_32_bits(offset));
        intel_ring_emit(ring, upper_32_bits(offset));
        intel_ring_emit(ring, MI_NOOP);
@@ -2395,20 +2531,23 @@ gen8_ring_dispatch_execbuffer(struct intel_engine_cs *ring,
 }
 
 static int
-hsw_ring_dispatch_execbuffer(struct intel_engine_cs *ring,
+hsw_ring_dispatch_execbuffer(struct drm_i915_gem_request *req,
                             u64 offset, u32 len,
                             unsigned dispatch_flags)
 {
+       struct intel_engine_cs *ring = req->ring;
        int ret;
 
-       ret = intel_ring_begin(ring, 2);
+       ret = intel_ring_begin(req, 2);
        if (ret)
                return ret;
 
        intel_ring_emit(ring,
                        MI_BATCH_BUFFER_START |
                        (dispatch_flags & I915_DISPATCH_SECURE ?
-                        0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW));
+                        0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW) |
+                       (dispatch_flags & I915_DISPATCH_RS ?
+                        MI_BATCH_RESOURCE_STREAMER : 0));
        /* bit0-7 is the length on GEN6+ */
        intel_ring_emit(ring, offset);
        intel_ring_advance(ring);
@@ -2417,13 +2556,14 @@ hsw_ring_dispatch_execbuffer(struct intel_engine_cs *ring,
 }
 
 static int
-gen6_ring_dispatch_execbuffer(struct intel_engine_cs *ring,
+gen6_ring_dispatch_execbuffer(struct drm_i915_gem_request *req,
                              u64 offset, u32 len,
                              unsigned dispatch_flags)
 {
+       struct intel_engine_cs *ring = req->ring;
        int ret;
 
-       ret = intel_ring_begin(ring, 2);
+       ret = intel_ring_begin(req, 2);
        if (ret)
                return ret;
 
@@ -2440,14 +2580,15 @@ gen6_ring_dispatch_execbuffer(struct intel_engine_cs *ring,
 
 /* Blitter support (SandyBridge+) */
 
-static int gen6_ring_flush(struct intel_engine_cs *ring,
+static int gen6_ring_flush(struct drm_i915_gem_request *req,
                           u32 invalidate, u32 flush)
 {
+       struct intel_engine_cs *ring = req->ring;
        struct drm_device *dev = ring->dev;
        uint32_t cmd;
        int ret;
 
-       ret = intel_ring_begin(ring, 4);
+       ret = intel_ring_begin(req, 4);
        if (ret)
                return ret;
 
@@ -2528,6 +2669,7 @@ int intel_init_render_ring_buffer(struct drm_device *dev)
                        GEN8_RING_SEMAPHORE_INIT;
                }
        } else if (INTEL_INFO(dev)->gen >= 6) {
+               ring->init_context = intel_rcs_ctx_init;
                ring->add_request = gen6_add_request;
                ring->flush = gen7_render_ring_flush;
                if (INTEL_INFO(dev)->gen == 6)
@@ -2848,26 +2990,28 @@ int intel_init_vebox_ring_buffer(struct drm_device *dev)
 }
 
 int
-intel_ring_flush_all_caches(struct intel_engine_cs *ring)
+intel_ring_flush_all_caches(struct drm_i915_gem_request *req)
 {
+       struct intel_engine_cs *ring = req->ring;
        int ret;
 
        if (!ring->gpu_caches_dirty)
                return 0;
 
-       ret = ring->flush(ring, 0, I915_GEM_GPU_DOMAINS);
+       ret = ring->flush(req, 0, I915_GEM_GPU_DOMAINS);
        if (ret)
                return ret;
 
-       trace_i915_gem_ring_flush(ring, 0, I915_GEM_GPU_DOMAINS);
+       trace_i915_gem_ring_flush(req, 0, I915_GEM_GPU_DOMAINS);
 
        ring->gpu_caches_dirty = false;
        return 0;
 }
 
 int
-intel_ring_invalidate_all_caches(struct intel_engine_cs *ring)
+intel_ring_invalidate_all_caches(struct drm_i915_gem_request *req)
 {
+       struct intel_engine_cs *ring = req->ring;
        uint32_t flush_domains;
        int ret;
 
@@ -2875,11 +3019,11 @@ intel_ring_invalidate_all_caches(struct intel_engine_cs *ring)
        if (ring->gpu_caches_dirty)
                flush_domains = I915_GEM_GPU_DOMAINS;
 
-       ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, flush_domains);
+       ret = ring->flush(req, I915_GEM_GPU_DOMAINS, flush_domains);
        if (ret)
                return ret;
 
-       trace_i915_gem_ring_flush(ring, I915_GEM_GPU_DOMAINS, flush_domains);
+       trace_i915_gem_ring_flush(req, I915_GEM_GPU_DOMAINS, flush_domains);
 
        ring->gpu_caches_dirty = false;
        return 0;