X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=kernel%2Fdrivers%2Fgpu%2Fdrm%2Fi915%2Fintel_lrc.c;fp=kernel%2Fdrivers%2Fgpu%2Fdrm%2Fi915%2Fintel_lrc.c;h=d69547a65dbb95599c400e864b95c779863ef995;hb=e09b41010ba33a20a87472ee821fa407a5b8da36;hp=9ab7c1c758aeb06391984d1c5d8ff062ae89711a;hpb=f93b97fd65072de626c074dbe099a1fff05ce060;p=kvmfornfv.git diff --git a/kernel/drivers/gpu/drm/i915/intel_lrc.c b/kernel/drivers/gpu/drm/i915/intel_lrc.c index 9ab7c1c75..d69547a65 100644 --- a/kernel/drivers/gpu/drm/i915/intel_lrc.c +++ b/kernel/drivers/gpu/drm/i915/intel_lrc.c @@ -135,6 +135,7 @@ #include #include #include "i915_drv.h" +#include "intel_mocs.h" #define GEN9_LR_CONTEXT_RENDER_SIZE (22 * PAGE_SIZE) #define GEN8_LR_CONTEXT_RENDER_SIZE (20 * PAGE_SIZE) @@ -188,13 +189,28 @@ #define GEN8_CTX_FORCE_RESTORE (1<<2) #define GEN8_CTX_L3LLC_COHERENT (1<<5) #define GEN8_CTX_PRIVILEGE (1<<8) + +#define ASSIGN_CTX_PDP(ppgtt, reg_state, n) { \ + const u64 _addr = i915_page_dir_dma_addr((ppgtt), (n)); \ + reg_state[CTX_PDP ## n ## _UDW+1] = upper_32_bits(_addr); \ + reg_state[CTX_PDP ## n ## _LDW+1] = lower_32_bits(_addr); \ +} + +#define ASSIGN_CTX_PML4(ppgtt, reg_state) { \ + reg_state[CTX_PDP0_UDW + 1] = upper_32_bits(px_dma(&ppgtt->pml4)); \ + reg_state[CTX_PDP0_LDW + 1] = lower_32_bits(px_dma(&ppgtt->pml4)); \ +} + enum { ADVANCED_CONTEXT = 0, - LEGACY_CONTEXT, + LEGACY_32B_CONTEXT, ADVANCED_AD_CONTEXT, LEGACY_64B_CONTEXT }; -#define GEN8_CTX_MODE_SHIFT 3 +#define GEN8_CTX_ADDRESSING_MODE_SHIFT 3 +#define GEN8_CTX_ADDRESSING_MODE(dev) (USES_FULL_48BIT_PPGTT(dev) ?\ + LEGACY_64B_CONTEXT :\ + LEGACY_32B_CONTEXT) enum { FAULT_AND_HANG = 0, FAULT_AND_HALT, /* Debug only */ @@ -202,9 +218,12 @@ enum { FAULT_AND_CONTINUE /* Unsupported */ }; #define GEN8_CTX_ID_SHIFT 32 +#define CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x17 + +static int intel_lr_context_pin(struct drm_i915_gem_request *rq); +static void lrc_setup_hardware_status_page(struct intel_engine_cs *ring, + struct drm_i915_gem_object *default_ctx_obj); -static int intel_lr_context_pin(struct intel_engine_cs *ring, - struct intel_context *ctx); /** * intel_sanitize_enable_execlists() - sanitize i915.enable_execlists @@ -220,6 +239,12 @@ int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists { WARN_ON(i915.enable_ppgtt == -1); + /* On platforms with execlist available, vGPU will only + * support execlist mode, no ring buffer mode. + */ + if (HAS_LOGICAL_RING_CONTEXTS(dev) && intel_vgpu_active(dev)) + return 1; + if (INTEL_INFO(dev)->gen >= 9) return 1; @@ -247,25 +272,37 @@ int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists */ u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj) { - u32 lrca = i915_gem_obj_ggtt_offset(ctx_obj); + u32 lrca = i915_gem_obj_ggtt_offset(ctx_obj) + + LRC_PPHWSP_PN * PAGE_SIZE; /* LRCA is required to be 4K aligned so the more significant 20 bits * are globally unique */ return lrca >> 12; } -static uint64_t execlists_ctx_descriptor(struct intel_engine_cs *ring, - struct drm_i915_gem_object *ctx_obj) +static bool disable_lite_restore_wa(struct intel_engine_cs *ring) { struct drm_device *dev = ring->dev; + + return ((IS_SKYLAKE(dev) && INTEL_REVID(dev) <= SKL_REVID_B0) || + (IS_BROXTON(dev) && INTEL_REVID(dev) == BXT_REVID_A0)) && + (ring->id == VCS || ring->id == VCS2); +} + +uint64_t intel_lr_context_descriptor(struct intel_context *ctx, + struct intel_engine_cs *ring) +{ + struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state; uint64_t desc; - uint64_t lrca = i915_gem_obj_ggtt_offset(ctx_obj); + uint64_t lrca = i915_gem_obj_ggtt_offset(ctx_obj) + + LRC_PPHWSP_PN * PAGE_SIZE; WARN_ON(lrca & 0xFFFFFFFF00000FFFULL); desc = GEN8_CTX_VALID; - desc |= LEGACY_CONTEXT << GEN8_CTX_MODE_SHIFT; - desc |= GEN8_CTX_L3LLC_COHERENT; + desc |= GEN8_CTX_ADDRESSING_MODE(dev) << GEN8_CTX_ADDRESSING_MODE_SHIFT; + if (IS_GEN8(ctx_obj->base.dev)) + desc |= GEN8_CTX_L3LLC_COHERENT; desc |= GEN8_CTX_PRIVILEGE; desc |= lrca; desc |= (u64)intel_execlists_ctx_id(ctx_obj) << GEN8_CTX_ID_SHIFT; @@ -275,93 +312,93 @@ static uint64_t execlists_ctx_descriptor(struct intel_engine_cs *ring, /* desc |= GEN8_CTX_FORCE_RESTORE; */ /* WaEnableForceRestoreInCtxtDescForVCS:skl */ - if (IS_GEN9(dev) && - INTEL_REVID(dev) <= SKL_REVID_B0 && - (ring->id == BCS || ring->id == VCS || - ring->id == VECS || ring->id == VCS2)) + /* WaEnableForceRestoreInCtxtDescForVCS:bxt */ + if (disable_lite_restore_wa(ring)) desc |= GEN8_CTX_FORCE_RESTORE; return desc; } -static void execlists_elsp_write(struct intel_engine_cs *ring, - struct drm_i915_gem_object *ctx_obj0, - struct drm_i915_gem_object *ctx_obj1) +static void execlists_elsp_write(struct drm_i915_gem_request *rq0, + struct drm_i915_gem_request *rq1) { + + struct intel_engine_cs *ring = rq0->ring; struct drm_device *dev = ring->dev; struct drm_i915_private *dev_priv = dev->dev_private; - uint64_t temp = 0; - uint32_t desc[4]; + uint64_t desc[2]; - /* XXX: You must always write both descriptors in the order below. */ - if (ctx_obj1) - temp = execlists_ctx_descriptor(ring, ctx_obj1); - else - temp = 0; - desc[1] = (u32)(temp >> 32); - desc[0] = (u32)temp; + if (rq1) { + desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->ring); + rq1->elsp_submitted++; + } else { + desc[1] = 0; + } - temp = execlists_ctx_descriptor(ring, ctx_obj0); - desc[3] = (u32)(temp >> 32); - desc[2] = (u32)temp; + desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->ring); + rq0->elsp_submitted++; - intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL); - I915_WRITE(RING_ELSP(ring), desc[1]); - I915_WRITE(RING_ELSP(ring), desc[0]); - I915_WRITE(RING_ELSP(ring), desc[3]); + /* You must always write both descriptors in the order below. */ + spin_lock(&dev_priv->uncore.lock); + intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL); + I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[1])); + I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[1])); + I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[0])); /* The context is automatically loaded after the following */ - I915_WRITE(RING_ELSP(ring), desc[2]); + I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[0])); - /* ELSP is a wo register, so use another nearby reg for posting instead */ - POSTING_READ(RING_EXECLIST_STATUS(ring)); - intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); + /* ELSP is a wo register, use another nearby reg for posting */ + POSTING_READ_FW(RING_EXECLIST_STATUS_LO(ring)); + intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL); + spin_unlock(&dev_priv->uncore.lock); } -static int execlists_update_context(struct drm_i915_gem_object *ctx_obj, - struct drm_i915_gem_object *ring_obj, - u32 tail) +static int execlists_update_context(struct drm_i915_gem_request *rq) { + struct intel_engine_cs *ring = rq->ring; + struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt; + struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state; + struct drm_i915_gem_object *rb_obj = rq->ringbuf->obj; struct page *page; uint32_t *reg_state; - page = i915_gem_object_get_page(ctx_obj, 1); + BUG_ON(!ctx_obj); + WARN_ON(!i915_gem_obj_is_pinned(ctx_obj)); + WARN_ON(!i915_gem_obj_is_pinned(rb_obj)); + + page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN); reg_state = kmap_atomic(page); - reg_state[CTX_RING_TAIL+1] = tail; - reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(ring_obj); + reg_state[CTX_RING_TAIL+1] = rq->tail; + reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(rb_obj); + + if (ppgtt && !USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) { + /* True 32b PPGTT with dynamic page allocation: update PDP + * registers and point the unallocated PDPs to scratch page. + * PML4 is allocated during ppgtt init, so this is not needed + * in 48-bit mode. + */ + ASSIGN_CTX_PDP(ppgtt, reg_state, 3); + ASSIGN_CTX_PDP(ppgtt, reg_state, 2); + ASSIGN_CTX_PDP(ppgtt, reg_state, 1); + ASSIGN_CTX_PDP(ppgtt, reg_state, 0); + } kunmap_atomic(reg_state); return 0; } -static void execlists_submit_contexts(struct intel_engine_cs *ring, - struct intel_context *to0, u32 tail0, - struct intel_context *to1, u32 tail1) +static void execlists_submit_requests(struct drm_i915_gem_request *rq0, + struct drm_i915_gem_request *rq1) { - struct drm_i915_gem_object *ctx_obj0 = to0->engine[ring->id].state; - struct intel_ringbuffer *ringbuf0 = to0->engine[ring->id].ringbuf; - struct drm_i915_gem_object *ctx_obj1 = NULL; - struct intel_ringbuffer *ringbuf1 = NULL; - - BUG_ON(!ctx_obj0); - WARN_ON(!i915_gem_obj_is_pinned(ctx_obj0)); - WARN_ON(!i915_gem_obj_is_pinned(ringbuf0->obj)); + execlists_update_context(rq0); - execlists_update_context(ctx_obj0, ringbuf0->obj, tail0); + if (rq1) + execlists_update_context(rq1); - if (to1) { - ringbuf1 = to1->engine[ring->id].ringbuf; - ctx_obj1 = to1->engine[ring->id].state; - BUG_ON(!ctx_obj1); - WARN_ON(!i915_gem_obj_is_pinned(ctx_obj1)); - WARN_ON(!i915_gem_obj_is_pinned(ringbuf1->obj)); - - execlists_update_context(ctx_obj1, ringbuf1->obj, tail1); - } - - execlists_elsp_write(ring, ctx_obj0, ctx_obj1); + execlists_elsp_write(rq0, rq1); } static void execlists_context_unqueue(struct intel_engine_cs *ring) @@ -371,6 +408,12 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring) assert_spin_locked(&ring->execlist_lock); + /* + * If irqs are not active generate a warning as batches that finish + * without the irqs may get lost and a GPU Hang may occur. + */ + WARN_ON(!intel_irqs_enabled(ring->dev->dev_private)); + if (list_empty(&ring->execlist_queue)) return; @@ -398,7 +441,7 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring) * WaIdleLiteRestore: make sure we never cause a lite * restore with HEAD==TAIL */ - if (req0 && req0->elsp_submitted) { + if (req0->elsp_submitted) { /* * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL * as we resubmit the request. See gen8_emit_request() @@ -415,13 +458,7 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring) WARN_ON(req1 && req1->elsp_submitted); - execlists_submit_contexts(ring, req0->ctx, req0->tail, - req1 ? req1->ctx : NULL, - req1 ? req1->tail : 0); - - req0->elsp_submitted++; - if (req1) - req1->elsp_submitted++; + execlists_submit_requests(req0, req1); } static bool execlists_check_remove_request(struct intel_engine_cs *ring, @@ -467,25 +504,26 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring) u32 status_pointer; u8 read_pointer; u8 write_pointer; - u32 status; + u32 status = 0; u32 status_id; u32 submit_contexts = 0; status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring)); read_pointer = ring->next_context_status_buffer; - write_pointer = status_pointer & 0x07; + write_pointer = status_pointer & GEN8_CSB_PTR_MASK; if (read_pointer > write_pointer) - write_pointer += 6; + write_pointer += GEN8_CSB_ENTRIES; spin_lock(&ring->execlist_lock); while (read_pointer < write_pointer) { read_pointer++; - status = I915_READ(RING_CONTEXT_STATUS_BUF(ring) + - (read_pointer % 6) * 8); - status_id = I915_READ(RING_CONTEXT_STATUS_BUF(ring) + - (read_pointer % 6) * 8 + 4); + status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer % GEN8_CSB_ENTRIES)); + status_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer % GEN8_CSB_ENTRIES)); + + if (status & GEN8_CTX_STATUS_IDLE_ACTIVE) + continue; if (status & GEN8_CTX_STATUS_PREEMPTED) { if (status & GEN8_CTX_STATUS_LITE_RESTORE) { @@ -502,53 +540,38 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring) } } - if (submit_contexts != 0) + if (disable_lite_restore_wa(ring)) { + /* Prevent a ctx to preempt itself */ + if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) && + (submit_contexts != 0)) + execlists_context_unqueue(ring); + } else if (submit_contexts != 0) { execlists_context_unqueue(ring); + } spin_unlock(&ring->execlist_lock); WARN(submit_contexts > 2, "More than two context complete events?\n"); - ring->next_context_status_buffer = write_pointer % 6; + ring->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES; I915_WRITE(RING_CONTEXT_STATUS_PTR(ring), - ((u32)ring->next_context_status_buffer & 0x07) << 8); + _MASKED_FIELD(GEN8_CSB_PTR_MASK << 8, + ((u32)ring->next_context_status_buffer & + GEN8_CSB_PTR_MASK) << 8)); } -static int execlists_context_queue(struct intel_engine_cs *ring, - struct intel_context *to, - u32 tail, - struct drm_i915_gem_request *request) +static int execlists_context_queue(struct drm_i915_gem_request *request) { + struct intel_engine_cs *ring = request->ring; struct drm_i915_gem_request *cursor; - struct drm_i915_private *dev_priv = ring->dev->dev_private; - unsigned long flags; int num_elements = 0; - if (to != ring->default_context) - intel_lr_context_pin(ring, to); + if (request->ctx != ring->default_context) + intel_lr_context_pin(request); - if (!request) { - /* - * If there isn't a request associated with this submission, - * create one as a temporary holder. - */ - request = kzalloc(sizeof(*request), GFP_KERNEL); - if (request == NULL) - return -ENOMEM; - request->ring = ring; - request->ctx = to; - kref_init(&request->ref); - request->uniq = dev_priv->request_uniq++; - i915_gem_context_reference(request->ctx); - } else { - i915_gem_request_reference(request); - WARN_ON(to != request->ctx); - } - request->tail = tail; - - intel_runtime_pm_get(dev_priv); + i915_gem_request_reference(request); - spin_lock_irqsave(&ring->execlist_lock, flags); + spin_lock_irq(&ring->execlist_lock); list_for_each_entry(cursor, &ring->execlist_queue, execlist_link) if (++num_elements > 2) @@ -561,7 +584,7 @@ static int execlists_context_queue(struct intel_engine_cs *ring, struct drm_i915_gem_request, execlist_link); - if (to == tail_req->ctx) { + if (request->ctx == tail_req->ctx) { WARN(tail_req->elsp_submitted != 0, "More than 2 already-submitted reqs queued\n"); list_del(&tail_req->execlist_link); @@ -574,15 +597,14 @@ static int execlists_context_queue(struct intel_engine_cs *ring, if (num_elements == 0) execlists_context_unqueue(ring); - spin_unlock_irqrestore(&ring->execlist_lock, flags); + spin_unlock_irq(&ring->execlist_lock); return 0; } -static int logical_ring_invalidate_all_caches(struct intel_ringbuffer *ringbuf, - struct intel_context *ctx) +static int logical_ring_invalidate_all_caches(struct drm_i915_gem_request *req) { - struct intel_engine_cs *ring = ringbuf->ring; + struct intel_engine_cs *ring = req->ring; uint32_t flush_domains; int ret; @@ -590,8 +612,7 @@ static int logical_ring_invalidate_all_caches(struct intel_ringbuffer *ringbuf, if (ring->gpu_caches_dirty) flush_domains = I915_GEM_GPU_DOMAINS; - ret = ring->emit_flush(ringbuf, ctx, - I915_GEM_GPU_DOMAINS, flush_domains); + ret = ring->emit_flush(req, I915_GEM_GPU_DOMAINS, flush_domains); if (ret) return ret; @@ -599,11 +620,10 @@ static int logical_ring_invalidate_all_caches(struct intel_ringbuffer *ringbuf, return 0; } -static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf, - struct intel_context *ctx, +static int execlists_move_to_gpu(struct drm_i915_gem_request *req, struct list_head *vmas) { - struct intel_engine_cs *ring = ringbuf->ring; + const unsigned other_rings = ~intel_ring_flag(req->ring); struct i915_vma *vma; uint32_t flush_domains = 0; bool flush_chipset = false; @@ -612,9 +632,11 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf, list_for_each_entry(vma, vmas, exec_list) { struct drm_i915_gem_object *obj = vma->obj; - ret = i915_gem_object_sync(obj, ring); - if (ret) - return ret; + if (obj->active & other_rings) { + ret = i915_gem_object_sync(obj, req->ring, &req); + if (ret) + return ret; + } if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) flush_chipset |= i915_gem_clflush_object(obj, false); @@ -628,7 +650,202 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf, /* Unconditionally invalidate gpu caches and ensure that we do flush * any residual writes from the previous batch. */ - return logical_ring_invalidate_all_caches(ringbuf, ctx); + return logical_ring_invalidate_all_caches(req); +} + +int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request) +{ + int ret; + + request->ringbuf = request->ctx->engine[request->ring->id].ringbuf; + + if (request->ctx != request->ring->default_context) { + ret = intel_lr_context_pin(request); + if (ret) + return ret; + } + + return 0; +} + +static int logical_ring_wait_for_space(struct drm_i915_gem_request *req, + int bytes) +{ + struct intel_ringbuffer *ringbuf = req->ringbuf; + struct intel_engine_cs *ring = req->ring; + struct drm_i915_gem_request *target; + unsigned space; + int ret; + + if (intel_ring_space(ringbuf) >= bytes) + return 0; + + /* The whole point of reserving space is to not wait! */ + WARN_ON(ringbuf->reserved_in_use); + + list_for_each_entry(target, &ring->request_list, list) { + /* + * The request queue is per-engine, so can contain requests + * from multiple ringbuffers. Here, we must ignore any that + * aren't from the ringbuffer we're considering. + */ + if (target->ringbuf != ringbuf) + continue; + + /* Would completion of this request free enough space? */ + space = __intel_ring_space(target->postfix, ringbuf->tail, + ringbuf->size); + if (space >= bytes) + break; + } + + if (WARN_ON(&target->list == &ring->request_list)) + return -ENOSPC; + + ret = i915_wait_request(target); + if (ret) + return ret; + + ringbuf->space = space; + return 0; +} + +/* + * intel_logical_ring_advance_and_submit() - advance the tail and submit the workload + * @request: Request to advance the logical ringbuffer of. + * + * The tail is updated in our logical ringbuffer struct, not in the actual context. What + * really happens during submission is that the context and current tail will be placed + * on a queue waiting for the ELSP to be ready to accept a new context submission. At that + * point, the tail *inside* the context is updated and the ELSP written to. + */ +static void +intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request) +{ + struct intel_engine_cs *ring = request->ring; + struct drm_i915_private *dev_priv = request->i915; + + intel_logical_ring_advance(request->ringbuf); + + request->tail = request->ringbuf->tail; + + if (intel_ring_stopped(ring)) + return; + + if (dev_priv->guc.execbuf_client) + i915_guc_submit(dev_priv->guc.execbuf_client, request); + else + execlists_context_queue(request); +} + +static void __wrap_ring_buffer(struct intel_ringbuffer *ringbuf) +{ + uint32_t __iomem *virt; + int rem = ringbuf->size - ringbuf->tail; + + virt = ringbuf->virtual_start + ringbuf->tail; + rem /= 4; + while (rem--) + iowrite32(MI_NOOP, virt++); + + ringbuf->tail = 0; + intel_ring_update_space(ringbuf); +} + +static int logical_ring_prepare(struct drm_i915_gem_request *req, int bytes) +{ + struct intel_ringbuffer *ringbuf = req->ringbuf; + int remain_usable = ringbuf->effective_size - ringbuf->tail; + int remain_actual = ringbuf->size - ringbuf->tail; + int ret, total_bytes, wait_bytes = 0; + bool need_wrap = false; + + if (ringbuf->reserved_in_use) + total_bytes = bytes; + else + total_bytes = bytes + ringbuf->reserved_size; + + if (unlikely(bytes > remain_usable)) { + /* + * Not enough space for the basic request. So need to flush + * out the remainder and then wait for base + reserved. + */ + wait_bytes = remain_actual + total_bytes; + need_wrap = true; + } else { + if (unlikely(total_bytes > remain_usable)) { + /* + * The base request will fit but the reserved space + * falls off the end. So only need to to wait for the + * reserved size after flushing out the remainder. + */ + wait_bytes = remain_actual + ringbuf->reserved_size; + need_wrap = true; + } else if (total_bytes > ringbuf->space) { + /* No wrapping required, just waiting. */ + wait_bytes = total_bytes; + } + } + + if (wait_bytes) { + ret = logical_ring_wait_for_space(req, wait_bytes); + if (unlikely(ret)) + return ret; + + if (need_wrap) + __wrap_ring_buffer(ringbuf); + } + + return 0; +} + +/** + * intel_logical_ring_begin() - prepare the logical ringbuffer to accept some commands + * + * @req: The request to start some new work for + * @num_dwords: number of DWORDs that we plan to write to the ringbuffer. + * + * The ringbuffer might not be ready to accept the commands right away (maybe it needs to + * be wrapped, or wait a bit for the tail to be updated). This function takes care of that + * and also preallocates a request (every workload submission is still mediated through + * requests, same as it did with legacy ringbuffer submission). + * + * Return: non-zero if the ringbuffer is not ready to be written to. + */ +int intel_logical_ring_begin(struct drm_i915_gem_request *req, int num_dwords) +{ + struct drm_i915_private *dev_priv; + int ret; + + WARN_ON(req == NULL); + dev_priv = req->ring->dev->dev_private; + + ret = i915_gem_check_wedge(&dev_priv->gpu_error, + dev_priv->mm.interruptible); + if (ret) + return ret; + + ret = logical_ring_prepare(req, num_dwords * sizeof(uint32_t)); + if (ret) + return ret; + + req->ringbuf->space -= num_dwords * sizeof(uint32_t); + return 0; +} + +int intel_logical_ring_reserve_space(struct drm_i915_gem_request *request) +{ + /* + * The first call merely notes the reserve request and is common for + * all back ends. The subsequent localised _begin() call actually + * ensures that the reservation is available. Without the begin, if + * the request creator immediately submitted the request without + * adding any commands to it then there might not actually be + * sufficient room for the submission commands. + */ + intel_ring_reserved_space_reserve(request->ringbuf, MIN_SPACE_FOR_ADD_REQUEST); + + return intel_logical_ring_begin(request, 0); } /** @@ -648,16 +865,15 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf, * * Return: non-zero if the submission fails. */ -int intel_execlists_submission(struct drm_device *dev, struct drm_file *file, - struct intel_engine_cs *ring, - struct intel_context *ctx, +int intel_execlists_submission(struct i915_execbuffer_params *params, struct drm_i915_gem_execbuffer2 *args, - struct list_head *vmas, - struct drm_i915_gem_object *batch_obj, - u64 exec_start, u32 dispatch_flags) + struct list_head *vmas) { + struct drm_device *dev = params->dev; + struct intel_engine_cs *ring = params->ring; struct drm_i915_private *dev_priv = dev->dev_private; - struct intel_ringbuffer *ringbuf = ctx->engine[ring->id].ringbuf; + struct intel_ringbuffer *ringbuf = params->ctx->engine[ring->id].ringbuf; + u64 exec_start; int instp_mode; u32 instp_mask; int ret; @@ -688,33 +904,18 @@ int intel_execlists_submission(struct drm_device *dev, struct drm_file *file, return -EINVAL; } - if (args->num_cliprects != 0) { - DRM_DEBUG("clip rectangles are only valid on pre-gen5\n"); - return -EINVAL; - } else { - if (args->DR4 == 0xffffffff) { - DRM_DEBUG("UXA submitting garbage DR4, fixing up\n"); - args->DR4 = 0; - } - - if (args->DR1 || args->DR4 || args->cliprects_ptr) { - DRM_DEBUG("0 cliprects but dirt in cliprects fields\n"); - return -EINVAL; - } - } - if (args->flags & I915_EXEC_GEN7_SOL_RESET) { DRM_DEBUG("sol reset is gen7 only\n"); return -EINVAL; } - ret = execlists_move_to_gpu(ringbuf, ctx, vmas); + ret = execlists_move_to_gpu(params->request, vmas); if (ret) return ret; if (ring == &dev_priv->ring[RCS] && instp_mode != dev_priv->relative_constants_mode) { - ret = intel_logical_ring_begin(ringbuf, ctx, 4); + ret = intel_logical_ring_begin(params->request, 4); if (ret) return ret; @@ -727,14 +928,17 @@ int intel_execlists_submission(struct drm_device *dev, struct drm_file *file, dev_priv->relative_constants_mode = instp_mode; } - ret = ring->emit_bb_start(ringbuf, ctx, exec_start, dispatch_flags); + exec_start = params->batch_obj_vm_offset + + args->batch_start_offset; + + ret = ring->emit_bb_start(params->request, exec_start, params->dispatch_flags); if (ret) return ret; - trace_i915_gem_ring_dispatch(intel_ring_get_request(ring), dispatch_flags); + trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags); - i915_gem_execbuffer_move_to_active(vmas, ring); - i915_gem_execbuffer_retire_commands(dev, file, ring, batch_obj); + i915_gem_execbuffer_move_to_active(vmas, params->request); + i915_gem_execbuffer_retire_commands(params); return 0; } @@ -742,8 +946,6 @@ int intel_execlists_submission(struct drm_device *dev, struct drm_file *file, void intel_execlists_retire_requests(struct intel_engine_cs *ring) { struct drm_i915_gem_request *req, *tmp; - struct drm_i915_private *dev_priv = ring->dev->dev_private; - unsigned long flags; struct list_head retired_list; WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex)); @@ -751,9 +953,9 @@ void intel_execlists_retire_requests(struct intel_engine_cs *ring) return; INIT_LIST_HEAD(&retired_list); - spin_lock_irqsave(&ring->execlist_lock, flags); + spin_lock_irq(&ring->execlist_lock); list_replace_init(&ring->execlist_retired_req_list, &retired_list); - spin_unlock_irqrestore(&ring->execlist_lock, flags); + spin_unlock_irq(&ring->execlist_lock); list_for_each_entry_safe(req, tmp, &retired_list, execlist_link) { struct intel_context *ctx = req->ctx; @@ -761,8 +963,7 @@ void intel_execlists_retire_requests(struct intel_engine_cs *ring) ctx->engine[ring->id].state; if (ctx_obj && (ctx != ring->default_context)) - intel_lr_context_unpin(ring, ctx); - intel_runtime_pm_put(dev_priv); + intel_lr_context_unpin(req); list_del(&req->execlist_link); i915_gem_request_unreference(req); } @@ -790,16 +991,15 @@ void intel_logical_ring_stop(struct intel_engine_cs *ring) I915_WRITE_MODE(ring, _MASKED_BIT_DISABLE(STOP_RING)); } -int logical_ring_flush_all_caches(struct intel_ringbuffer *ringbuf, - struct intel_context *ctx) +int logical_ring_flush_all_caches(struct drm_i915_gem_request *req) { - struct intel_engine_cs *ring = ringbuf->ring; + struct intel_engine_cs *ring = req->ring; int ret; if (!ring->gpu_caches_dirty) return 0; - ret = ring->emit_flush(ringbuf, ctx, 0, I915_GEM_GPU_DOMAINS); + ret = ring->emit_flush(req, 0, I915_GEM_GPU_DOMAINS); if (ret) return ret; @@ -807,331 +1007,467 @@ int logical_ring_flush_all_caches(struct intel_ringbuffer *ringbuf, return 0; } -/* - * intel_logical_ring_advance_and_submit() - advance the tail and submit the workload - * @ringbuf: Logical Ringbuffer to advance. - * - * The tail is updated in our logical ringbuffer struct, not in the actual context. What - * really happens during submission is that the context and current tail will be placed - * on a queue waiting for the ELSP to be ready to accept a new context submission. At that - * point, the tail *inside* the context is updated and the ELSP written to. - */ -static void -intel_logical_ring_advance_and_submit(struct intel_ringbuffer *ringbuf, - struct intel_context *ctx, - struct drm_i915_gem_request *request) +static int intel_lr_context_do_pin(struct intel_engine_cs *ring, + struct drm_i915_gem_object *ctx_obj, + struct intel_ringbuffer *ringbuf) { - struct intel_engine_cs *ring = ringbuf->ring; + struct drm_device *dev = ring->dev; + struct drm_i915_private *dev_priv = dev->dev_private; + int ret = 0; - intel_logical_ring_advance(ringbuf); + WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex)); + ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN, + PIN_OFFSET_BIAS | GUC_WOPCM_TOP); + if (ret) + return ret; - if (intel_ring_stopped(ring)) - return; + ret = intel_pin_and_map_ringbuffer_obj(ring->dev, ringbuf); + if (ret) + goto unpin_ctx_obj; + + ctx_obj->dirty = true; + + /* Invalidate GuC TLB. */ + if (i915.enable_guc_submission) + I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE); + + return ret; - execlists_context_queue(ring, ctx, ringbuf->tail, request); +unpin_ctx_obj: + i915_gem_object_ggtt_unpin(ctx_obj); + + return ret; } -static int intel_lr_context_pin(struct intel_engine_cs *ring, - struct intel_context *ctx) +static int intel_lr_context_pin(struct drm_i915_gem_request *rq) { - struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state; - struct intel_ringbuffer *ringbuf = ctx->engine[ring->id].ringbuf; int ret = 0; + struct intel_engine_cs *ring = rq->ring; + struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state; + struct intel_ringbuffer *ringbuf = rq->ringbuf; - WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex)); - if (ctx->engine[ring->id].pin_count++ == 0) { - ret = i915_gem_obj_ggtt_pin(ctx_obj, - GEN8_LR_CONTEXT_ALIGN, 0); + if (rq->ctx->engine[ring->id].pin_count++ == 0) { + ret = intel_lr_context_do_pin(ring, ctx_obj, ringbuf); if (ret) goto reset_pin_count; - - ret = intel_pin_and_map_ringbuffer_obj(ring->dev, ringbuf); - if (ret) - goto unpin_ctx_obj; - - ctx_obj->dirty = true; } - return ret; -unpin_ctx_obj: - i915_gem_object_ggtt_unpin(ctx_obj); reset_pin_count: - ctx->engine[ring->id].pin_count = 0; - + rq->ctx->engine[ring->id].pin_count = 0; return ret; } -void intel_lr_context_unpin(struct intel_engine_cs *ring, - struct intel_context *ctx) +void intel_lr_context_unpin(struct drm_i915_gem_request *rq) { - struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state; - struct intel_ringbuffer *ringbuf = ctx->engine[ring->id].ringbuf; + struct intel_engine_cs *ring = rq->ring; + struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state; + struct intel_ringbuffer *ringbuf = rq->ringbuf; if (ctx_obj) { WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex)); - if (--ctx->engine[ring->id].pin_count == 0) { + if (--rq->ctx->engine[ring->id].pin_count == 0) { intel_unpin_ringbuffer_obj(ringbuf); i915_gem_object_ggtt_unpin(ctx_obj); } } } -static int logical_ring_alloc_request(struct intel_engine_cs *ring, - struct intel_context *ctx) +static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req) { - struct drm_i915_gem_request *request; - struct drm_i915_private *dev_private = ring->dev->dev_private; - int ret; + int ret, i; + struct intel_engine_cs *ring = req->ring; + struct intel_ringbuffer *ringbuf = req->ringbuf; + struct drm_device *dev = ring->dev; + struct drm_i915_private *dev_priv = dev->dev_private; + struct i915_workarounds *w = &dev_priv->workarounds; - if (ring->outstanding_lazy_request) + if (WARN_ON_ONCE(w->count == 0)) return 0; - request = kzalloc(sizeof(*request), GFP_KERNEL); - if (request == NULL) - return -ENOMEM; + ring->gpu_caches_dirty = true; + ret = logical_ring_flush_all_caches(req); + if (ret) + return ret; - if (ctx != ring->default_context) { - ret = intel_lr_context_pin(ring, ctx); - if (ret) { - kfree(request); - return ret; - } + ret = intel_logical_ring_begin(req, w->count * 2 + 2); + if (ret) + return ret; + + intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(w->count)); + for (i = 0; i < w->count; i++) { + intel_logical_ring_emit(ringbuf, w->reg[i].addr); + intel_logical_ring_emit(ringbuf, w->reg[i].value); } + intel_logical_ring_emit(ringbuf, MI_NOOP); - kref_init(&request->ref); - request->ring = ring; - request->uniq = dev_private->request_uniq++; + intel_logical_ring_advance(ringbuf); - ret = i915_gem_get_seqno(ring->dev, &request->seqno); - if (ret) { - intel_lr_context_unpin(ring, ctx); - kfree(request); + ring->gpu_caches_dirty = true; + ret = logical_ring_flush_all_caches(req); + if (ret) return ret; - } - request->ctx = ctx; - i915_gem_context_reference(request->ctx); - request->ringbuf = ctx->engine[ring->id].ringbuf; - - ring->outstanding_lazy_request = request; return 0; } -static int logical_ring_wait_request(struct intel_ringbuffer *ringbuf, - int bytes) +#define wa_ctx_emit(batch, index, cmd) \ + do { \ + int __index = (index)++; \ + if (WARN_ON(__index >= (PAGE_SIZE / sizeof(uint32_t)))) { \ + return -ENOSPC; \ + } \ + batch[__index] = (cmd); \ + } while (0) + + +/* + * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after + * PIPE_CONTROL instruction. This is required for the flush to happen correctly + * but there is a slight complication as this is applied in WA batch where the + * values are only initialized once so we cannot take register value at the + * beginning and reuse it further; hence we save its value to memory, upload a + * constant value with bit21 set and then we restore it back with the saved value. + * To simplify the WA, a constant value is formed by using the default value + * of this register. This shouldn't be a problem because we are only modifying + * it for a short period and this batch in non-premptible. We can ofcourse + * use additional instructions that read the actual value of the register + * at that time and set our bit of interest but it makes the WA complicated. + * + * This WA is also required for Gen9 so extracting as a function avoids + * code duplication. + */ +static inline int gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *ring, + uint32_t *const batch, + uint32_t index) { - struct intel_engine_cs *ring = ringbuf->ring; - struct drm_i915_gem_request *request; - int ret; + uint32_t l3sqc4_flush = (0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES); - if (intel_ring_space(ringbuf) >= bytes) - return 0; + /* + * WaDisableLSQCROPERFforOCL:skl + * This WA is implemented in skl_init_clock_gating() but since + * this batch updates GEN8_L3SQCREG4 with default value we need to + * set this bit here to retain the WA during flush. + */ + if (IS_SKYLAKE(ring->dev) && INTEL_REVID(ring->dev) <= SKL_REVID_E0) + l3sqc4_flush |= GEN8_LQSC_RO_PERF_DIS; + + wa_ctx_emit(batch, index, (MI_STORE_REGISTER_MEM_GEN8 | + MI_SRM_LRM_GLOBAL_GTT)); + wa_ctx_emit(batch, index, GEN8_L3SQCREG4); + wa_ctx_emit(batch, index, ring->scratch.gtt_offset + 256); + wa_ctx_emit(batch, index, 0); + + wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1)); + wa_ctx_emit(batch, index, GEN8_L3SQCREG4); + wa_ctx_emit(batch, index, l3sqc4_flush); + + wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6)); + wa_ctx_emit(batch, index, (PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_DC_FLUSH_ENABLE)); + wa_ctx_emit(batch, index, 0); + wa_ctx_emit(batch, index, 0); + wa_ctx_emit(batch, index, 0); + wa_ctx_emit(batch, index, 0); + + wa_ctx_emit(batch, index, (MI_LOAD_REGISTER_MEM_GEN8 | + MI_SRM_LRM_GLOBAL_GTT)); + wa_ctx_emit(batch, index, GEN8_L3SQCREG4); + wa_ctx_emit(batch, index, ring->scratch.gtt_offset + 256); + wa_ctx_emit(batch, index, 0); + + return index; +} - list_for_each_entry(request, &ring->request_list, list) { - /* - * The request queue is per-engine, so can contain requests - * from multiple ringbuffers. Here, we must ignore any that - * aren't from the ringbuffer we're considering. - */ - struct intel_context *ctx = request->ctx; - if (ctx->engine[ring->id].ringbuf != ringbuf) - continue; +static inline uint32_t wa_ctx_start(struct i915_wa_ctx_bb *wa_ctx, + uint32_t offset, + uint32_t start_alignment) +{ + return wa_ctx->offset = ALIGN(offset, start_alignment); +} + +static inline int wa_ctx_end(struct i915_wa_ctx_bb *wa_ctx, + uint32_t offset, + uint32_t size_alignment) +{ + wa_ctx->size = offset - wa_ctx->offset; + + WARN(wa_ctx->size % size_alignment, + "wa_ctx_bb failed sanity checks: size %d is not aligned to %d\n", + wa_ctx->size, size_alignment); + return 0; +} + +/** + * gen8_init_indirectctx_bb() - initialize indirect ctx batch with WA + * + * @ring: only applicable for RCS + * @wa_ctx: structure representing wa_ctx + * offset: specifies start of the batch, should be cache-aligned. This is updated + * with the offset value received as input. + * size: size of the batch in DWORDS but HW expects in terms of cachelines + * @batch: page in which WA are loaded + * @offset: This field specifies the start of the batch, it should be + * cache-aligned otherwise it is adjusted accordingly. + * Typically we only have one indirect_ctx and per_ctx batch buffer which are + * initialized at the beginning and shared across all contexts but this field + * helps us to have multiple batches at different offsets and select them based + * on a criteria. At the moment this batch always start at the beginning of the page + * and at this point we don't have multiple wa_ctx batch buffers. + * + * The number of WA applied are not known at the beginning; we use this field + * to return the no of DWORDS written. + * + * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END + * so it adds NOOPs as padding to make it cacheline aligned. + * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together + * makes a complete batch buffer. + * + * Return: non-zero if we exceed the PAGE_SIZE limit. + */ - /* Would completion of this request free enough space? */ - if (__intel_ring_space(request->tail, ringbuf->tail, - ringbuf->size) >= bytes) { - break; - } +static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring, + struct i915_wa_ctx_bb *wa_ctx, + uint32_t *const batch, + uint32_t *offset) +{ + uint32_t scratch_addr; + uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); + + /* WaDisableCtxRestoreArbitration:bdw,chv */ + wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_DISABLE); + + /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ + if (IS_BROADWELL(ring->dev)) { + int rc = gen8_emit_flush_coherentl3_wa(ring, batch, index); + if (rc < 0) + return rc; + index = rc; } - if (&request->list == &ring->request_list) - return -ENOSPC; + /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ + /* Actual scratch location is at 128 bytes offset */ + scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES; - ret = i915_wait_request(request); - if (ret) - return ret; + wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6)); + wa_ctx_emit(batch, index, (PIPE_CONTROL_FLUSH_L3 | + PIPE_CONTROL_GLOBAL_GTT_IVB | + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_QW_WRITE)); + wa_ctx_emit(batch, index, scratch_addr); + wa_ctx_emit(batch, index, 0); + wa_ctx_emit(batch, index, 0); + wa_ctx_emit(batch, index, 0); + + /* Pad to end of cacheline */ + while (index % CACHELINE_DWORDS) + wa_ctx_emit(batch, index, MI_NOOP); - i915_gem_retire_requests_ring(ring); + /* + * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because + * execution depends on the length specified in terms of cache lines + * in the register CTX_RCS_INDIRECT_CTX + */ - return intel_ring_space(ringbuf) >= bytes ? 0 : -ENOSPC; + return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS); } -static int logical_ring_wait_for_space(struct intel_ringbuffer *ringbuf, - struct intel_context *ctx, - int bytes) +/** + * gen8_init_perctx_bb() - initialize per ctx batch with WA + * + * @ring: only applicable for RCS + * @wa_ctx: structure representing wa_ctx + * offset: specifies start of the batch, should be cache-aligned. + * size: size of the batch in DWORDS but HW expects in terms of cachelines + * @batch: page in which WA are loaded + * @offset: This field specifies the start of this batch. + * This batch is started immediately after indirect_ctx batch. Since we ensure + * that indirect_ctx ends on a cacheline this batch is aligned automatically. + * + * The number of DWORDS written are returned using this field. + * + * This batch is terminated with MI_BATCH_BUFFER_END and so we need not add padding + * to align it with cacheline as padding after MI_BATCH_BUFFER_END is redundant. + */ +static int gen8_init_perctx_bb(struct intel_engine_cs *ring, + struct i915_wa_ctx_bb *wa_ctx, + uint32_t *const batch, + uint32_t *offset) { - struct intel_engine_cs *ring = ringbuf->ring; - struct drm_device *dev = ring->dev; - struct drm_i915_private *dev_priv = dev->dev_private; - unsigned long end; - int ret; - - ret = logical_ring_wait_request(ringbuf, bytes); - if (ret != -ENOSPC) - return ret; + uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); - /* Force the context submission in case we have been skipping it */ - intel_logical_ring_advance_and_submit(ringbuf, ctx, NULL); + /* WaDisableCtxRestoreArbitration:bdw,chv */ + wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE); - /* With GEM the hangcheck timer should kick us out of the loop, - * leaving it early runs the risk of corrupting GEM state (due - * to running on almost untested codepaths). But on resume - * timers don't work yet, so prevent a complete hang in that - * case by choosing an insanely large timeout. */ - end = jiffies + 60 * HZ; + wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END); - ret = 0; - do { - if (intel_ring_space(ringbuf) >= bytes) - break; + return wa_ctx_end(wa_ctx, *offset = index, 1); +} - msleep(1); +static int gen9_init_indirectctx_bb(struct intel_engine_cs *ring, + struct i915_wa_ctx_bb *wa_ctx, + uint32_t *const batch, + uint32_t *offset) +{ + int ret; + struct drm_device *dev = ring->dev; + uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); - if (dev_priv->mm.interruptible && signal_pending(current)) { - ret = -ERESTARTSYS; - break; - } + /* WaDisableCtxRestoreArbitration:skl,bxt */ + if ((IS_SKYLAKE(dev) && (INTEL_REVID(dev) <= SKL_REVID_D0)) || + (IS_BROXTON(dev) && (INTEL_REVID(dev) == BXT_REVID_A0))) + wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_DISABLE); - ret = i915_gem_check_wedge(&dev_priv->gpu_error, - dev_priv->mm.interruptible); - if (ret) - break; + /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt */ + ret = gen8_emit_flush_coherentl3_wa(ring, batch, index); + if (ret < 0) + return ret; + index = ret; - if (time_after(jiffies, end)) { - ret = -EBUSY; - break; - } - } while (1); + /* Pad to end of cacheline */ + while (index % CACHELINE_DWORDS) + wa_ctx_emit(batch, index, MI_NOOP); - return ret; + return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS); } -static int logical_ring_wrap_buffer(struct intel_ringbuffer *ringbuf, - struct intel_context *ctx) +static int gen9_init_perctx_bb(struct intel_engine_cs *ring, + struct i915_wa_ctx_bb *wa_ctx, + uint32_t *const batch, + uint32_t *offset) { - uint32_t __iomem *virt; - int rem = ringbuf->size - ringbuf->tail; - - if (ringbuf->space < rem) { - int ret = logical_ring_wait_for_space(ringbuf, ctx, rem); - - if (ret) - return ret; + struct drm_device *dev = ring->dev; + uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); + + /* WaSetDisablePixMaskCammingAndRhwoInCommonSliceChicken:skl,bxt */ + if ((IS_SKYLAKE(dev) && (INTEL_REVID(dev) <= SKL_REVID_B0)) || + (IS_BROXTON(dev) && (INTEL_REVID(dev) == BXT_REVID_A0))) { + wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1)); + wa_ctx_emit(batch, index, GEN9_SLICE_COMMON_ECO_CHICKEN0); + wa_ctx_emit(batch, index, + _MASKED_BIT_ENABLE(DISABLE_PIXEL_MASK_CAMMING)); + wa_ctx_emit(batch, index, MI_NOOP); } - virt = ringbuf->virtual_start + ringbuf->tail; - rem /= 4; - while (rem--) - iowrite32(MI_NOOP, virt++); + /* WaDisableCtxRestoreArbitration:skl,bxt */ + if ((IS_SKYLAKE(dev) && (INTEL_REVID(dev) <= SKL_REVID_D0)) || + (IS_BROXTON(dev) && (INTEL_REVID(dev) == BXT_REVID_A0))) + wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE); - ringbuf->tail = 0; - intel_ring_update_space(ringbuf); + wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END); - return 0; + return wa_ctx_end(wa_ctx, *offset = index, 1); } -static int logical_ring_prepare(struct intel_ringbuffer *ringbuf, - struct intel_context *ctx, int bytes) +static int lrc_setup_wa_ctx_obj(struct intel_engine_cs *ring, u32 size) { int ret; - if (unlikely(ringbuf->tail + bytes > ringbuf->effective_size)) { - ret = logical_ring_wrap_buffer(ringbuf, ctx); - if (unlikely(ret)) - return ret; + ring->wa_ctx.obj = i915_gem_alloc_object(ring->dev, PAGE_ALIGN(size)); + if (!ring->wa_ctx.obj) { + DRM_DEBUG_DRIVER("alloc LRC WA ctx backing obj failed.\n"); + return -ENOMEM; } - if (unlikely(ringbuf->space < bytes)) { - ret = logical_ring_wait_for_space(ringbuf, ctx, bytes); - if (unlikely(ret)) - return ret; + ret = i915_gem_obj_ggtt_pin(ring->wa_ctx.obj, PAGE_SIZE, 0); + if (ret) { + DRM_DEBUG_DRIVER("pin LRC WA ctx backing obj failed: %d\n", + ret); + drm_gem_object_unreference(&ring->wa_ctx.obj->base); + return ret; } return 0; } -/** - * intel_logical_ring_begin() - prepare the logical ringbuffer to accept some commands - * - * @ringbuf: Logical ringbuffer. - * @num_dwords: number of DWORDs that we plan to write to the ringbuffer. - * - * The ringbuffer might not be ready to accept the commands right away (maybe it needs to - * be wrapped, or wait a bit for the tail to be updated). This function takes care of that - * and also preallocates a request (every workload submission is still mediated through - * requests, same as it did with legacy ringbuffer submission). - * - * Return: non-zero if the ringbuffer is not ready to be written to. - */ -int intel_logical_ring_begin(struct intel_ringbuffer *ringbuf, - struct intel_context *ctx, int num_dwords) +static void lrc_destroy_wa_ctx_obj(struct intel_engine_cs *ring) { - struct intel_engine_cs *ring = ringbuf->ring; - struct drm_device *dev = ring->dev; - struct drm_i915_private *dev_priv = dev->dev_private; - int ret; - - ret = i915_gem_check_wedge(&dev_priv->gpu_error, - dev_priv->mm.interruptible); - if (ret) - return ret; - - ret = logical_ring_prepare(ringbuf, ctx, num_dwords * sizeof(uint32_t)); - if (ret) - return ret; - - /* Preallocate the olr before touching the ring */ - ret = logical_ring_alloc_request(ring, ctx); - if (ret) - return ret; - - ringbuf->space -= num_dwords * sizeof(uint32_t); - return 0; + if (ring->wa_ctx.obj) { + i915_gem_object_ggtt_unpin(ring->wa_ctx.obj); + drm_gem_object_unreference(&ring->wa_ctx.obj->base); + ring->wa_ctx.obj = NULL; + } } -static int intel_logical_ring_workarounds_emit(struct intel_engine_cs *ring, - struct intel_context *ctx) +static int intel_init_workaround_bb(struct intel_engine_cs *ring) { - int ret, i; - struct intel_ringbuffer *ringbuf = ctx->engine[ring->id].ringbuf; - struct drm_device *dev = ring->dev; - struct drm_i915_private *dev_priv = dev->dev_private; - struct i915_workarounds *w = &dev_priv->workarounds; + int ret; + uint32_t *batch; + uint32_t offset; + struct page *page; + struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx; - if (WARN_ON_ONCE(w->count == 0)) + WARN_ON(ring->id != RCS); + + /* update this when WA for higher Gen are added */ + if (INTEL_INFO(ring->dev)->gen > 9) { + DRM_ERROR("WA batch buffer is not initialized for Gen%d\n", + INTEL_INFO(ring->dev)->gen); return 0; + } - ring->gpu_caches_dirty = true; - ret = logical_ring_flush_all_caches(ringbuf, ctx); - if (ret) - return ret; + /* some WA perform writes to scratch page, ensure it is valid */ + if (ring->scratch.obj == NULL) { + DRM_ERROR("scratch page not allocated for %s\n", ring->name); + return -EINVAL; + } - ret = intel_logical_ring_begin(ringbuf, ctx, w->count * 2 + 2); - if (ret) + ret = lrc_setup_wa_ctx_obj(ring, PAGE_SIZE); + if (ret) { + DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret); return ret; - - intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(w->count)); - for (i = 0; i < w->count; i++) { - intel_logical_ring_emit(ringbuf, w->reg[i].addr); - intel_logical_ring_emit(ringbuf, w->reg[i].value); } - intel_logical_ring_emit(ringbuf, MI_NOOP); - intel_logical_ring_advance(ringbuf); + page = i915_gem_object_get_page(wa_ctx->obj, 0); + batch = kmap_atomic(page); + offset = 0; - ring->gpu_caches_dirty = true; - ret = logical_ring_flush_all_caches(ringbuf, ctx); + if (INTEL_INFO(ring->dev)->gen == 8) { + ret = gen8_init_indirectctx_bb(ring, + &wa_ctx->indirect_ctx, + batch, + &offset); + if (ret) + goto out; + + ret = gen8_init_perctx_bb(ring, + &wa_ctx->per_ctx, + batch, + &offset); + if (ret) + goto out; + } else if (INTEL_INFO(ring->dev)->gen == 9) { + ret = gen9_init_indirectctx_bb(ring, + &wa_ctx->indirect_ctx, + batch, + &offset); + if (ret) + goto out; + + ret = gen9_init_perctx_bb(ring, + &wa_ctx->per_ctx, + batch, + &offset); + if (ret) + goto out; + } + +out: + kunmap_atomic(batch); if (ret) - return ret; + lrc_destroy_wa_ctx_obj(ring); - return 0; + return ret; } static int gen8_init_common_ring(struct intel_engine_cs *ring) { struct drm_device *dev = ring->dev; struct drm_i915_private *dev_priv = dev->dev_private; + u8 next_context_status_buffer_hw; + + lrc_setup_hardware_status_page(ring, + ring->default_context->engine[ring->id].state); I915_WRITE_IMR(ring, ~(ring->irq_enable_mask | ring->irq_keep_mask)); I915_WRITE(RING_HWSTAM(ring->mmio_base), 0xffffffff); @@ -1146,7 +1482,29 @@ static int gen8_init_common_ring(struct intel_engine_cs *ring) _MASKED_BIT_DISABLE(GFX_REPLAY_MODE) | _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE)); POSTING_READ(RING_MODE_GEN7(ring)); - ring->next_context_status_buffer = 0; + + /* + * Instead of resetting the Context Status Buffer (CSB) read pointer to + * zero, we need to read the write pointer from hardware and use its + * value because "this register is power context save restored". + * Effectively, these states have been observed: + * + * | Suspend-to-idle (freeze) | Suspend-to-RAM (mem) | + * BDW | CSB regs not reset | CSB regs reset | + * CHT | CSB regs not reset | CSB regs not reset | + */ + next_context_status_buffer_hw = (I915_READ(RING_CONTEXT_STATUS_PTR(ring)) + & GEN8_CSB_PTR_MASK); + + /* + * When the CSB registers are reset (also after power-up / gpu reset), + * CSB write pointer is set to all 1's, which is not valid, use '5' in + * this special case, so the first element read is CSB[0]. + */ + if (next_context_status_buffer_hw == GEN8_CSB_PTR_MASK) + next_context_status_buffer_hw = (GEN8_CSB_ENTRIES - 1); + + ring->next_context_status_buffer = next_context_status_buffer_hw; DRM_DEBUG_DRIVER("Execlists enabled for %s\n", ring->name); memset(&ring->hangcheck, 0, sizeof(ring->hangcheck)); @@ -1188,19 +1546,68 @@ static int gen9_init_render_ring(struct intel_engine_cs *ring) return init_workarounds_ring(ring); } -static int gen8_emit_bb_start(struct intel_ringbuffer *ringbuf, - struct intel_context *ctx, +static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req) +{ + struct i915_hw_ppgtt *ppgtt = req->ctx->ppgtt; + struct intel_engine_cs *ring = req->ring; + struct intel_ringbuffer *ringbuf = req->ringbuf; + const int num_lri_cmds = GEN8_LEGACY_PDPES * 2; + int i, ret; + + ret = intel_logical_ring_begin(req, num_lri_cmds * 2 + 2); + if (ret) + return ret; + + intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(num_lri_cmds)); + for (i = GEN8_LEGACY_PDPES - 1; i >= 0; i--) { + const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i); + + intel_logical_ring_emit(ringbuf, GEN8_RING_PDP_UDW(ring, i)); + intel_logical_ring_emit(ringbuf, upper_32_bits(pd_daddr)); + intel_logical_ring_emit(ringbuf, GEN8_RING_PDP_LDW(ring, i)); + intel_logical_ring_emit(ringbuf, lower_32_bits(pd_daddr)); + } + + intel_logical_ring_emit(ringbuf, MI_NOOP); + intel_logical_ring_advance(ringbuf); + + return 0; +} + +static int gen8_emit_bb_start(struct drm_i915_gem_request *req, u64 offset, unsigned dispatch_flags) { + struct intel_ringbuffer *ringbuf = req->ringbuf; bool ppgtt = !(dispatch_flags & I915_DISPATCH_SECURE); int ret; - ret = intel_logical_ring_begin(ringbuf, ctx, 4); + /* Don't rely in hw updating PDPs, specially in lite-restore. + * Ideally, we should set Force PD Restore in ctx descriptor, + * but we can't. Force Restore would be a second option, but + * it is unsafe in case of lite-restore (because the ctx is + * not idle). PML4 is allocated during ppgtt init so this is + * not needed in 48-bit.*/ + if (req->ctx->ppgtt && + (intel_ring_flag(req->ring) & req->ctx->ppgtt->pd_dirty_rings)) { + if (!USES_FULL_48BIT_PPGTT(req->i915) && + !intel_vgpu_active(req->i915->dev)) { + ret = intel_logical_ring_emit_pdps(req); + if (ret) + return ret; + } + + req->ctx->ppgtt->pd_dirty_rings &= ~intel_ring_flag(req->ring); + } + + ret = intel_logical_ring_begin(req, 4); if (ret) return ret; /* FIXME(BDW): Address space and security selectors. */ - intel_logical_ring_emit(ringbuf, MI_BATCH_BUFFER_START_GEN8 | (ppgtt<<8)); + intel_logical_ring_emit(ringbuf, MI_BATCH_BUFFER_START_GEN8 | + (ppgtt<<8) | + (dispatch_flags & I915_DISPATCH_RS ? + MI_BATCH_RESOURCE_STREAMER : 0)); intel_logical_ring_emit(ringbuf, lower_32_bits(offset)); intel_logical_ring_emit(ringbuf, upper_32_bits(offset)); intel_logical_ring_emit(ringbuf, MI_NOOP); @@ -1242,18 +1649,18 @@ static void gen8_logical_ring_put_irq(struct intel_engine_cs *ring) spin_unlock_irqrestore(&dev_priv->irq_lock, flags); } -static int gen8_emit_flush(struct intel_ringbuffer *ringbuf, - struct intel_context *ctx, +static int gen8_emit_flush(struct drm_i915_gem_request *request, u32 invalidate_domains, u32 unused) { + struct intel_ringbuffer *ringbuf = request->ringbuf; struct intel_engine_cs *ring = ringbuf->ring; struct drm_device *dev = ring->dev; struct drm_i915_private *dev_priv = dev->dev_private; uint32_t cmd; int ret; - ret = intel_logical_ring_begin(ringbuf, ctx, 4); + ret = intel_logical_ring_begin(request, 4); if (ret) return ret; @@ -1283,13 +1690,14 @@ static int gen8_emit_flush(struct intel_ringbuffer *ringbuf, return 0; } -static int gen8_emit_flush_render(struct intel_ringbuffer *ringbuf, - struct intel_context *ctx, +static int gen8_emit_flush_render(struct drm_i915_gem_request *request, u32 invalidate_domains, u32 flush_domains) { + struct intel_ringbuffer *ringbuf = request->ringbuf; struct intel_engine_cs *ring = ringbuf->ring; u32 scratch_addr = ring->scratch.gtt_offset + 2 * CACHELINE_BYTES; + bool vf_flush_wa; u32 flags = 0; int ret; @@ -1298,6 +1706,8 @@ static int gen8_emit_flush_render(struct intel_ringbuffer *ringbuf, if (flush_domains) { flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; + flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; + flags |= PIPE_CONTROL_FLUSH_ENABLE; } if (invalidate_domains) { @@ -1311,10 +1721,26 @@ static int gen8_emit_flush_render(struct intel_ringbuffer *ringbuf, flags |= PIPE_CONTROL_GLOBAL_GTT_IVB; } - ret = intel_logical_ring_begin(ringbuf, ctx, 6); + /* + * On GEN9+ Before VF_CACHE_INVALIDATE we need to emit a NULL pipe + * control. + */ + vf_flush_wa = INTEL_INFO(ring->dev)->gen >= 9 && + flags & PIPE_CONTROL_VF_CACHE_INVALIDATE; + + ret = intel_logical_ring_begin(request, vf_flush_wa ? 12 : 6); if (ret) return ret; + if (vf_flush_wa) { + intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6)); + intel_logical_ring_emit(ringbuf, 0); + intel_logical_ring_emit(ringbuf, 0); + intel_logical_ring_emit(ringbuf, 0); + intel_logical_ring_emit(ringbuf, 0); + intel_logical_ring_emit(ringbuf, 0); + } + intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6)); intel_logical_ring_emit(ringbuf, flags); intel_logical_ring_emit(ringbuf, scratch_addr); @@ -1336,9 +1762,37 @@ static void gen8_set_seqno(struct intel_engine_cs *ring, u32 seqno) intel_write_status_page(ring, I915_GEM_HWS_INDEX, seqno); } -static int gen8_emit_request(struct intel_ringbuffer *ringbuf, - struct drm_i915_gem_request *request) +static u32 bxt_a_get_seqno(struct intel_engine_cs *ring, bool lazy_coherency) { + + /* + * On BXT A steppings there is a HW coherency issue whereby the + * MI_STORE_DATA_IMM storing the completed request's seqno + * occasionally doesn't invalidate the CPU cache. Work around this by + * clflushing the corresponding cacheline whenever the caller wants + * the coherency to be guaranteed. Note that this cacheline is known + * to be clean at this point, since we only write it in + * bxt_a_set_seqno(), where we also do a clflush after the write. So + * this clflush in practice becomes an invalidate operation. + */ + + if (!lazy_coherency) + intel_flush_status_page(ring, I915_GEM_HWS_INDEX); + + return intel_read_status_page(ring, I915_GEM_HWS_INDEX); +} + +static void bxt_a_set_seqno(struct intel_engine_cs *ring, u32 seqno) +{ + intel_write_status_page(ring, I915_GEM_HWS_INDEX, seqno); + + /* See bxt_a_get_seqno() explaining the reason for the clflush. */ + intel_flush_status_page(ring, I915_GEM_HWS_INDEX); +} + +static int gen8_emit_request(struct drm_i915_gem_request *request) +{ + struct intel_ringbuffer *ringbuf = request->ringbuf; struct intel_engine_cs *ring = ringbuf->ring; u32 cmd; int ret; @@ -1348,7 +1802,7 @@ static int gen8_emit_request(struct intel_ringbuffer *ringbuf, * used as a workaround for not being allowed to do lite * restore with HEAD==TAIL (WaIdleLiteRestore). */ - ret = intel_logical_ring_begin(ringbuf, request->ctx, 8); + ret = intel_logical_ring_begin(request, 8); if (ret) return ret; @@ -1360,11 +1814,10 @@ static int gen8_emit_request(struct intel_ringbuffer *ringbuf, (ring->status_page.gfx_addr + (I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT))); intel_logical_ring_emit(ringbuf, 0); - intel_logical_ring_emit(ringbuf, - i915_gem_request_get_seqno(ring->outstanding_lazy_request)); + intel_logical_ring_emit(ringbuf, i915_gem_request_get_seqno(request)); intel_logical_ring_emit(ringbuf, MI_USER_INTERRUPT); intel_logical_ring_emit(ringbuf, MI_NOOP); - intel_logical_ring_advance_and_submit(ringbuf, request->ctx, request); + intel_logical_ring_advance_and_submit(request); /* * Here we add two extra NOOPs as padding to avoid @@ -1377,49 +1830,53 @@ static int gen8_emit_request(struct intel_ringbuffer *ringbuf, return 0; } -static int intel_lr_context_render_state_init(struct intel_engine_cs *ring, - struct intel_context *ctx) +static int intel_lr_context_render_state_init(struct drm_i915_gem_request *req) { - struct intel_ringbuffer *ringbuf = ctx->engine[ring->id].ringbuf; struct render_state so; - struct drm_i915_file_private *file_priv = ctx->file_priv; - struct drm_file *file = file_priv ? file_priv->file : NULL; int ret; - ret = i915_gem_render_state_prepare(ring, &so); + ret = i915_gem_render_state_prepare(req->ring, &so); if (ret) return ret; if (so.rodata == NULL) return 0; - ret = ring->emit_bb_start(ringbuf, - ctx, - so.ggtt_offset, - I915_DISPATCH_SECURE); + ret = req->ring->emit_bb_start(req, so.ggtt_offset, + I915_DISPATCH_SECURE); + if (ret) + goto out; + + ret = req->ring->emit_bb_start(req, + (so.ggtt_offset + so.aux_batch_offset), + I915_DISPATCH_SECURE); if (ret) goto out; - i915_vma_move_to_active(i915_gem_obj_to_ggtt(so.obj), ring); + i915_vma_move_to_active(i915_gem_obj_to_ggtt(so.obj), req); - ret = __i915_add_request(ring, file, so.obj); - /* intel_logical_ring_add_request moves object to inactive if it - * fails */ out: i915_gem_render_state_fini(&so); return ret; } -static int gen8_init_rcs_context(struct intel_engine_cs *ring, - struct intel_context *ctx) +static int gen8_init_rcs_context(struct drm_i915_gem_request *req) { int ret; - ret = intel_logical_ring_workarounds_emit(ring, ctx); + ret = intel_logical_ring_workarounds_emit(req); if (ret) return ret; - return intel_lr_context_render_state_init(ring, ctx); + ret = intel_rcs_context_init_mocs(req); + /* + * Failing to program the MOCS is non-fatal.The system will not + * run at peak performance. So generate an error and carry on. + */ + if (ret) + DRM_ERROR("MOCS failed to program: expect performance issues.\n"); + + return intel_lr_context_render_state_init(req); } /** @@ -1439,17 +1896,19 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *ring) intel_logical_ring_stop(ring); WARN_ON((I915_READ_MODE(ring) & MODE_IDLE) == 0); - i915_gem_request_assign(&ring->outstanding_lazy_request, NULL); if (ring->cleanup) ring->cleanup(ring); i915_cmd_parser_fini_ring(ring); + i915_gem_batch_pool_fini(&ring->batch_pool); if (ring->status_page.obj) { kunmap(sg_page(ring->status_page.obj->pages->sgl)); ring->status_page.obj = NULL; } + + lrc_destroy_wa_ctx_obj(ring); } static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *ring) @@ -1462,6 +1921,7 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin ring->dev = dev; INIT_LIST_HEAD(&ring->active_list); INIT_LIST_HEAD(&ring->request_list); + i915_gem_batch_pool_init(dev, &ring->batch_pool); init_waitqueue_head(&ring->irq_queue); INIT_LIST_HEAD(&ring->execlist_queue); @@ -1472,7 +1932,21 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin if (ret) return ret; - ret = intel_lr_context_deferred_create(ring->default_context, ring); + ret = intel_lr_context_deferred_alloc(ring->default_context, ring); + if (ret) + return ret; + + /* As this is the default context, always pin it */ + ret = intel_lr_context_do_pin( + ring, + ring->default_context->engine[ring->id].state, + ring->default_context->engine[ring->id].ringbuf); + if (ret) { + DRM_ERROR( + "Failed to pin and map ringbuffer %s: %d\n", + ring->name, ret); + return ret; + } return ret; } @@ -1499,8 +1973,13 @@ static int logical_render_ring_init(struct drm_device *dev) ring->init_hw = gen8_init_render_ring; ring->init_context = gen8_init_rcs_context; ring->cleanup = intel_fini_pipe_control; - ring->get_seqno = gen8_get_seqno; - ring->set_seqno = gen8_set_seqno; + if (IS_BROXTON(dev) && INTEL_REVID(dev) < BXT_REVID_B0) { + ring->get_seqno = bxt_a_get_seqno; + ring->set_seqno = bxt_a_set_seqno; + } else { + ring->get_seqno = gen8_get_seqno; + ring->set_seqno = gen8_set_seqno; + } ring->emit_request = gen8_emit_request; ring->emit_flush = gen8_emit_flush_render; ring->irq_get = gen8_logical_ring_get_irq; @@ -1508,11 +1987,28 @@ static int logical_render_ring_init(struct drm_device *dev) ring->emit_bb_start = gen8_emit_bb_start; ring->dev = dev; - ret = logical_ring_init(dev, ring); + + ret = intel_init_pipe_control(ring); if (ret) return ret; - return intel_init_pipe_control(ring); + ret = intel_init_workaround_bb(ring); + if (ret) { + /* + * We continue even if we fail to initialize WA batch + * because we only expect rare glitches but nothing + * critical to prevent us from using GPU + */ + DRM_ERROR("WA batch buffer initialization failed: %d\n", + ret); + } + + ret = logical_ring_init(dev, ring); + if (ret) { + lrc_destroy_wa_ctx_obj(ring); + } + + return ret; } static int logical_bsd_ring_init(struct drm_device *dev) @@ -1529,8 +2025,13 @@ static int logical_bsd_ring_init(struct drm_device *dev) GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS1_IRQ_SHIFT; ring->init_hw = gen8_init_common_ring; - ring->get_seqno = gen8_get_seqno; - ring->set_seqno = gen8_set_seqno; + if (IS_BROXTON(dev) && INTEL_REVID(dev) < BXT_REVID_B0) { + ring->get_seqno = bxt_a_get_seqno; + ring->set_seqno = bxt_a_set_seqno; + } else { + ring->get_seqno = gen8_get_seqno; + ring->set_seqno = gen8_set_seqno; + } ring->emit_request = gen8_emit_request; ring->emit_flush = gen8_emit_flush; ring->irq_get = gen8_logical_ring_get_irq; @@ -1579,8 +2080,13 @@ static int logical_blt_ring_init(struct drm_device *dev) GT_CONTEXT_SWITCH_INTERRUPT << GEN8_BCS_IRQ_SHIFT; ring->init_hw = gen8_init_common_ring; - ring->get_seqno = gen8_get_seqno; - ring->set_seqno = gen8_set_seqno; + if (IS_BROXTON(dev) && INTEL_REVID(dev) < BXT_REVID_B0) { + ring->get_seqno = bxt_a_get_seqno; + ring->set_seqno = bxt_a_set_seqno; + } else { + ring->get_seqno = gen8_get_seqno; + ring->set_seqno = gen8_set_seqno; + } ring->emit_request = gen8_emit_request; ring->emit_flush = gen8_emit_flush; ring->irq_get = gen8_logical_ring_get_irq; @@ -1604,8 +2110,13 @@ static int logical_vebox_ring_init(struct drm_device *dev) GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VECS_IRQ_SHIFT; ring->init_hw = gen8_init_common_ring; - ring->get_seqno = gen8_get_seqno; - ring->set_seqno = gen8_set_seqno; + if (IS_BROXTON(dev) && INTEL_REVID(dev) < BXT_REVID_B0) { + ring->get_seqno = bxt_a_get_seqno; + ring->set_seqno = bxt_a_set_seqno; + } else { + ring->get_seqno = gen8_get_seqno; + ring->set_seqno = gen8_set_seqno; + } ring->emit_request = gen8_emit_request; ring->emit_flush = gen8_emit_flush; ring->irq_get = gen8_logical_ring_get_irq; @@ -1658,14 +2169,8 @@ int intel_logical_rings_init(struct drm_device *dev) goto cleanup_vebox_ring; } - ret = i915_gem_set_seqno(dev, ((u32)~0 - 0x1000)); - if (ret) - goto cleanup_bsd2_ring; - return 0; -cleanup_bsd2_ring: - intel_logical_ring_cleanup(&dev_priv->ring[VCS2]); cleanup_vebox_ring: intel_logical_ring_cleanup(&dev_priv->ring[VECS]); cleanup_blt_ring: @@ -1751,7 +2256,7 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o /* The second page of the context object contains some fields which must * be set up prior to the first execution. */ - page = i915_gem_object_get_page(ctx_obj, 1); + page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN); reg_state = kmap_atomic(page); /* A context is actually a big batch buffer with several MI_LOAD_REGISTER_IMM @@ -1767,7 +2272,8 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o reg_state[CTX_CONTEXT_CONTROL] = RING_CONTEXT_CONTROL(ring); reg_state[CTX_CONTEXT_CONTROL+1] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH | - CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); + CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT | + CTX_CTRL_RS_CTX_ENABLE); reg_state[CTX_RING_HEAD] = RING_HEAD(ring->mmio_base); reg_state[CTX_RING_HEAD+1] = 0; reg_state[CTX_RING_TAIL] = RING_TAIL(ring->mmio_base); @@ -1792,15 +2298,27 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o reg_state[CTX_SECOND_BB_STATE] = ring->mmio_base + 0x118; reg_state[CTX_SECOND_BB_STATE+1] = 0; if (ring->id == RCS) { - /* TODO: according to BSpec, the register state context - * for CHV does not have these. OTOH, these registers do - * exist in CHV. I'm waiting for a clarification */ reg_state[CTX_BB_PER_CTX_PTR] = ring->mmio_base + 0x1c0; reg_state[CTX_BB_PER_CTX_PTR+1] = 0; reg_state[CTX_RCS_INDIRECT_CTX] = ring->mmio_base + 0x1c4; reg_state[CTX_RCS_INDIRECT_CTX+1] = 0; reg_state[CTX_RCS_INDIRECT_CTX_OFFSET] = ring->mmio_base + 0x1c8; reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = 0; + if (ring->wa_ctx.obj) { + struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx; + uint32_t ggtt_offset = i915_gem_obj_ggtt_offset(wa_ctx->obj); + + reg_state[CTX_RCS_INDIRECT_CTX+1] = + (ggtt_offset + wa_ctx->indirect_ctx.offset * sizeof(uint32_t)) | + (wa_ctx->indirect_ctx.size / CACHELINE_DWORDS); + + reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = + CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6; + + reg_state[CTX_BB_PER_CTX_PTR+1] = + (ggtt_offset + wa_ctx->per_ctx.offset * sizeof(uint32_t)) | + 0x01; + } } reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9); reg_state[CTX_LRI_HEADER_1] |= MI_LRI_FORCE_POSTED; @@ -1814,14 +2332,25 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o reg_state[CTX_PDP1_LDW] = GEN8_RING_PDP_LDW(ring, 1); reg_state[CTX_PDP0_UDW] = GEN8_RING_PDP_UDW(ring, 0); reg_state[CTX_PDP0_LDW] = GEN8_RING_PDP_LDW(ring, 0); - reg_state[CTX_PDP3_UDW+1] = upper_32_bits(ppgtt->pdp.page_directory[3]->daddr); - reg_state[CTX_PDP3_LDW+1] = lower_32_bits(ppgtt->pdp.page_directory[3]->daddr); - reg_state[CTX_PDP2_UDW+1] = upper_32_bits(ppgtt->pdp.page_directory[2]->daddr); - reg_state[CTX_PDP2_LDW+1] = lower_32_bits(ppgtt->pdp.page_directory[2]->daddr); - reg_state[CTX_PDP1_UDW+1] = upper_32_bits(ppgtt->pdp.page_directory[1]->daddr); - reg_state[CTX_PDP1_LDW+1] = lower_32_bits(ppgtt->pdp.page_directory[1]->daddr); - reg_state[CTX_PDP0_UDW+1] = upper_32_bits(ppgtt->pdp.page_directory[0]->daddr); - reg_state[CTX_PDP0_LDW+1] = lower_32_bits(ppgtt->pdp.page_directory[0]->daddr); + + if (USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) { + /* 64b PPGTT (48bit canonical) + * PDP0_DESCRIPTOR contains the base address to PML4 and + * other PDP Descriptors are ignored. + */ + ASSIGN_CTX_PML4(ppgtt, reg_state); + } else { + /* 32b PPGTT + * PDP*_DESCRIPTOR contains the base address of space supported. + * With dynamic page allocation, PDPs may not be allocated at + * this point. Point the unallocated PDPs to the scratch page + */ + ASSIGN_CTX_PDP(ppgtt, reg_state, 3); + ASSIGN_CTX_PDP(ppgtt, reg_state, 2); + ASSIGN_CTX_PDP(ppgtt, reg_state, 1); + ASSIGN_CTX_PDP(ppgtt, reg_state, 0); + } + if (ring->id == RCS) { reg_state[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1); reg_state[CTX_R_PWR_CLK_STATE] = GEN8_R_PWR_CLK_STATE; @@ -1862,8 +2391,7 @@ void intel_lr_context_free(struct intel_context *ctx) i915_gem_object_ggtt_unpin(ctx_obj); } WARN_ON(ctx->engine[ring->id].pin_count); - intel_destroy_ringbuffer_obj(ringbuf); - kfree(ringbuf); + intel_ringbuffer_free(ringbuf); drm_gem_object_unreference(&ctx_obj->base); } } @@ -1897,12 +2425,13 @@ static void lrc_setup_hardware_status_page(struct intel_engine_cs *ring, struct drm_i915_gem_object *default_ctx_obj) { struct drm_i915_private *dev_priv = ring->dev->dev_private; + struct page *page; - /* The status page is offset 0 from the default context object - * in LRC mode. */ - ring->status_page.gfx_addr = i915_gem_obj_ggtt_offset(default_ctx_obj); - ring->status_page.page_addr = - kmap(sg_page(default_ctx_obj->pages->sgl)); + /* The HWSP is part of the default context object in LRC mode. */ + ring->status_page.gfx_addr = i915_gem_obj_ggtt_offset(default_ctx_obj) + + LRC_PPHWSP_PN * PAGE_SIZE; + page = i915_gem_object_get_page(default_ctx_obj, LRC_PPHWSP_PN); + ring->status_page.page_addr = kmap(page); ring->status_page.obj = default_ctx_obj; I915_WRITE(RING_HWS_PGA(ring->mmio_base), @@ -1911,7 +2440,7 @@ static void lrc_setup_hardware_status_page(struct intel_engine_cs *ring, } /** - * intel_lr_context_deferred_create() - create the LRC specific bits of a context + * intel_lr_context_deferred_alloc() - create the LRC specific bits of a context * @ctx: LR context to create. * @ring: engine to be used with the context. * @@ -1923,10 +2452,10 @@ static void lrc_setup_hardware_status_page(struct intel_engine_cs *ring, * * Return: non-zero on error. */ -int intel_lr_context_deferred_create(struct intel_context *ctx, + +int intel_lr_context_deferred_alloc(struct intel_context *ctx, struct intel_engine_cs *ring) { - const bool is_global_default_ctx = (ctx == ring->default_context); struct drm_device *dev = ring->dev; struct drm_i915_gem_object *ctx_obj; uint32_t context_size; @@ -1938,99 +2467,58 @@ int intel_lr_context_deferred_create(struct intel_context *ctx, context_size = round_up(get_lr_context_size(ring), 4096); - ctx_obj = i915_gem_alloc_context_obj(dev, context_size); - if (IS_ERR(ctx_obj)) { - ret = PTR_ERR(ctx_obj); - DRM_DEBUG_DRIVER("Alloc LRC backing obj failed: %d\n", ret); - return ret; - } - - if (is_global_default_ctx) { - ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN, 0); - if (ret) { - DRM_DEBUG_DRIVER("Pin LRC backing obj failed: %d\n", - ret); - drm_gem_object_unreference(&ctx_obj->base); - return ret; - } - } + /* One extra page as the sharing data between driver and GuC */ + context_size += PAGE_SIZE * LRC_PPHWSP_PN; - ringbuf = kzalloc(sizeof(*ringbuf), GFP_KERNEL); - if (!ringbuf) { - DRM_DEBUG_DRIVER("Failed to allocate ringbuffer %s\n", - ring->name); - ret = -ENOMEM; - goto error_unpin_ctx; + ctx_obj = i915_gem_alloc_object(dev, context_size); + if (!ctx_obj) { + DRM_DEBUG_DRIVER("Alloc LRC backing obj failed.\n"); + return -ENOMEM; } - ringbuf->ring = ring; - - ringbuf->size = 32 * PAGE_SIZE; - ringbuf->effective_size = ringbuf->size; - ringbuf->head = 0; - ringbuf->tail = 0; - ringbuf->last_retired_head = -1; - intel_ring_update_space(ringbuf); - - if (ringbuf->obj == NULL) { - ret = intel_alloc_ringbuffer_obj(dev, ringbuf); - if (ret) { - DRM_DEBUG_DRIVER( - "Failed to allocate ringbuffer obj %s: %d\n", - ring->name, ret); - goto error_free_rbuf; - } - - if (is_global_default_ctx) { - ret = intel_pin_and_map_ringbuffer_obj(dev, ringbuf); - if (ret) { - DRM_ERROR( - "Failed to pin and map ringbuffer %s: %d\n", - ring->name, ret); - goto error_destroy_rbuf; - } - } - + ringbuf = intel_engine_create_ringbuffer(ring, 4 * PAGE_SIZE); + if (IS_ERR(ringbuf)) { + ret = PTR_ERR(ringbuf); + goto error_deref_obj; } ret = populate_lr_context(ctx, ctx_obj, ring, ringbuf); if (ret) { DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret); - goto error; + goto error_ringbuf; } ctx->engine[ring->id].ringbuf = ringbuf; ctx->engine[ring->id].state = ctx_obj; - if (ctx == ring->default_context) - lrc_setup_hardware_status_page(ring, ctx_obj); - else if (ring->id == RCS && !ctx->rcs_initialized) { - if (ring->init_context) { - ret = ring->init_context(ring, ctx); - if (ret) { - DRM_ERROR("ring init context: %d\n", ret); - ctx->engine[ring->id].ringbuf = NULL; - ctx->engine[ring->id].state = NULL; - goto error; - } + if (ctx != ring->default_context && ring->init_context) { + struct drm_i915_gem_request *req; + + ret = i915_gem_request_alloc(ring, + ctx, &req); + if (ret) { + DRM_ERROR("ring create req: %d\n", + ret); + goto error_ringbuf; } - ctx->rcs_initialized = true; + ret = ring->init_context(req); + if (ret) { + DRM_ERROR("ring init context: %d\n", + ret); + i915_gem_request_cancel(req); + goto error_ringbuf; + } + i915_add_request_no_flush(req); } - return 0; -error: - if (is_global_default_ctx) - intel_unpin_ringbuffer_obj(ringbuf); -error_destroy_rbuf: - intel_destroy_ringbuffer_obj(ringbuf); -error_free_rbuf: - kfree(ringbuf); -error_unpin_ctx: - if (is_global_default_ctx) - i915_gem_object_ggtt_unpin(ctx_obj); +error_ringbuf: + intel_ringbuffer_free(ringbuf); +error_deref_obj: drm_gem_object_unreference(&ctx_obj->base); + ctx->engine[ring->id].ringbuf = NULL; + ctx->engine[ring->id].state = NULL; return ret; } @@ -2056,7 +2544,7 @@ void intel_lr_context_reset(struct drm_device *dev, WARN(1, "Failed get_pages for context obj\n"); continue; } - page = i915_gem_object_get_page(ctx_obj, 1); + page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN); reg_state = kmap_atomic(page); reg_state[CTX_RING_HEAD+1] = 0;