From: Marek Olšák <marek.ol...@amd.com> so that we don't rely on si_pm4_state_enabled_and_changed, allowing us to move prefetches after draw calls. --- src/gallium/drivers/radeonsi/si_cp_dma.c | 16 ++++++++-------- src/gallium/drivers/radeonsi/si_descriptors.c | 3 +-- src/gallium/drivers/radeonsi/si_hw_context.c | 16 ++++++++++++++-- src/gallium/drivers/radeonsi/si_pipe.h | 10 +++++++++- src/gallium/drivers/radeonsi/si_state_draw.c | 2 +- src/gallium/drivers/radeonsi/si_state_shaders.c | 16 ++++++++++++++-- 6 files changed, 47 insertions(+), 16 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 24fa6fd..21202b3 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -444,38 +444,38 @@ static void cik_prefetch_shader_async(struct si_context *sctx, { struct pipe_resource *bo = &state->bo[0]->b.b; assert(state->nbo == 1); cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0); } void cik_emit_prefetch_L2(struct si_context *sctx) { /* Prefetch shaders and VBO descriptors to TC L2. */ - if (si_pm4_state_enabled_and_changed(sctx, ls)) + if (sctx->prefetch_L2_mask & SI_PREFETCH_LS) cik_prefetch_shader_async(sctx, sctx->queued.named.ls); - if (si_pm4_state_enabled_and_changed(sctx, hs)) + if (sctx->prefetch_L2_mask & SI_PREFETCH_HS) cik_prefetch_shader_async(sctx, sctx->queued.named.hs); - if (si_pm4_state_enabled_and_changed(sctx, es)) + if (sctx->prefetch_L2_mask & SI_PREFETCH_ES) cik_prefetch_shader_async(sctx, sctx->queued.named.es); - if (si_pm4_state_enabled_and_changed(sctx, gs)) + if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (si_pm4_state_enabled_and_changed(sctx, vs)) + if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); /* Vertex buffer descriptors are uploaded uncached, so prefetch * them right after the VS binary. */ - if (sctx->vertex_buffer_pointer_dirty) { + if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) { cik_prefetch_TC_L2_async(sctx, &sctx->vertex_buffers.buffer->b.b, sctx->vertex_buffers.buffer_offset, sctx->vertex_elements->desc_list_byte_size); } - if (si_pm4_state_enabled_and_changed(sctx, ps)) + if (sctx->prefetch_L2_mask & SI_PREFETCH_PS) cik_prefetch_shader_async(sctx, sctx->queued.named.ps); - sctx->prefetch_L2 = false; + sctx->prefetch_L2_mask = 0; } void si_init_cp_dma_functions(struct si_context *sctx) { sctx->b.clear_buffer = si_clear_buffer; } diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 917b0e1..43f1792 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -1169,24 +1169,23 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) (struct r600_resource*)vb->buffer.resource, RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); } } /* Don't flush the const cache. It would have a very negative effect * on performance (confirmed by testing). New descriptors are always * uploaded to a fresh new buffer, so I don't think flushing the const * cache is needed. */ si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom); - if (sctx->b.chip_class >= CIK) - sctx->prefetch_L2 = true; sctx->vertex_buffers_dirty = false; sctx->vertex_buffer_pointer_dirty = true; + sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS; return true; } /* CONSTANT BUFFERS */ static unsigned si_const_and_shader_buffer_descriptors_idx(unsigned shader) { return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS + diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c index 756b159..3582cd7 100644 --- a/src/gallium/drivers/radeonsi/si_hw_context.c +++ b/src/gallium/drivers/radeonsi/si_hw_context.c @@ -209,22 +209,34 @@ void si_begin_new_cs(struct si_context *ctx) si_pm4_emit(ctx, ctx->init_config_gs_rings); if (ctx->ce_preamble_ib) si_ce_enable_loads(ctx->ce_preamble_ib); else if (ctx->ce_ib) si_ce_enable_loads(ctx->ce_ib); if (ctx->ce_ib) si_ce_restore_all_descriptors_at_ib_start(ctx); - if (ctx->b.chip_class >= CIK) - ctx->prefetch_L2 = true; + if (ctx->queued.named.ls) + ctx->prefetch_L2_mask |= SI_PREFETCH_LS; + if (ctx->queued.named.hs) + ctx->prefetch_L2_mask |= SI_PREFETCH_HS; + if (ctx->queued.named.es) + ctx->prefetch_L2_mask |= SI_PREFETCH_ES; + if (ctx->queued.named.gs) + ctx->prefetch_L2_mask |= SI_PREFETCH_GS; + if (ctx->queued.named.vs) + ctx->prefetch_L2_mask |= SI_PREFETCH_VS; + if (ctx->queued.named.ps) + ctx->prefetch_L2_mask |= SI_PREFETCH_PS; + if (ctx->vertex_buffers.buffer) + ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS; /* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */ ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs); /* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */ ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL; /* This should always be marked as dirty to set the framebuffer scissor * at least. */ si_mark_atom_dirty(ctx, &ctx->framebuffer.atom); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index d213886..62b64e1 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -61,20 +61,28 @@ /* Framebuffer caches. */ #define SI_CONTEXT_FLUSH_AND_INV_DB (R600_CONTEXT_PRIVATE_FLAG << 7) #define SI_CONTEXT_FLUSH_AND_INV_CB (R600_CONTEXT_PRIVATE_FLAG << 8) /* Engine synchronization. */ #define SI_CONTEXT_VS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 9) #define SI_CONTEXT_PS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 10) #define SI_CONTEXT_CS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 11) #define SI_CONTEXT_VGT_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 12) #define SI_CONTEXT_VGT_STREAMOUT_SYNC (R600_CONTEXT_PRIVATE_FLAG << 13) +#define SI_PREFETCH_VBO_DESCRIPTORS (1 << 0) +#define SI_PREFETCH_LS (1 << 1) +#define SI_PREFETCH_HS (1 << 2) +#define SI_PREFETCH_ES (1 << 3) +#define SI_PREFETCH_GS (1 << 4) +#define SI_PREFETCH_VS (1 << 5) +#define SI_PREFETCH_PS (1 << 6) + #define SI_MAX_BORDER_COLORS 4096 #define SIX_BITS 0x3F struct si_compute; struct hash_table; struct u_suballocator; struct si_screen { struct r600_common_screen b; unsigned gs_table_depth; @@ -272,25 +280,25 @@ struct si_context { struct si_shader_ctx_state fixed_func_tcs_shader; struct r600_resource *wait_mem_scratch; unsigned wait_mem_number; struct radeon_winsys_cs *ce_ib; struct radeon_winsys_cs *ce_preamble_ib; struct r600_resource *ce_ram_saved_buffer; struct u_suballocator *ce_suballocator; unsigned ce_ram_saved_offset; uint16_t total_ce_ram_allocated; + uint16_t prefetch_L2_mask; bool ce_need_synchronization:1; bool gfx_flush_in_progress:1; bool compute_is_busy:1; - bool prefetch_L2:1; /* Atoms (direct states). */ union si_state_atoms atoms; unsigned dirty_atoms; /* mask */ /* PM4 states (precomputed immutable states) */ unsigned dirty_states; union si_state queued; union si_state emitted; /* Atom declarations. */ diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 3f933fe..c78450c 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -1339,21 +1339,21 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) /* GFX9 scissor bug workaround. There is also a more efficient but * more involved alternative workaround. */ if (sctx->b.chip_class == GFX9 && si_is_atom_dirty(sctx, &sctx->b.scissors.atom)) sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; /* Flush caches before the first state atom, which does L2 prefetches. */ if (sctx->b.flags) si_emit_cache_flush(sctx); - if (sctx->prefetch_L2) + if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask) cik_emit_prefetch_L2(sctx); /* Emit state atoms. */ mask = sctx->dirty_atoms; while (mask) { struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)]; atom->emit(&sctx->b, atom); } sctx->dirty_atoms = 0; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index cb5a23e..5b8f907 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -3300,22 +3300,34 @@ bool si_update_shaders(struct si_context *sctx) if (si_pm4_state_enabled_and_changed(sctx, ls) || si_pm4_state_enabled_and_changed(sctx, hs) || si_pm4_state_enabled_and_changed(sctx, es) || si_pm4_state_enabled_and_changed(sctx, gs) || si_pm4_state_enabled_and_changed(sctx, vs) || si_pm4_state_enabled_and_changed(sctx, ps)) { if (!si_update_spi_tmpring_size(sctx)) return false; } - if (sctx->b.chip_class >= CIK) - sctx->prefetch_L2 = true; + if (sctx->b.chip_class >= CIK) { + if (si_pm4_state_enabled_and_changed(sctx, ls)) + sctx->prefetch_L2_mask |= SI_PREFETCH_LS; + if (si_pm4_state_enabled_and_changed(sctx, hs)) + sctx->prefetch_L2_mask |= SI_PREFETCH_HS; + if (si_pm4_state_enabled_and_changed(sctx, es)) + sctx->prefetch_L2_mask |= SI_PREFETCH_ES; + if (si_pm4_state_enabled_and_changed(sctx, gs)) + sctx->prefetch_L2_mask |= SI_PREFETCH_GS; + if (si_pm4_state_enabled_and_changed(sctx, vs)) + sctx->prefetch_L2_mask |= SI_PREFETCH_VS; + if (si_pm4_state_enabled_and_changed(sctx, ps)) + sctx->prefetch_L2_mask |= SI_PREFETCH_PS; + } sctx->do_update_shaders = false; return true; } static void si_emit_scratch_state(struct si_context *sctx, struct r600_atom *atom) { struct radeon_winsys_cs *cs = sctx->b.gfx.cs; -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev