From: Marek Olšák <marek.ol...@amd.com>

I'd like to be able to move the prefetch call site around.
---
 src/gallium/drivers/radeonsi/si_cp_dma.c | 7 +++----
 src/gallium/drivers/radeonsi/si_descriptors.c | 2 +-
 src/gallium/drivers/radeonsi/si_hw_context.c | 2 +-
 src/gallium/drivers/radeonsi/si_pipe.h | 3 ++-
 src/gallium/drivers/radeonsi/si_state.h | 1 -
 src/gallium/drivers/radeonsi/si_state_draw.c | 3 +++
 src/gallium/drivers/radeonsi/si_state_shaders.c | 2 +-
 7 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index e42f260..9f0e506 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -443,21 +443,21 @@ static void cik_prefetch_shader_async(struct si_context *sctx, struct si_pm4_state *state) { if (state) { struct pipe_resource *bo = &state->bo[0]->b.b; assert(state->nbo == 1); cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0); } } -static void cik_emit_prefetch_L2(struct si_context *sctx, struct r600_atom *atom) +void cik_emit_prefetch_L2(struct si_context *sctx) { /* Prefetch shaders and VBO descriptors to TC L2. */ if (si_pm4_state_changed(sctx, ls)) cik_prefetch_shader_async(sctx, sctx->queued.named.ls); if (si_pm4_state_changed(sctx, hs)) cik_prefetch_shader_async(sctx, sctx->queued.named.hs); if (si_pm4_state_changed(sctx, es)) cik_prefetch_shader_async(sctx, sctx->queued.named.es); if (si_pm4_state_changed(sctx, gs)) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); @@ -466,19 +466,18 @@ static void cik_emit_prefetch_L2(struct si_context *sctx, struct r600_atom *atom /* Vertex buffer descriptors are uploaded uncached, so prefetch * them right after the VS binary. 
*/ if (sctx->vertex_buffer_pointer_dirty) { cik_prefetch_TC_L2_async(sctx, &sctx->vertex_buffers.buffer->b.b, sctx->vertex_buffers.buffer_offset, sctx->vertex_elements->desc_list_byte_size); } if (si_pm4_state_changed(sctx, ps)) cik_prefetch_shader_async(sctx, sctx->queued.named.ps); + + sctx->prefetch_L2 = false; } void si_init_cp_dma_functions(struct si_context *sctx) { sctx->b.clear_buffer = si_clear_buffer; - - si_init_atom(sctx, &sctx->prefetch_L2, &sctx->atoms.s.prefetch_L2, - cik_emit_prefetch_L2); } diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index ea5b89e..917b0e1 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -1170,21 +1170,21 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); } } /* Don't flush the const cache. It would have a very negative effect * on performance (confirmed by testing). New descriptors are always * uploaded to a fresh new buffer, so I don't think flushing the const * cache is needed. 
*/ si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom); if (sctx->b.chip_class >= CIK) - si_mark_atom_dirty(sctx, &sctx->prefetch_L2); + sctx->prefetch_L2 = true; sctx->vertex_buffers_dirty = false; sctx->vertex_buffer_pointer_dirty = true; return true; } /* CONSTANT BUFFERS */ static unsigned si_const_and_shader_buffer_descriptors_idx(unsigned shader) diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c index f2dfcc7..756b159 100644 --- a/src/gallium/drivers/radeonsi/si_hw_context.c +++ b/src/gallium/drivers/radeonsi/si_hw_context.c @@ -210,21 +210,21 @@ void si_begin_new_cs(struct si_context *ctx) if (ctx->ce_preamble_ib) si_ce_enable_loads(ctx->ce_preamble_ib); else if (ctx->ce_ib) si_ce_enable_loads(ctx->ce_ib); if (ctx->ce_ib) si_ce_restore_all_descriptors_at_ib_start(ctx); if (ctx->b.chip_class >= CIK) - si_mark_atom_dirty(ctx, &ctx->prefetch_L2); + ctx->prefetch_L2 = true; /* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */ ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs); /* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */ ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL; /* This should always be marked as dirty to set the framebuffer scissor * at least. */ si_mark_atom_dirty(ctx, &ctx->framebuffer.atom); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 1984299..d213886 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -276,31 +276,31 @@ struct si_context { struct radeon_winsys_cs *ce_ib; struct radeon_winsys_cs *ce_preamble_ib; struct r600_resource *ce_ram_saved_buffer; struct u_suballocator *ce_suballocator; unsigned ce_ram_saved_offset; uint16_t total_ce_ram_allocated; bool ce_need_synchronization:1; bool gfx_flush_in_progress:1; bool compute_is_busy:1; + bool prefetch_L2:1; /* Atoms (direct states). 
*/ union si_state_atoms atoms; unsigned dirty_atoms; /* mask */ /* PM4 states (precomputed immutable states) */ unsigned dirty_states; union si_state queued; union si_state emitted; /* Atom declarations. */ - struct r600_atom prefetch_L2; struct si_framebuffer framebuffer; struct si_sample_locs msaa_sample_locs; struct r600_atom db_render_state; struct r600_atom msaa_config; struct si_sample_mask sample_mask; struct r600_atom cb_render_state; unsigned last_cb_target_mask; struct si_blend_color blend_color; struct r600_atom clip_regs; struct si_clip_state clip_state; @@ -477,20 +477,21 @@ void si_resource_copy_region(struct pipe_context *ctx, SI_CPDMA_SKIP_SYNC_BEFORE | \ SI_CPDMA_SKIP_GFX_SYNC | \ SI_CPDMA_SKIP_BO_LIST_UPDATE) void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned user_flags); void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset, unsigned size); +void cik_emit_prefetch_L2(struct si_context *sctx); void si_init_cp_dma_functions(struct si_context *sctx); /* si_debug.c */ void si_init_debug_functions(struct si_context *sctx); void si_check_vm_faults(struct r600_common_context *ctx, struct radeon_saved_cs *saved, enum ring_type ring); bool si_replace_shader(unsigned num, struct ac_shader_binary *binary); /* si_dma.c */ void si_init_dma_functions(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index acc8fb7..9fbede7 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -134,21 +134,20 @@ union si_state { struct si_pm4_state *ps; } named; struct si_pm4_state *array[0]; }; #define SI_NUM_STATES (sizeof(union si_state) / sizeof(struct si_pm4_state *)) union si_state_atoms { struct { /* The order matters. 
*/ - struct r600_atom *prefetch_L2; struct r600_atom *render_cond; struct r600_atom *streamout_begin; struct r600_atom *streamout_enable; /* must be after streamout_begin */ struct r600_atom *framebuffer; struct r600_atom *msaa_sample_locs; struct r600_atom *db_render_state; struct r600_atom *msaa_config; struct r600_atom *sample_mask; struct r600_atom *cb_render_state; struct r600_atom *blend_color; diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 5254645..3f933fe 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -1339,20 +1339,23 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) /* GFX9 scissor bug workaround. There is also a more efficient but * more involved alternative workaround. */ if (sctx->b.chip_class == GFX9 && si_is_atom_dirty(sctx, &sctx->b.scissors.atom)) sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; /* Flush caches before the first state atom, which does L2 prefetches. */ if (sctx->b.flags) si_emit_cache_flush(sctx); + if (sctx->prefetch_L2) + cik_emit_prefetch_L2(sctx); + /* Emit state atoms. */ mask = sctx->dirty_atoms; while (mask) { struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)]; atom->emit(&sctx->b, atom); } sctx->dirty_atoms = 0; /* Emit states. 
*/ diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index daf4af5..0dd6402 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -3301,21 +3301,21 @@ bool si_update_shaders(struct si_context *sctx) si_pm4_state_changed(sctx, hs) || si_pm4_state_changed(sctx, es) || si_pm4_state_changed(sctx, gs) || si_pm4_state_changed(sctx, vs) || si_pm4_state_changed(sctx, ps)) { if (!si_update_spi_tmpring_size(sctx)) return false; } if (sctx->b.chip_class >= CIK) - si_mark_atom_dirty(sctx, &sctx->prefetch_L2); + sctx->prefetch_L2 = true; sctx->do_update_shaders = false; return true; } static void si_emit_scratch_state(struct si_context *sctx, struct r600_atom *atom) { struct radeon_winsys_cs *cs = sctx->b.gfx.cs; -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev