From: Marek Olšák <marek.ol...@amd.com>

Process new SET packets in parallel with previous draw calls.
This decreases [CP busy / SPI busy] by a very tiny amount (verified
with GRBM perf counters), and probably increases FPS by a very tiny
amount for apps that do pipeline syncs often.
---
 src/gallium/drivers/radeonsi/si_state_draw.c | 54 ++++++++++++++++++++++++----
 1 file changed, 48 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index ae48115..06a18c1 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -1173,30 +1173,31 @@ static bool si_cache_flush_and_prefetch(struct si_context *sctx)
 	 */
 	if (!si_upload_graphics_shader_descriptors(sctx))
 		return false;
 
 	if (sctx->prefetch_L2)
 		cik_emit_prefetch_L2(sctx);
 
 	return true;
 }
 
-static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info)
+static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
+			       unsigned skip_atom_mask)
 {
 	/* Emit state atoms. */
-	unsigned mask = sctx->dirty_atoms;
+	unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
 	while (mask) {
 		struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];
 
 		atom->emit(&sctx->b, atom);
 	}
-	sctx->dirty_atoms = 0;
+	sctx->dirty_atoms &= skip_atom_mask;
 
 	/* Emit states. */
 	mask = sctx->dirty_states;
 	while (mask) {
 		unsigned i = u_bit_scan(&mask);
 		struct si_pm4_state *state = sctx->queued.array[i];
 
 		if (!state || sctx->emitted.array[i] == state)
 			continue;
@@ -1384,23 +1385,64 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	 */
 	if (!si_upload_vertex_buffer_descriptors(sctx))
 		return;
 
 	/* GFX9 scissor bug workaround. There is also a more efficient but
 	 * more involved alternative workaround. */
 	if (sctx->b.chip_class == GFX9 &&
 	    si_is_atom_dirty(sctx, &sctx->b.scissors.atom))
 		sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
 
-	if (!si_cache_flush_and_prefetch(sctx))
-		return;
-	si_emit_all_states(sctx, info);
+	/* Use an optimal packet order based on whether we need to sync the pipeline. */
+	if (unlikely(sctx->b.flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
+				      SI_CONTEXT_FLUSH_AND_INV_DB |
+				      SI_CONTEXT_PS_PARTIAL_FLUSH |
+				      SI_CONTEXT_CS_PARTIAL_FLUSH))) {
+		/* If we have to wait for idle, set all states first, so that all
+		 * SET packets are processed in parallel with previous draw calls.
+		 * Sequence:
+		 * - process SET packets except SET_SH packets for shader pointers
+		 * - flush caches and wait for previous draw calls
+		 * - start CE dumps (might already be ongoing if there is no CE-DE barrier)
+		 * - start prefetches
+		 * - process SET_SH packets for shader pointers
+		 * - wait for CE dumps
+		 * - draw
+		 */
+		struct r600_atom *shader_pointers = &sctx->shader_userdata.atom;
+
+		/* Emit all states except shader pointers. */
+		si_emit_all_states(sctx, info, 1 << shader_pointers->id);
+
+		if (!si_cache_flush_and_prefetch(sctx))
+			return;
+
+		/* Set shader pointers last. */
+		if (si_is_atom_dirty(sctx, shader_pointers)) {
+			shader_pointers->emit(&sctx->b, NULL);
+			sctx->dirty_atoms = 0;
+		}
+	} else {
+		/* If we don't wait for idle, do CE dumps and start prefetches
+		 * first, so that they are being done in parallel with all SET
+		 * packets.
+		 * Sequence:
+		 * - flush caches
+		 * - start CE dumps (might already be ongoing if CE is ahead)
+		 * - start prefetches
+		 * - process SET packets
+		 * - wait for CE dumps
+		 * - draw
+		 */
+		if (!si_cache_flush_and_prefetch(sctx))
+			return;
+
+		si_emit_all_states(sctx, info, 0);
+	}
 
 	si_ce_pre_draw_synchronization(sctx);
 	si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
 	si_ce_post_draw_synchronization(sctx);
 
 	if (sctx->trace_buf)
 		si_trace_emit(sctx);
 
 	/* Workaround for a VGT hang when streamout is enabled.
 	 * It must be done after drawing. */
-- 
2.7.4
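
For anyone skimming the patch, here is a minimal standalone sketch of the
skip-mask pattern that the new si_emit_all_states() parameter relies on.
This is an illustrative toy, not driver code: the atom struct, the
emit_all_states() helper and the demo atom names are hypothetical
stand-ins for radeonsi's r600_atom machinery, with ffs() playing the
role of u_bit_scan().

#include <stdint.h>
#include <stdio.h>
#include <strings.h> /* ffs() */

struct atom {
	const char *name;
	void (*emit)(const struct atom *atom);
};

static void print_atom(const struct atom *atom)
{
	printf("SET packets for %s\n", atom->name);
}

/* Emit every dirty atom whose bit is not in skip_mask. Skipped bits
 * stay dirty, so the caller can emit them after a sync point. */
static void emit_all_states(const struct atom *atoms, uint32_t *dirty,
			    uint32_t skip_mask)
{
	uint32_t mask = *dirty & ~skip_mask;

	while (mask) {
		int i = ffs(mask) - 1; /* index of the lowest set bit */

		mask &= mask - 1; /* clear that bit */
		atoms[i].emit(&atoms[i]);
	}
	*dirty &= skip_mask; /* only the skipped atoms remain dirty */
}

int main(void)
{
	const struct atom atoms[] = {
		{ "scissors", print_atom },
		{ "shader pointers", print_atom }, /* the deferred atom */
		{ "framebuffer", print_atom },
	};
	uint32_t dirty = 0x7;    /* all three atoms dirty */
	uint32_t skip = 1u << 1; /* defer the shader-pointer SET_SH packets */

	emit_all_states(atoms, &dirty, skip);
	printf("flush caches, wait for idle, prefetch\n");

	if (dirty & skip) { /* emit the deferred atom right before the draw */
		atoms[1].emit(&atoms[1]);
		dirty = 0;
	}
	printf("draw\n");
	return 0;
}

The point of the extra parameter shows up here: with the shader-pointer
atom masked out, every other SET packet lands in the command stream
before the wait-for-idle, so the CP can process them while previous
draws are still in flight, and only the cheap SET_SH packets sit
between the wait and the draw.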