From: Marek Olšák <marek.ol...@amd.com>

Process new SET packets in parallel with previous draw calls.

This decreases the [CP busy / SPI busy] ratio by a very small amount
(verified with GRBM perf counters), and probably increases FPS slightly
for apps that sync the pipeline often.
---
 src/gallium/drivers/radeonsi/si_state_draw.c | 54 ++++++++++++++++++++++++----
 1 file changed, 48 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index ae48115..06a18c1 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -1173,30 +1173,31 @@ static bool si_cache_flush_and_prefetch(struct si_context *sctx)
         */
        if (!si_upload_graphics_shader_descriptors(sctx))
                return false;
 
        if (sctx->prefetch_L2)
                cik_emit_prefetch_L2(sctx);
 
        return true;
 }
 
-static void si_emit_all_states(struct si_context *sctx, const struct 
pipe_draw_info *info)
+static void si_emit_all_states(struct si_context *sctx, const struct 
pipe_draw_info *info,
+                              unsigned skip_atom_mask)
 {
        /* Emit state atoms. */
-       unsigned mask = sctx->dirty_atoms;
+       unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
        while (mask) {
                struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];
 
                atom->emit(&sctx->b, atom);
        }
-       sctx->dirty_atoms = 0;
+       sctx->dirty_atoms &= skip_atom_mask;
 
        /* Emit states. */
        mask = sctx->dirty_states;
        while (mask) {
                unsigned i = u_bit_scan(&mask);
                struct si_pm4_state *state = sctx->queued.array[i];
 
                if (!state || sctx->emitted.array[i] == state)
                        continue;
 
@@ -1384,23 +1385,64 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
         */
        if (!si_upload_vertex_buffer_descriptors(sctx))
                return;
 
        /* GFX9 scissor bug workaround. There is also a more efficient but
         * more involved alternative workaround. */
        if (sctx->b.chip_class == GFX9 &&
            si_is_atom_dirty(sctx, &sctx->b.scissors.atom))
                sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
 
-       if (!si_cache_flush_and_prefetch(sctx))
-               return;
-       si_emit_all_states(sctx, info);
+       /* Use an optimal packet order based on whether we need to sync the 
pipeline. */
+       if (unlikely(sctx->b.flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
+                                     SI_CONTEXT_FLUSH_AND_INV_DB |
+                                     SI_CONTEXT_PS_PARTIAL_FLUSH |
+                                     SI_CONTEXT_CS_PARTIAL_FLUSH))) {
+               /* If we have to wait for idle, set all states first, so that 
all
+                * SET packets are processed in parallel with previous draw 
calls.
+                * Sequence:
+                * - process SET packets except SET_SH packets for shader 
pointers
+                * - flush caches and wait for previous draw calls
+                * - start CE dumps (might already be ongoing if there is no 
CE-DE barrier)
+                * - start prefetches
+                * - process SET_SH packets for shader pointers
+                * - wait for CE dumps
+                * - draw
+                */
+               struct r600_atom *shader_pointers = &sctx->shader_userdata.atom;
+
+               /* Emit all states except shader pointers. */
+               si_emit_all_states(sctx, info, 1 << shader_pointers->id);
+
+               if (!si_cache_flush_and_prefetch(sctx))
+                       return;
+
+               /* Set shader pointers last. */
+               if (si_is_atom_dirty(sctx, shader_pointers)) {
+                       shader_pointers->emit(&sctx->b, NULL);
+                       sctx->dirty_atoms = 0;
+               }
+       } else {
+               /* If we don't wait for idle, do CE dumps and start prefetches
+                * first, so that they are being done in parallel with all SET
+                * packets. Sequence:
+                * - flush caches
+                * - start CE dumps (might already be ongoing if CE is ahead)
+                * - start prefetches
+                * - process SET packets
+                * - wait for CE dumps
+                * - draw
+                */
+               if (!si_cache_flush_and_prefetch(sctx))
+                       return;
+               si_emit_all_states(sctx, info, 0);
+       }
 
        si_ce_pre_draw_synchronization(sctx);
        si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
        si_ce_post_draw_synchronization(sctx);
 
        if (sctx->trace_buf)
                si_trace_emit(sctx);
 
        /* Workaround for a VGT hang when streamout is enabled.
         * It must be done after drawing. */
-- 
2.7.4

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to