Re: [Mesa-dev] [PATCH 06/11] radeonsi: add a separate dirty mask for prefetches

Nicolai Hähnle Sun, 06 Aug 2017 23:48:08 -0700

On 07.08.2017 00:20, Marek Olšák wrote:

From: Marek Olšák <marek.ol...@amd.com>


so that we don't rely on si_pm4_state_enabled_and_changed, allowing us
to move prefetches after draw calls.
---
  src/gallium/drivers/radeonsi/si_cp_dma.c        | 16 ++++++++--------
  src/gallium/drivers/radeonsi/si_descriptors.c   |  3 +--
  src/gallium/drivers/radeonsi/si_hw_context.c    | 16 ++++++++++++++--
  src/gallium/drivers/radeonsi/si_pipe.h          | 10 +++++++++-
  src/gallium/drivers/radeonsi/si_state_draw.c    |  2 +-
  src/gallium/drivers/radeonsi/si_state_shaders.c | 16 ++++++++++++++--
  6 files changed, 47 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c 
b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 24fa6fd..21202b3 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -444,38 +444,38 @@ static void cik_prefetch_shader_async(struct si_context 
*sctx,
  {
        struct pipe_resource *bo = &state->bo[0]->b.b;
        assert(state->nbo == 1);

cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0);

void cik_emit_prefetch_L2(struct si_context *sctx)

  {
        /* Prefetch shaders and VBO descriptors to TC L2. */
-       if (si_pm4_state_enabled_and_changed(sctx, ls))
+       if (sctx->prefetch_L2_mask & SI_PREFETCH_LS)
                cik_prefetch_shader_async(sctx, sctx->queued.named.ls);
-       if (si_pm4_state_enabled_and_changed(sctx, hs))
+       if (sctx->prefetch_L2_mask & SI_PREFETCH_HS)
                cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
-       if (si_pm4_state_enabled_and_changed(sctx, es))
+       if (sctx->prefetch_L2_mask & SI_PREFETCH_ES)
                cik_prefetch_shader_async(sctx, sctx->queued.named.es);
-       if (si_pm4_state_enabled_and_changed(sctx, gs))
+       if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)
                cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
-       if (si_pm4_state_enabled_and_changed(sctx, vs))
+       if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
                cik_prefetch_shader_async(sctx, sctx->queued.named.vs);

/* Vertex buffer descriptors are uploaded uncached, so prefetch

         * them right after the VS binary. */
-       if (sctx->vertex_buffer_pointer_dirty) {
+       if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) {
                cik_prefetch_TC_L2_async(sctx, 
&sctx->vertex_buffers.buffer->b.b,
                                         sctx->vertex_buffers.buffer_offset,
                                         
sctx->vertex_elements->desc_list_byte_size);
        }
-       if (si_pm4_state_enabled_and_changed(sctx, ps))
+       if (sctx->prefetch_L2_mask & SI_PREFETCH_PS)
                cik_prefetch_shader_async(sctx, sctx->queued.named.ps);

- sctx->prefetch_L2 = false;

+       sctx->prefetch_L2_mask = 0;
  }

void si_init_cp_dma_functions(struct si_context *sctx)

  {
        sctx->b.clear_buffer = si_clear_buffer;
  }
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index 917b0e1..43f1792 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -1169,24 +1169,23 @@ bool si_upload_vertex_buffer_descriptors(struct 
si_context *sctx)
                                              (struct 
r600_resource*)vb->buffer.resource,
                                              RADEON_USAGE_READ, 
RADEON_PRIO_VERTEX_BUFFER);
                }
        }

/* Don't flush the const cache. It would have a very negative effect

         * on performance (confirmed by testing). New descriptors are always
         * uploaded to a fresh new buffer, so I don't think flushing the const
         * cache is needed. */
        si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
-       if (sctx->b.chip_class >= CIK)
-               sctx->prefetch_L2 = true;
        sctx->vertex_buffers_dirty = false;
        sctx->vertex_buffer_pointer_dirty = true;
+       sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
        return true;
  }

/* CONSTANT BUFFERS */static unsigned

  si_const_and_shader_buffer_descriptors_idx(unsigned shader)
  {
        return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c 
b/src/gallium/drivers/radeonsi/si_hw_context.c
index 756b159..3582cd7 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -209,22 +209,34 @@ void si_begin_new_cs(struct si_context *ctx)
                si_pm4_emit(ctx, ctx->init_config_gs_rings);

if (ctx->ce_preamble_ib)

                si_ce_enable_loads(ctx->ce_preamble_ib);
        else if (ctx->ce_ib)
                si_ce_enable_loads(ctx->ce_ib);

if (ctx->ce_ib)

                si_ce_restore_all_descriptors_at_ib_start(ctx);

- if (ctx->b.chip_class >= CIK)

-               ctx->prefetch_L2 = true;
+       if (ctx->queued.named.ls)
+               ctx->prefetch_L2_mask |= SI_PREFETCH_LS;
+       if (ctx->queued.named.hs)
+               ctx->prefetch_L2_mask |= SI_PREFETCH_HS;
+       if (ctx->queued.named.es)
+               ctx->prefetch_L2_mask |= SI_PREFETCH_ES;
+       if (ctx->queued.named.gs)
+               ctx->prefetch_L2_mask |= SI_PREFETCH_GS;
+       if (ctx->queued.named.vs)
+               ctx->prefetch_L2_mask |= SI_PREFETCH_VS;
+       if (ctx->queued.named.ps)
+               ctx->prefetch_L2_mask |= SI_PREFETCH_PS;
+       if (ctx->vertex_buffers.buffer)
+               ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;

/* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */

        ctx->framebuffer.dirty_cbufs =
                u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs);
        /* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */
        ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL;
        /* This should always be marked as dirty to set the framebuffer scissor
         * at least. */
        si_mark_atom_dirty(ctx, &ctx->framebuffer.atom);

diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h

index d213886..62b64e1 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -61,20 +61,28 @@
  /* Framebuffer caches. */
  #define SI_CONTEXT_FLUSH_AND_INV_DB   (R600_CONTEXT_PRIVATE_FLAG << 7)
  #define SI_CONTEXT_FLUSH_AND_INV_CB   (R600_CONTEXT_PRIVATE_FLAG << 8)
  /* Engine synchronization. */
  #define SI_CONTEXT_VS_PARTIAL_FLUSH   (R600_CONTEXT_PRIVATE_FLAG << 9)
  #define SI_CONTEXT_PS_PARTIAL_FLUSH   (R600_CONTEXT_PRIVATE_FLAG << 10)
  #define SI_CONTEXT_CS_PARTIAL_FLUSH   (R600_CONTEXT_PRIVATE_FLAG << 11)
  #define SI_CONTEXT_VGT_FLUSH          (R600_CONTEXT_PRIVATE_FLAG << 12)
  #define SI_CONTEXT_VGT_STREAMOUT_SYNC (R600_CONTEXT_PRIVATE_FLAG << 13)

+#define SI_PREFETCH_VBO_DESCRIPTORS (1 << 0)

+#define SI_PREFETCH_LS                 (1 << 1)
+#define SI_PREFETCH_HS                 (1 << 2)
+#define SI_PREFETCH_ES                 (1 << 3)
+#define SI_PREFETCH_GS                 (1 << 4)
+#define SI_PREFETCH_VS                 (1 << 5)
+#define SI_PREFETCH_PS                 (1 << 6)
+
  #define SI_MAX_BORDER_COLORS  4096
  #define SIX_BITS              0x3F

struct si_compute;

  struct hash_table;
  struct u_suballocator;

struct si_screen {

        struct r600_common_screen       b;
        unsigned                        gs_table_depth;
@@ -272,25 +280,25 @@ struct si_context {
        struct si_shader_ctx_state      fixed_func_tcs_shader;
        struct r600_resource            *wait_mem_scratch;
        unsigned                        wait_mem_number;

struct radeon_winsys_cs *ce_ib;

        struct radeon_winsys_cs         *ce_preamble_ib;
        struct r600_resource            *ce_ram_saved_buffer;
        struct u_suballocator           *ce_suballocator;
        unsigned                        ce_ram_saved_offset;
        uint16_t                        total_ce_ram_allocated;
+       uint16_t                        prefetch_L2_mask;
        bool                            ce_need_synchronization:1;

bool gfx_flush_in_progress:1;

        bool                            compute_is_busy:1;
-       bool                            prefetch_L2:1;

/* Atoms (direct states). */

        union si_state_atoms            atoms;
        unsigned                        dirty_atoms; /* mask */
        /* PM4 states (precomputed immutable states) */
        unsigned                        dirty_states;
        union si_state                  queued;
        union si_state                  emitted;

/* Atom declarations. */

diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c 
b/src/gallium/drivers/radeonsi/si_state_draw.c
index 3f933fe..c78450c 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -1339,21 +1339,21 @@ void si_draw_vbo(struct pipe_context *ctx, const struct 
pipe_draw_info *info)
        /* GFX9 scissor bug workaround. There is also a more efficient but
         * more involved alternative workaround. */
        if (sctx->b.chip_class == GFX9 &&
            si_is_atom_dirty(sctx, &sctx->b.scissors.atom))
                sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;

/* Flush caches before the first state atom, which does L2 prefetches. */

        if (sctx->b.flags)
                si_emit_cache_flush(sctx);

- if (sctx->prefetch_L2)

+       if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)

Maybe guard all the places where the mask is set by the chip_classinstead, to be consistent and save the check here.

I also wonder if it might not be better to keep prefetching as an atom,and check the mask when there's anything to be prefetched. But in termsof code, both are fine, so whether you want to take a look at thenumbers or not, patches 1-7 are


Reviewed-by: Nicolai Hähnle <nicolai.haeh...@amd.com>

                cik_emit_prefetch_L2(sctx);

/* Emit state atoms. */

        mask = sctx->dirty_atoms;
        while (mask) {
                struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];

atom->emit(&sctx->b, atom);

        }
        sctx->dirty_atoms = 0;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c 
b/src/gallium/drivers/radeonsi/si_state_shaders.c
index cb5a23e..5b8f907 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -3300,22 +3300,34 @@ bool si_update_shaders(struct si_context *sctx)
        if (si_pm4_state_enabled_and_changed(sctx, ls) ||
            si_pm4_state_enabled_and_changed(sctx, hs) ||
            si_pm4_state_enabled_and_changed(sctx, es) ||
            si_pm4_state_enabled_and_changed(sctx, gs) ||
            si_pm4_state_enabled_and_changed(sctx, vs) ||
            si_pm4_state_enabled_and_changed(sctx, ps)) {
                if (!si_update_spi_tmpring_size(sctx))
                        return false;
        }

- if (sctx->b.chip_class >= CIK)

-               sctx->prefetch_L2 = true;
+       if (sctx->b.chip_class >= CIK) {
+               if (si_pm4_state_enabled_and_changed(sctx, ls))
+                       sctx->prefetch_L2_mask |= SI_PREFETCH_LS;
+               if (si_pm4_state_enabled_and_changed(sctx, hs))
+                       sctx->prefetch_L2_mask |= SI_PREFETCH_HS;
+               if (si_pm4_state_enabled_and_changed(sctx, es))
+                       sctx->prefetch_L2_mask |= SI_PREFETCH_ES;
+               if (si_pm4_state_enabled_and_changed(sctx, gs))
+                       sctx->prefetch_L2_mask |= SI_PREFETCH_GS;
+               if (si_pm4_state_enabled_and_changed(sctx, vs))
+                       sctx->prefetch_L2_mask |= SI_PREFETCH_VS;
+               if (si_pm4_state_enabled_and_changed(sctx, ps))
+                       sctx->prefetch_L2_mask |= SI_PREFETCH_PS;
+       }

sctx->do_update_shaders = false;

        return true;
  }

static void si_emit_scratch_state(struct si_context *sctx,

                                  struct r600_atom *atom)
  {
        struct radeon_winsys_cs *cs = sctx->b.gfx.cs;



--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 06/11] radeonsi: add a separate dirty mask for prefetches

Reply via email to