On 02.01.2017 21:18, Marek Olšák wrote:
From: Marek Olšák <marek.ol...@amd.com>

---
 src/gallium/drivers/radeonsi/si_cp_dma.c     | 12 +++++++++
 src/gallium/drivers/radeonsi/si_pipe.h       |  2 ++
 src/gallium/drivers/radeonsi/si_state_draw.c | 37 +++++++++++++++++++++++++++-
 3 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c 
b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 653021e..13b901b 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -360,14 +360,26 @@ void si_copy_buffer(struct si_context *sctx,
                                         &is_first);

        if (tc_l2_flag)
                r600_resource(dst)->TC_L2_dirty = true;

        /* If it's not a prefetch... */
        if (dst_offset != src_offset)
                sctx->b.num_cp_dma_calls++;
 }

+void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource 
*buf,
+                             uint64_t offset, unsigned size)
+{
+       assert(sctx->b.chip_class >= CIK);
+
+       si_copy_buffer(sctx, buf, buf, offset, offset, size,
+                      SI_CPDMA_SKIP_CHECK_CS_SPACE |
+                      SI_CPDMA_SKIP_SYNC_AFTER |
+                      SI_CPDMA_SKIP_SYNC_BEFORE |
+                      SI_CPDMA_SKIP_GFX_SYNC);
+}
+
 void si_init_cp_dma_functions(struct si_context *sctx)
 {
        sctx->b.clear_buffer = si_clear_buffer;
 }
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index dc37c8d..c0a4636 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -374,20 +374,22 @@ void si_resource_copy_region(struct pipe_context *ctx,
 /* si_cp_dma.c */
 #define SI_CPDMA_SKIP_CHECK_CS_SPACE   (1 << 0) /* don't call need_cs_space */
 #define SI_CPDMA_SKIP_SYNC_AFTER       (1 << 1) /* don't wait for DMA after 
the copy */
 #define SI_CPDMA_SKIP_SYNC_BEFORE      (1 << 2) /* don't wait for DMA before 
the copy (RAW hazards) */
 #define SI_CPDMA_SKIP_GFX_SYNC         (1 << 3) /* don't flush caches and 
don't wait for PS/CS */

 void si_copy_buffer(struct si_context *sctx,
                    struct pipe_resource *dst, struct pipe_resource *src,
                    uint64_t dst_offset, uint64_t src_offset, unsigned size,
                    unsigned user_flags);
+void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource 
*buf,
+                             uint64_t offset, unsigned size);
 void si_init_cp_dma_functions(struct si_context *sctx);

 /* si_debug.c */
 void si_init_debug_functions(struct si_context *sctx);
 void si_check_vm_faults(struct r600_common_context *ctx,
                        struct radeon_saved_cs *saved, enum ring_type ring);
 bool si_replace_shader(unsigned num, struct radeon_shader_binary *binary);

 /* si_dma.c */
 void si_init_dma_functions(struct si_context *sctx);
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c 
b/src/gallium/drivers/radeonsi/si_state_draw.c
index b3f664e..7b75602 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -930,20 +930,31 @@ void si_ce_pre_draw_synchronization(struct si_context 
*sctx)
 void si_ce_post_draw_synchronization(struct si_context *sctx)
 {
        if (sctx->ce_need_synchronization) {
                radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_INCREMENT_DE_COUNTER, 0, 
0));
                radeon_emit(sctx->b.gfx.cs, 0);

                sctx->ce_need_synchronization = false;
        }
 }

+static void cik_prefetch_shader_async(struct si_context *sctx,
+                                     struct si_pm4_state *state)
+{
+       if (state) {
+               struct pipe_resource *bo = &state->bo[0]->b.b;
+               assert(state->nbo == 1);
+
+               cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0);
+       }
+}
+
 void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
        struct si_context *sctx = (struct si_context *)ctx;
        struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
        struct pipe_index_buffer ib = {};
        unsigned mask, dirty_fb_counter, dirty_tex_counter, rast_prim;

        if (likely(!info->indirect)) {
                /* SI-CI treat instance_count==0 as instance_count==1. There is
                 * no workaround for indirect draws, but we can at least skip
@@ -1107,24 +1118,48 @@ void si_draw_vbo(struct pipe_context *ctx, const struct 
pipe_draw_info *info)

        si_need_cs_space(sctx);

        /* Since we've called r600_context_add_resource_size for vertex buffers,
         * this must be called after si_need_cs_space, because we must let
         * need_cs_space flush before we add buffers to the buffer list.
         */
        if (!si_upload_vertex_buffer_descriptors(sctx))
                return;

-       /* Flushed caches prior to emitting states. */
+       /* Flushed caches prior to prefetching shaders. */
        if (sctx->b.flags)
                si_emit_cache_flush(sctx);

+       /* Prefetch shaders and VBO descriptors to TC L2. */
+       if (sctx->b.chip_class >= CIK) {
+               if (si_pm4_state_changed(sctx, ls))
+                       cik_prefetch_shader_async(sctx, sctx->queued.named.ls);
+               if (si_pm4_state_changed(sctx, hs))
+                       cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
+               if (si_pm4_state_changed(sctx, es))
+                       cik_prefetch_shader_async(sctx, sctx->queued.named.es);
+               if (si_pm4_state_changed(sctx, gs))
+                       cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
+               if (si_pm4_state_changed(sctx, vs))
+                       cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
+
+               /* Vertex buffer descriptors are uploaded uncached, so prefetch
+                * them right after the VS binary. */
+               if (sctx->vertex_buffers.pointer_dirty) {
+                       cik_prefetch_TC_L2_async(sctx, 
&sctx->vertex_buffers.buffer->b.b,
+                                               
sctx->vertex_buffers.buffer_offset,
+                                               sctx->vertex_elements->count * 
16);
+               }

Logically, the vertex buffer descriptor prefetch should come directly after the API vertex shader, right? So you're basically putting it in a sub-optimal place for tessellation/geometry pipelines in order to simplify the code? Okay, that may be a reasonable trade-off given how rare those pipelines are.

For the series:

Reviewed-by: Nicolai Hähnle <nicolai.haeh...@amd.com>

+               if (si_pm4_state_changed(sctx, ps))
+                       cik_prefetch_shader_async(sctx, sctx->queued.named.ps);
+       }
+
        /* Emit states. */
        mask = sctx->dirty_atoms;
        while (mask) {
                struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];

                atom->emit(&sctx->b, atom);
        }
        sctx->dirty_atoms = 0;

        si_pm4_emit_dirty(sctx);

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to