Re: [Mesa-dev] [PATCH 08/10] radeonsi: only upload (dump to L2) those descriptors that are used by shaders

Nicolai Hähnle Thu, 18 May 2017 04:29:13 -0700

On 18.05.2017 12:43, Marek Olšák wrote:

On Thu, May 18, 2017 at 12:41 PM, Marek Olšák <mar...@gmail.com> wrote:

On Thu, May 18, 2017 at 11:31 AM, Nicolai Hähnle <nhaeh...@gmail.com> wrote:

On 17.05.2017 21:38, Marek Olšák wrote:


From: Marek Olšák <marek.ol...@amd.com>

This decreases the size of CE RAM dumps to L2, or the size of descriptor
uploads without CE.
---
 src/gallium/drivers/radeonsi/si_compute.c       | 28 ++++++--
 src/gallium/drivers/radeonsi/si_descriptors.c   | 85
++++++++++++++++++++-----
 src/gallium/drivers/radeonsi/si_state.h         | 18 +++++-
 src/gallium/drivers/radeonsi/si_state_shaders.c |  6 ++
 4 files changed, 113 insertions(+), 24 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c
b/src/gallium/drivers/radeonsi/si_compute.c
index 22ef111..4c98066 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -201,21 +201,38 @@ static void *si_create_compute_state(
                        return NULL;
                }
        }

        return program;
 }

 static void si_bind_compute_state(struct pipe_context *ctx, void *state)
 {
        struct si_context *sctx = (struct si_context*)ctx;
-       sctx->cs_shader_state.program = (struct si_compute*)state;
+       struct si_compute *program = (struct si_compute*)state;
+
+       sctx->cs_shader_state.program = program;
+       if (!program)
+               return;
+
+       /* Wait because we need active slot usage masks. */
+       if (program->ir_type == PIPE_SHADER_IR_TGSI)
+               util_queue_fence_wait(&program->ready);
+
+       si_set_active_descriptors(sctx,
+                                 SI_DESCS_FIRST_COMPUTE +
+
SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
+
program->active_const_and_shader_buffers);
+       si_set_active_descriptors(sctx,
+                                 SI_DESCS_FIRST_COMPUTE +
+                                 SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
+                                 program->active_samplers_and_images);
 }

 static void si_set_global_binding(
        struct pipe_context *ctx, unsigned first, unsigned n,
        struct pipe_resource **resources,
        uint32_t **handles)
 {
        unsigned i;
        struct si_context *sctx = (struct si_context*)ctx;
        struct si_compute *program = sctx->cs_shader_state.program;
@@ -749,26 +766,23 @@ static void si_launch_grid(
        bool cs_regalloc_hang =
                (sctx->b.chip_class == SI ||
                 sctx->b.family == CHIP_BONAIRE ||
                 sctx->b.family == CHIP_KABINI) &&
                info->block[0] * info->block[1] * info->block[2] > 256;

        if (cs_regalloc_hang)
                sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
                                 SI_CONTEXT_CS_PARTIAL_FLUSH;

-       if (program->ir_type == PIPE_SHADER_IR_TGSI) {
-               util_queue_fence_wait(&program->ready);
-
-               if (program->shader.compilation_failed)
-                       return;
-       }
+       if (program->ir_type == PIPE_SHADER_IR_TGSI &&
+           program->shader.compilation_failed)
+               return;

        si_decompress_compute_textures(sctx);

        /* Add buffer sizes for memory checking in need_cs_space. */
        r600_context_add_resource_size(ctx, &program->shader.bo->b.b);
        /* TODO: add the scratch buffer */

        if (info->indirect) {
                r600_context_add_resource_size(ctx, info->indirect);

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c
b/src/gallium/drivers/radeonsi/si_descriptors.c
index 38e4ae1..a2f40a8 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -118,26 +118,28 @@ static void si_init_descriptors(struct
si_descriptors *desc,
        }
 }

 static void si_release_descriptors(struct si_descriptors *desc)
 {
        r600_resource_reference(&desc->buffer, NULL);
        FREE(desc->list);
 }

 static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset,
unsigned size,
-                        unsigned *out_offset, struct r600_resource
**out_buf) {
+                        unsigned *out_offset, struct r600_resource
**out_buf)
+{
        uint64_t va;

        u_suballocator_alloc(sctx->ce_suballocator, size,
-                            sctx->screen->b.info.tcc_cache_line_size,
-                            out_offset, (struct pipe_resource**)out_buf);
+                            si_optimal_tcc_alignment(sctx, size),
+                            (unsigned*)out_offset,



The extra cast of out_offset is unnecessary.

+                            (struct pipe_resource**)out_buf);
        if (!out_buf)
                        return false;

        va = (*out_buf)->gpu_address + *out_offset;

        radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
        radeon_emit(sctx->ce_ib, ce_offset);
        radeon_emit(sctx->ce_ib, size / 4);
        radeon_emit(sctx->ce_ib, va);
        radeon_emit(sctx->ce_ib, va >> 32);
@@ -186,58 +188,70 @@ void si_ce_enable_loads(struct radeon_winsys_cs *ib)
        radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
        radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) |
                        CONTEXT_CONTROL_LOAD_CE_RAM(1));
        radeon_emit(ib, CONTEXT_CONTROL_SHADOW_ENABLE(1));
 }

 static bool si_upload_descriptors(struct si_context *sctx,
                                  struct si_descriptors *desc,
                                  struct r600_atom * atom)
 {
-       unsigned list_size = desc->num_elements * desc->element_dw_size *
4;
+       unsigned slot_size = desc->element_dw_size * 4;
+       unsigned first_slot_offset = desc->first_active_slot * slot_size;
+       unsigned upload_size = desc->num_active_slots * slot_size;
+
+       if (!upload_size)
+               return true;



The early-out here means that desc->num_active_slots *does* control what is
written to CE RAM, contrary to what its descriptive comment says. It needs
to be moved further down.


True, but I think it doesn't matter, because dirty_mask stays dirty
and the descriptors will be uploaded when there is a shader using
them.


Is this enough?


Yes, makes sense to me.

Reviewed-by: Nicolai Hähnle <nicolai.haeh...@amd.com>


diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c
b/src/gallium/drivers/radeonsi/si_descriptors.c
index 58d2723..89588c3 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -136,7 +136,7 @@ static bool si_ce_upload(struct si_context *sctx,
unsigned ce_offset, unsigned s

        u_suballocator_alloc(sctx->ce_suballocator, size,
                             si_optimal_tcc_alignment(sctx, size),
-                            (unsigned*)out_offset,
+                            out_offset,
                             (struct pipe_resource**)out_buf);
        if (!out_buf)
                        return false;
@@ -204,6 +204,10 @@ static bool si_upload_descriptors(struct si_context *sctx,
        unsigned first_slot_offset = desc->first_active_slot * slot_size;
        unsigned upload_size = desc->num_active_slots * slot_size;

+       /* Skip the upload if no shader is using the descriptors. dirty_mask
+        * will stay dirty and the descriptors will be uploaded when there is
+        * a shader using them.
+        */
        if (!upload_size)
                return true;

Marek



--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 08/10] radeonsi: only upload (dump to L2) those descriptors that are used by shaders

Reply via email to