From: Marek Olšák <marek.ol...@amd.com>

This reduces the LGKM counter (which also includes SMEM) by 40%
for the GFX9 LSHS stage.

This makes the LDS size slightly larger. I wasn't able to increase the
patch stride without corruption, so I'm increasing the vertex stride by
1 dword instead.
---
 src/gallium/drivers/radeonsi/si_shader.c        | 8 ++++----
 src/gallium/drivers/radeonsi/si_shader.h        | 3 ++-
 src/gallium/drivers/radeonsi/si_state_draw.c    | 2 +-
 src/gallium/drivers/radeonsi/si_state_shaders.c | 7 +++++++
 4 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 5dc12d87243..43ba23ff494 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -410,28 +410,28 @@ static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
 
        return si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
 }
 
 static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
 {
        unsigned stride;
 
        switch (ctx->type) {
        case PIPE_SHADER_VERTEX:
-               stride = 
util_last_bit64(ctx->shader->selector->outputs_written);
-               return LLVMConstInt(ctx->i32, stride * 4, 0);
+               stride = ctx->shader->selector->lshs_vertex_stride / 4;
+               return LLVMConstInt(ctx->i32, stride, 0);
 
        case PIPE_SHADER_TESS_CTRL:
                if (ctx->screen->info.chip_class >= GFX9 &&
                    ctx->shader->is_monolithic) {
-                       stride = 
util_last_bit64(ctx->shader->key.part.tcs.ls->outputs_written);
-                       return LLVMConstInt(ctx->i32, stride * 4, 0);
+                       stride = 
ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4;
+                       return LLVMConstInt(ctx->i32, stride, 0);
                }
                return si_unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
 
        default:
                assert(0);
                return NULL;
        }
 }
 
 static LLVMValueRef get_instance_index_for_fetch(
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index ffe13b761d9..3f3294eee37 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -361,21 +361,22 @@ struct si_shader_selector {
 
        /* PIPE_SHADER_[VERTEX|FRAGMENT|...] */
        unsigned        type;
        bool            vs_needs_prolog;
        bool            force_correct_derivs_after_kill;
        unsigned        pa_cl_vs_out_cntl;
        ubyte           clipdist_mask;
        ubyte           culldist_mask;
 
        /* ES parameters. */
-       unsigned        esgs_itemsize;
+       unsigned        esgs_itemsize; /* vertex stride */
+       unsigned        lshs_vertex_stride;
 
        /* GS parameters. */
        unsigned        gs_input_verts_per_prim;
        unsigned        gs_output_prim;
        unsigned        gs_max_out_vertices;
        unsigned        gs_num_invocations;
        unsigned        max_gs_stream; /* count - 1 */
        unsigned        gsvs_vertex_size;
        unsigned        max_gsvs_emit_size;
        unsigned        enabled_streamout_buffer_mask;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index f35f73a37ce..d901401f0bb 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -127,21 +127,21 @@ static bool si_emit_derived_tess_state(struct si_context *sctx,
                num_tcs_outputs = util_last_bit64(tcs->outputs_written);
                num_tcs_output_cp = 
tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
                num_tcs_patch_outputs = 
util_last_bit64(tcs->patch_outputs_written);
        } else {
                /* No TCS. Route varyings from LS to TES. */
                num_tcs_outputs = num_tcs_inputs;
                num_tcs_output_cp = num_tcs_input_cp;
                num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */
        }
 
-       input_vertex_size = num_tcs_inputs * 16;
+       input_vertex_size = ls->lshs_vertex_stride;
        output_vertex_size = num_tcs_outputs * 16;
 
        input_patch_size = num_tcs_input_cp * input_vertex_size;
 
        pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
        output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs 
* 16;
 
        /* Ensure that we only need one wave per SIMD so we don't need to check
         * resource usage. Also ensures that the number of tcs in and out
         * vertices per threadgroup are at most 256.
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index ffc8821df09..32d804c3fc5 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -2153,20 +2153,27 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
                                sel->outputs_written |=
                                        1ull << 
si_shader_io_get_unique_index(name, index, false);
                                sel->outputs_written_before_ps |=
                                        1ull << 
si_shader_io_get_unique_index(name, index, true);
                                break;
                        case TGSI_SEMANTIC_EDGEFLAG:
                                break;
                        }
                }
                sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16;
+               sel->lshs_vertex_stride = sel->esgs_itemsize;
+
+               /* Add 1 dword to reduce LDS bank conflicts, so that each vertex
+                * will start on a different bank. (except for the maximum 32*16).
+                */
+               if (sel->lshs_vertex_stride < 32*16)
+                       sel->lshs_vertex_stride += 4;
 
                /* For the ESGS ring in LDS, add 1 dword to reduce LDS bank
                 * conflicts, i.e. each vertex will start at a different bank.
                 */
                if (sctx->chip_class >= GFX9)
                        sel->esgs_itemsize += 4;
 
                assert(((sel->esgs_itemsize / 4) & C_028AAC_ITEMSIZE) == 0);
                break;
 
-- 
2.17.1

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to