From: Marek Olšák <marek.ol...@amd.com> 40% is the decrease in the LGKM counter (which includes SMEM too) for the GFX9 LSHS stage.
This will make the LDS size slightly larger, but I wasn't able to increase the patch stride without corruption, so I'm increasing the vertex stride. --- src/gallium/drivers/radeonsi/si_shader.c | 8 ++++---- src/gallium/drivers/radeonsi/si_shader.h | 3 ++- src/gallium/drivers/radeonsi/si_state_draw.c | 2 +- src/gallium/drivers/radeonsi/si_state_shaders.c | 7 +++++++ 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 5dc12d87243..43ba23ff494 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -410,28 +410,28 @@ static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx) return si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6); } static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx) { unsigned stride; switch (ctx->type) { case PIPE_SHADER_VERTEX: - stride = util_last_bit64(ctx->shader->selector->outputs_written); - return LLVMConstInt(ctx->i32, stride * 4, 0); + stride = ctx->shader->selector->lshs_vertex_stride / 4; + return LLVMConstInt(ctx->i32, stride, 0); case PIPE_SHADER_TESS_CTRL: if (ctx->screen->info.chip_class >= GFX9 && ctx->shader->is_monolithic) { - stride = util_last_bit64(ctx->shader->key.part.tcs.ls->outputs_written); - return LLVMConstInt(ctx->i32, stride * 4, 0); + stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4; + return LLVMConstInt(ctx->i32, stride, 0); } return si_unpack_param(ctx, ctx->param_vs_state_bits, 24, 8); default: assert(0); return NULL; } } static LLVMValueRef get_instance_index_for_fetch( diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index ffe13b761d9..3f3294eee37 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -361,21 +361,22 @@ struct si_shader_selector { /* PIPE_SHADER_[VERTEX|FRAGMENT|...] */ unsigned type; bool vs_needs_prolog; bool force_correct_derivs_after_kill; unsigned pa_cl_vs_out_cntl; ubyte clipdist_mask; ubyte culldist_mask; /* ES parameters. */ - unsigned esgs_itemsize; + unsigned esgs_itemsize; /* vertex stride */ + unsigned lshs_vertex_stride; /* GS parameters. */ unsigned gs_input_verts_per_prim; unsigned gs_output_prim; unsigned gs_max_out_vertices; unsigned gs_num_invocations; unsigned max_gs_stream; /* count - 1 */ unsigned gsvs_vertex_size; unsigned max_gsvs_emit_size; unsigned enabled_streamout_buffer_mask; diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index f35f73a37ce..d901401f0bb 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -127,21 +127,21 @@ static bool si_emit_derived_tess_state(struct si_context *sctx, num_tcs_outputs = util_last_bit64(tcs->outputs_written); num_tcs_output_cp = tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written); } else { /* No TCS. Route varyings from LS to TES. */ num_tcs_outputs = num_tcs_inputs; num_tcs_output_cp = num_tcs_input_cp; num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */ } - input_vertex_size = num_tcs_inputs * 16; + input_vertex_size = ls->lshs_vertex_stride; output_vertex_size = num_tcs_outputs * 16; input_patch_size = num_tcs_input_cp * input_vertex_size; pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size; output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; /* Ensure that we only need one wave per SIMD so we don't need to check * resource usage. Also ensures that the number of tcs in and out * vertices per threadgroup are at most 256. diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index ffc8821df09..32d804c3fc5 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -2153,20 +2153,27 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sel->outputs_written |= 1ull << si_shader_io_get_unique_index(name, index, false); sel->outputs_written_before_ps |= 1ull << si_shader_io_get_unique_index(name, index, true); break; case TGSI_SEMANTIC_EDGEFLAG: break; } } sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16; + sel->lshs_vertex_stride = sel->esgs_itemsize; + + /* Add 1 dword to reduce LDS bank conflicts, so that each vertex + * will start on a different bank. (except for the maximum 32*16). + */ + if (sel->lshs_vertex_stride < 32*16) + sel->lshs_vertex_stride += 4; /* For the ESGS ring in LDS, add 1 dword to reduce LDS bank * conflicts, i.e. each vertex will start at a different bank. */ if (sctx->chip_class >= GFX9) sel->esgs_itemsize += 4; assert(((sel->esgs_itemsize / 4) & C_028AAC_ITEMSIZE) == 0); break; -- 2.17.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev