From: Nicolai Hähnle <nicolai.haeh...@amd.com> I am not sure whether it is possible to avoid programming the block size twice (once as user data SGPRs for the shader and once for the dispatch).
Since the shaders are compiled with a pessimistic upper limit on the number of registers, asynchronously compiling variants may be worth considering in the future if we observe the shaders to be dispatched with small block sizes. --- I think this is sufficient to support variable group sizes on radeonsi, but it's completely untested. Do you keep the latest version of your series in a public repository somewhere? src/gallium/drivers/radeonsi/si_compute.c | 10 +++++++++- src/gallium/drivers/radeonsi/si_shader.c | 29 ++++++++++++++++++++--------- src/gallium/drivers/radeonsi/si_shader.h | 4 +++- 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 5041761..26e096c 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -379,25 +379,33 @@ static void si_setup_tgsi_grid(struct si_context *sctx, for (i = 0; i < 3; ++i) { radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG)); radeon_emit(cs, (va + 4 * i)); radeon_emit(cs, (va + 4 * i) >> 32); radeon_emit(cs, (grid_size_reg >> 2) + i); radeon_emit(cs, 0); } } else { + struct si_compute *program = sctx->cs_shader_state.program; + bool variable_group_size = + program->shader.selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0; - radeon_set_sh_reg_seq(cs, grid_size_reg, 3); + radeon_set_sh_reg_seq(cs, grid_size_reg, variable_group_size ? 
6 : 3); radeon_emit(cs, info->grid[0]); radeon_emit(cs, info->grid[1]); radeon_emit(cs, info->grid[2]); + if (variable_group_size) { + radeon_emit(cs, info->block[0]); + radeon_emit(cs, info->block[1]); + radeon_emit(cs, info->block[2]); + } } } static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_grid_info *info) { struct radeon_winsys_cs *cs = sctx->b.gfx.cs; bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off; unsigned waves_per_threadgroup = DIV_ROUND_UP(info->block[0] * info->block[1] * info->block[2], 64); diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 0b7de18..730ee21 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1783,30 +1783,35 @@ static void declare_system_value( case TGSI_SEMANTIC_GRID_SIZE: value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE); break; case TGSI_SEMANTIC_BLOCK_SIZE: { LLVMValueRef values[3]; unsigned i; unsigned *properties = ctx->shader->selector->info.properties; - unsigned sizes[3] = { - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] - }; - for (i = 0; i < 3; ++i) - values[i] = lp_build_const_int32(gallivm, sizes[i]); + if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) { + unsigned sizes[3] = { + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] + }; + + for (i = 0; i < 3; ++i) + values[i] = lp_build_const_int32(gallivm, sizes[i]); - value = lp_build_gather_values(gallivm, values, 3); + value = lp_build_gather_values(gallivm, values, 3); + } else { + value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_SIZE); + } break; } case TGSI_SEMANTIC_BLOCK_ID: value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID); break; case TGSI_SEMANTIC_THREAD_ID: value = 
LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID); break; @@ -5705,20 +5710,21 @@ static void create_function(struct si_shader_context *ctx) for (i = 0; i < num_return_sgprs; i++) returns[i] = ctx->i32; for (; i < num_returns; i++) returns[i] = ctx->f32; } break; case PIPE_SHADER_COMPUTE: params[SI_PARAM_GRID_SIZE] = v3i32; + params[SI_PARAM_BLOCK_SIZE] = v3i32; params[SI_PARAM_BLOCK_ID] = v3i32; last_sgpr = SI_PARAM_BLOCK_ID; params[SI_PARAM_THREAD_ID] = v3i32; num_params = SI_PARAM_THREAD_ID + 1; break; default: assert(0 && "unimplemented shader"); return; } @@ -5741,21 +5747,26 @@ static void create_function(struct si_shader_context *ctx) S_0286D0_LINEAR_CENTROID_ENA(1) | S_0286D0_FRONT_FACE_ENA(1) | S_0286D0_POS_FIXED_PT_ENA(1)); } else if (ctx->type == PIPE_SHADER_COMPUTE) { const unsigned *properties = shader->selector->info.properties; unsigned max_work_group_size = properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; - assert(max_work_group_size); + if (!max_work_group_size) { + /* This is a variable group size compute shader, + * compile it for the maximum possible group size. 
+ */ + max_work_group_size = 2048; + } radeon_llvm_add_attribute(ctx->radeon_bld.main_fn, "amdgpu-max-work-group-size", max_work_group_size); } shader->info.num_input_sgprs = 0; shader->info.num_input_vgprs = 0; for (i = 0; i <= last_sgpr; ++i) diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index fc1b22d..afdb3f5 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -120,21 +120,22 @@ enum { /* GS limits */ SI_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS, SI_GSCOPY_NUM_USER_SGPR = SI_SGPR_RW_BUFFERS_HI + 1, /* PS only */ SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS, SI_PS_NUM_USER_SGPR, /* CS only */ SI_SGPR_GRID_SIZE = SI_NUM_RESOURCE_SGPRS, - SI_CS_NUM_USER_SGPR = SI_SGPR_GRID_SIZE + 3 + SI_SGPR_BLOCK_SIZE = SI_SGPR_GRID_SIZE + 3, + SI_CS_NUM_USER_SGPR = SI_SGPR_BLOCK_SIZE + 3 }; /* LLVM function parameter indices */ enum { SI_PARAM_RW_BUFFERS, SI_PARAM_CONST_BUFFERS, SI_PARAM_SAMPLERS, SI_PARAM_IMAGES, SI_PARAM_SHADER_BUFFERS, SI_NUM_RESOURCE_PARAMS, @@ -210,20 +211,21 @@ enum { SI_PARAM_POS_Y_FLOAT, SI_PARAM_POS_Z_FLOAT, SI_PARAM_POS_W_FLOAT, SI_PARAM_FRONT_FACE, SI_PARAM_ANCILLARY, SI_PARAM_SAMPLE_COVERAGE, SI_PARAM_POS_FIXED_PT, /* CS only parameters */ SI_PARAM_GRID_SIZE = SI_NUM_RESOURCE_PARAMS, + SI_PARAM_BLOCK_SIZE, SI_PARAM_BLOCK_ID, SI_PARAM_THREAD_ID, SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */ }; /* SI-specific system values. */ enum { TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI = TGSI_SEMANTIC_COUNT, TGSI_SEMANTIC_DEFAULT_TESSINNER_SI, -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev