From: Nicolai Hähnle <nicolai.haeh...@amd.com>
Not sure if it's possible to avoid programming the block size twice
(once for the userdata and once for the dispatch).
Since the shaders are compiled with a pessimistic upper limit on the
number of registers, asynchronously compiling variants may be worth
considering in the future if we observe the shaders to be dispatched
with small block sizes.
---
I think this is sufficient to support variable group sizes on radeonsi,
but it's completely untested. Do you keep the latest version of your
series in a public repository somewhere?
src/gallium/drivers/radeonsi/si_compute.c | 10 +++++++++-
src/gallium/drivers/radeonsi/si_shader.c | 29 ++++++++++++++++++++---------
src/gallium/drivers/radeonsi/si_shader.h | 4 +++-
3 files changed, 32 insertions(+), 11 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_compute.c
b/src/gallium/drivers/radeonsi/si_compute.c
index 5041761..26e096c 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -379,25 +379,33 @@ static void si_setup_tgsi_grid(struct
si_context *sctx,
for (i = 0; i < 3; ++i) {
radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
radeon_emit(cs,
COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
COPY_DATA_DST_SEL(COPY_DATA_REG));
radeon_emit(cs, (va + 4 * i));
radeon_emit(cs, (va + 4 * i) >> 32);
radeon_emit(cs, (grid_size_reg >> 2) + i);
radeon_emit(cs, 0);
}
} else {
+ struct si_compute *program =
sctx->cs_shader_state.program;
+ bool variable_group_size =
+
program->shader.selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH]
== 0;
- radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
+ radeon_set_sh_reg_seq(cs, grid_size_reg,
variable_group_size ? 6 : 3);
radeon_emit(cs, info->grid[0]);
radeon_emit(cs, info->grid[1]);
radeon_emit(cs, info->grid[2]);
+ if (variable_group_size) {
+ radeon_emit(cs, info->block[0]);
+ radeon_emit(cs, info->block[1]);
+ radeon_emit(cs, info->block[2]);
+ }
}
}
static void si_emit_dispatch_packets(struct si_context *sctx,
const struct pipe_grid_info
*info)
{
struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
bool render_cond_bit = sctx->b.render_cond &&
!sctx->b.render_cond_force_off;
unsigned waves_per_threadgroup =
DIV_ROUND_UP(info->block[0] * info->block[1] *
info->block[2], 64);
diff --git a/src/gallium/drivers/radeonsi/si_shader.c
b/src/gallium/drivers/radeonsi/si_shader.c
index 0b7de18..730ee21 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1783,30 +1783,35 @@ static void declare_system_value(
case TGSI_SEMANTIC_GRID_SIZE:
value = LLVMGetParam(radeon_bld->main_fn,
SI_PARAM_GRID_SIZE);
break;
case TGSI_SEMANTIC_BLOCK_SIZE:
{
LLVMValueRef values[3];
unsigned i;
unsigned *properties =
ctx->shader->selector->info.properties;
- unsigned sizes[3] = {
- properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
-
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
- properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
- };
- for (i = 0; i < 3; ++i)
- values[i] = lp_build_const_int32(gallivm,
sizes[i]);
+ if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH]
!= 0) {
+ unsigned sizes[3] = {
+
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
+
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
+
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
+ };
+
+ for (i = 0; i < 3; ++i)
+ values[i] =
lp_build_const_int32(gallivm, sizes[i]);
- value = lp_build_gather_values(gallivm, values, 3);
+ value = lp_build_gather_values(gallivm,
values, 3);
+ } else {
+ value = LLVMGetParam(radeon_bld->main_fn,
SI_PARAM_BLOCK_SIZE);
+ }
break;
}
case TGSI_SEMANTIC_BLOCK_ID:
value = LLVMGetParam(radeon_bld->main_fn,
SI_PARAM_BLOCK_ID);
break;
case TGSI_SEMANTIC_THREAD_ID:
value = LLVMGetParam(radeon_bld->main_fn,
SI_PARAM_THREAD_ID);
break;
@@ -5705,20 +5710,21 @@ static void create_function(struct
si_shader_context *ctx)
for (i = 0; i < num_return_sgprs; i++)
returns[i] = ctx->i32;
for (; i < num_returns; i++)
returns[i] = ctx->f32;
}
break;
case PIPE_SHADER_COMPUTE:
params[SI_PARAM_GRID_SIZE] = v3i32;
+ params[SI_PARAM_BLOCK_SIZE] = v3i32;
params[SI_PARAM_BLOCK_ID] = v3i32;
last_sgpr = SI_PARAM_BLOCK_ID;
params[SI_PARAM_THREAD_ID] = v3i32;
num_params = SI_PARAM_THREAD_ID + 1;
break;
default:
assert(0 && "unimplemented shader");
return;
}
@@ -5741,21 +5747,26 @@ static void create_function(struct
si_shader_context *ctx)
S_0286D0_LINEAR_CENTROID_ENA(1) |
S_0286D0_FRONT_FACE_ENA(1) |
S_0286D0_POS_FIXED_PT_ENA(1));
} else if (ctx->type == PIPE_SHADER_COMPUTE) {
const unsigned *properties =
shader->selector->info.properties;
unsigned max_work_group_size =
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
- assert(max_work_group_size);
+ if (!max_work_group_size) {
+ /* This is a variable group size compute
shader,
+ * compile it for the maximum possible group
size.
+ */
+ max_work_group_size = 2048;
+ }