On Fri, Sep 9, 2016 at 10:12 AM, Nicolai Hähnle <nhaeh...@gmail.com> wrote:
> From: Nicolai Hähnle <nicolai.haeh...@amd.com>
>
> Not sure if it's possible to avoid programming the block size twice (once for
> the userdata and once for the dispatch).
>
> Since the shaders are compiled with a pessimistic upper limit on the number of
> registers, asynchronously compiling variants may be worth considering in the
> future if we observe the shaders to be dispatched with small block sizes.
> ---
> I think this is sufficient to support variable group sizes on radeonsi, but
> it's completely untested. Do you keep the latest version of your series in a
> public repository somewhere?
>
>  src/gallium/drivers/radeonsi/si_compute.c | 10 +++++++++-
>  src/gallium/drivers/radeonsi/si_shader.c  | 29 ++++++++++++++++++++---------
>  src/gallium/drivers/radeonsi/si_shader.h  |  4 +++-
>  3 files changed, 32 insertions(+), 11 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
> index 5041761..26e096c 100644
> --- a/src/gallium/drivers/radeonsi/si_compute.c
> +++ b/src/gallium/drivers/radeonsi/si_compute.c
> @@ -379,25 +379,33 @@ static void si_setup_tgsi_grid(struct si_context *sctx,
>                  for (i = 0; i < 3; ++i) {
>                          radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
>                          radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
>                                          COPY_DATA_DST_SEL(COPY_DATA_REG));
>                          radeon_emit(cs, (va + 4 * i));
>                          radeon_emit(cs, (va + 4 * i) >> 32);
>                          radeon_emit(cs, (grid_size_reg >> 2) + i);
>                          radeon_emit(cs, 0);
>                  }
>          } else {
> +                struct si_compute *program = sctx->cs_shader_state.program;
> +                bool variable_group_size =
> +                        program->shader.selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0;
>
> -                radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
> +                radeon_set_sh_reg_seq(cs, grid_size_reg, variable_group_size ? 6 : 3);
>                  radeon_emit(cs, info->grid[0]);
>                  radeon_emit(cs, info->grid[1]);
>                  radeon_emit(cs, info->grid[2]);
> +                if (variable_group_size) {
> +                        radeon_emit(cs, info->block[0]);
> +                        radeon_emit(cs, info->block[1]);
> +                        radeon_emit(cs, info->block[2]);
> +                }
>          }
>  }
>
>  static void si_emit_dispatch_packets(struct si_context *sctx,
>                                       const struct pipe_grid_info *info)
>  {
>          struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>          bool render_cond_bit = sctx->b.render_cond &&
>                                 !sctx->b.render_cond_force_off;
>          unsigned waves_per_threadgroup =
>                  DIV_ROUND_UP(info->block[0] * info->block[1] *
>                               info->block[2], 64);
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index 0b7de18..730ee21 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -1783,30 +1783,35 @@ static void declare_system_value(
>
>          case TGSI_SEMANTIC_GRID_SIZE:
>                  value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
>                  break;
>
>          case TGSI_SEMANTIC_BLOCK_SIZE:
>          {
>                  LLVMValueRef values[3];
>                  unsigned i;
>                  unsigned *properties = ctx->shader->selector->info.properties;
> -                unsigned sizes[3] = {
> -                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
> -                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
> -                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
> -                };
>
> -                for (i = 0; i < 3; ++i)
> -                        values[i] = lp_build_const_int32(gallivm, sizes[i]);
> +                if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
> +                        unsigned sizes[3] = {
> +                                properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
> +                                properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
> +                                properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
> +                        };
> +
> +                        for (i = 0; i < 3; ++i)
> +                                values[i] = lp_build_const_int32(gallivm, sizes[i]);
>
> -                value = lp_build_gather_values(gallivm, values, 3);
> +                        value = lp_build_gather_values(gallivm, values, 3);
> +                } else {
> +                        value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_SIZE);
> +                }
>                  break;
>          }
>
>          case TGSI_SEMANTIC_BLOCK_ID:
>                  value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
>                  break;
>
>          case TGSI_SEMANTIC_THREAD_ID:
>                  value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
>                  break;
> @@ -5705,20 +5710,21 @@ static void create_function(struct si_shader_context *ctx)
>
>                  for (i = 0; i < num_return_sgprs; i++)
>                          returns[i] = ctx->i32;
>                  for (; i < num_returns; i++)
>                          returns[i] = ctx->f32;
>          }
>                  break;
>
>          case PIPE_SHADER_COMPUTE:
>                  params[SI_PARAM_GRID_SIZE] = v3i32;
> +                params[SI_PARAM_BLOCK_SIZE] = v3i32;
>                  params[SI_PARAM_BLOCK_ID] = v3i32;
>                  last_sgpr = SI_PARAM_BLOCK_ID;
>
>                  params[SI_PARAM_THREAD_ID] = v3i32;
>                  num_params = SI_PARAM_THREAD_ID + 1;
>                  break;
>          default:
>                  assert(0 && "unimplemented shader");
>                  return;
>          }
> @@ -5741,21 +5747,26 @@ static void create_function(struct si_shader_context *ctx)
>                            S_0286D0_LINEAR_CENTROID_ENA(1) |
>                            S_0286D0_FRONT_FACE_ENA(1) |
>                            S_0286D0_POS_FIXED_PT_ENA(1));
>          } else if (ctx->type == PIPE_SHADER_COMPUTE) {
>                  const unsigned *properties = shader->selector->info.properties;
>                  unsigned max_work_group_size =
>                                  properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
>                                  properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
>                                  properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
>
> -                assert(max_work_group_size);
> +                if (!max_work_group_size) {
> +                        /* This is a variable group size compute shader,
> +                         * compile it for the maximum possible group size.
> +                         */
> +                        max_work_group_size = 2048;
> +                }
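
(For context: with a variable group size the block dimensions are only known at
dispatch time, so they reach the driver through pipe_grid_info rather than
through the TGSI_PROPERTY_CS_FIXED_BLOCK_* properties, which are 0 for such
shaders. A minimal sketch of what such a dispatch looks like on the Gallium
side follows; the function name and the concrete sizes are made up for
illustration, only pipe_grid_info, bind_compute_state and launch_grid are
existing Gallium interfaces.)

/* Illustration only: a variable-group-size dispatch as seen through the
 * Gallium interface. With the patch above, si_setup_tgsi_grid() sees
 * TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH == 0 and forwards info->block[]
 * to the shader via the new SI_PARAM_BLOCK_SIZE user SGPRs.
 */
#include "pipe/p_context.h"
#include "pipe/p_state.h"

static void dispatch_variable_group_size(struct pipe_context *pipe,
                                         void *compute_state)
{
        struct pipe_grid_info info = {0};

        /* Group size chosen by the application at dispatch time,
         * e.g. via glDispatchComputeGroupSizeARB(). */
        info.block[0] = 64;
        info.block[1] = 1;
        info.block[2] = 1;

        /* Number of groups. */
        info.grid[0] = 256;
        info.grid[1] = 1;
        info.grid[2] = 1;

        pipe->bind_compute_state(pipe, compute_state);
        pipe->launch_grid(pipe, &info);
}
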
The VGPR limit is 32. That's too limiting. I'd prefer it if
GL_MAX_COMPUTE_VARIABLE_WORK_GROUP_INVOCATIONS_ARB was set to 512 or 1024.
That should move the limit to 128 or 64, respectively.

Marek
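
For reference, the arithmetic behind those numbers, assuming the usual GCN
parameters (wave size 64, 4 SIMDs per CU, 256 VGPRs per SIMD, a VGPR
allocation granularity of 4, and all waves of a group resident on one CU);
the helper below is only an illustration of the calculation, not driver code:

#include <stdio.h>

/* Sketch: VGPRs left per wave when the compiler must assume the largest
 * allowed work group, which has to fit on a single CU. 2048 invocations
 * -> 32 waves -> 8 waves per SIMD -> 256/8 = 32 VGPRs, rounded down to
 * the 4-register allocation granule. */
static unsigned vgprs_per_wave(unsigned max_invocations)
{
        unsigned waves_per_group = (max_invocations + 63) / 64;
        unsigned waves_per_simd = (waves_per_group + 3) / 4;

        return (256 / waves_per_simd) & ~3u;
}

int main(void)
{
        static const unsigned sizes[] = { 2048, 1024, 512 };
        int i;

        for (i = 0; i < 3; i++)
                printf("%4u invocations -> %3u VGPRs per wave\n",
                       sizes[i], vgprs_per_wave(sizes[i]));
        return 0; /* prints 32, 64 and 128, matching the limits above */
}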