Re: [Mesa-dev] [PATCH] radeonsi: support TGSI compute shaders with variable block size

Samuel Pitoiset Fri, 09 Sep 2016 09:07:08 -0700


On 09/09/2016 10:12 AM, Nicolai Hähnle wrote:

From: Nicolai Hähnle <nicolai.haeh...@amd.com>

Not sure if it's possible to avoid programming the block size twice (once for
the userdata and once for the dispatch).

Since the shaders are compiled with a pessimistic upper limit on the number of
registers, asynchronously compiling variants may be worth considering in the
future if we observe the shaders to be dispatched with small block sizes.
---
I think this is sufficient to support variable group sizes on radeonsi, but
it's completely untested. Do you keep the latest version of your series in a
public repository somewhere?


Yes, you can find the current version here:
https://cgit.freedesktop.org/~hakzsam/mesa/log/?h=arb_compute_variable_group_size

The next one will be in the arb_compute_variable_group_size_v1 branch(once I have fixed all the things locally).


 src/gallium/drivers/radeonsi/si_compute.c | 10 +++++++++-
 src/gallium/drivers/radeonsi/si_shader.c  | 29 ++++++++++++++++++++---------
 src/gallium/drivers/radeonsi/si_shader.h  |  4 +++-
 3 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 5041761..26e096c 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -379,25 +379,33 @@ static void si_setup_tgsi_grid(struct si_context *sctx,
                for (i = 0; i < 3; ++i) {
                        radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
                        radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
                                        COPY_DATA_DST_SEL(COPY_DATA_REG));
                        radeon_emit(cs, (va +  4 * i));
                        radeon_emit(cs, (va + 4 * i) >> 32);
                        radeon_emit(cs, (grid_size_reg >> 2) + i);
                        radeon_emit(cs, 0);
                }
        } else {
+               struct si_compute *program = sctx->cs_shader_state.program;
+               bool variable_group_size =
+                       
program->shader.selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] 
== 0;

-               radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
+               radeon_set_sh_reg_seq(cs, grid_size_reg, variable_group_size ? 
6 : 3);
                radeon_emit(cs, info->grid[0]);
                radeon_emit(cs, info->grid[1]);
                radeon_emit(cs, info->grid[2]);
+               if (variable_group_size) {
+                       radeon_emit(cs, info->block[0]);
+                       radeon_emit(cs, info->block[1]);
+                       radeon_emit(cs, info->block[2]);
+               }
        }
 }

 static void si_emit_dispatch_packets(struct si_context *sctx,
                                      const struct pipe_grid_info *info)
 {
        struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
        bool render_cond_bit = sctx->b.render_cond && 
!sctx->b.render_cond_force_off;
        unsigned waves_per_threadgroup =
                DIV_ROUND_UP(info->block[0] * info->block[1] * info->block[2], 
64);
diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index 0b7de18..730ee21 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1783,30 +1783,35 @@ static void declare_system_value(

        case TGSI_SEMANTIC_GRID_SIZE:
                value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
                break;

        case TGSI_SEMANTIC_BLOCK_SIZE:
        {
                LLVMValueRef values[3];
                unsigned i;
                unsigned *properties = ctx->shader->selector->info.properties;
-               unsigned sizes[3] = {
-                       properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
-                       properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
-                       properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
-               };

-               for (i = 0; i < 3; ++i)
-                       values[i] = lp_build_const_int32(gallivm, sizes[i]);
+               if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
+                       unsigned sizes[3] = {
+                               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
+                               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
+                               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
+                       };
+
+                       for (i = 0; i < 3; ++i)
+                               values[i] = lp_build_const_int32(gallivm, 
sizes[i]);

-               value = lp_build_gather_values(gallivm, values, 3);
+                       value = lp_build_gather_values(gallivm, values, 3);
+               } else {
+                       value = LLVMGetParam(radeon_bld->main_fn, 
SI_PARAM_BLOCK_SIZE);
+               }
                break;
        }

        case TGSI_SEMANTIC_BLOCK_ID:
                value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
                break;

        case TGSI_SEMANTIC_THREAD_ID:
                value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
                break;
@@ -5705,20 +5710,21 @@ static void create_function(struct si_shader_context 
*ctx)

                        for (i = 0; i < num_return_sgprs; i++)
                                returns[i] = ctx->i32;
                        for (; i < num_returns; i++)
                                returns[i] = ctx->f32;
                }
                break;

        case PIPE_SHADER_COMPUTE:
                params[SI_PARAM_GRID_SIZE] = v3i32;
+               params[SI_PARAM_BLOCK_SIZE] = v3i32;
                params[SI_PARAM_BLOCK_ID] = v3i32;
                last_sgpr = SI_PARAM_BLOCK_ID;

                params[SI_PARAM_THREAD_ID] = v3i32;
                num_params = SI_PARAM_THREAD_ID + 1;
                break;
        default:
                assert(0 && "unimplemented shader");
                return;
        }
@@ -5741,21 +5747,26 @@ static void create_function(struct si_shader_context 
*ctx)
                                          S_0286D0_LINEAR_CENTROID_ENA(1) |
                                          S_0286D0_FRONT_FACE_ENA(1) |
                                          S_0286D0_POS_FIXED_PT_ENA(1));
        } else if (ctx->type == PIPE_SHADER_COMPUTE) {
                const unsigned *properties = shader->selector->info.properties;
                unsigned max_work_group_size =
                               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
                               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
                               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];

-               assert(max_work_group_size);
+               if (!max_work_group_size) {
+                       /* This is a variable group size compute shader,
+                        * compile it for the maximum possible group size.
+                        */
+                       max_work_group_size = 2048;
+               }

                radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
                                          "amdgpu-max-work-group-size",
                                          max_work_group_size);
        }

        shader->info.num_input_sgprs = 0;
        shader->info.num_input_vgprs = 0;

        for (i = 0; i <= last_sgpr; ++i)
diff --git a/src/gallium/drivers/radeonsi/si_shader.h 
b/src/gallium/drivers/radeonsi/si_shader.h
index fc1b22d..afdb3f5 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -120,21 +120,22 @@ enum {
        /* GS limits */
        SI_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS,
        SI_GSCOPY_NUM_USER_SGPR = SI_SGPR_RW_BUFFERS_HI + 1,

        /* PS only */
        SI_SGPR_ALPHA_REF       = SI_NUM_RESOURCE_SGPRS,
        SI_PS_NUM_USER_SGPR,

        /* CS only */
        SI_SGPR_GRID_SIZE = SI_NUM_RESOURCE_SGPRS,
-       SI_CS_NUM_USER_SGPR = SI_SGPR_GRID_SIZE + 3
+       SI_SGPR_BLOCK_SIZE = SI_SGPR_GRID_SIZE + 3,
+       SI_CS_NUM_USER_SGPR = SI_SGPR_BLOCK_SIZE + 3
 };

 /* LLVM function parameter indices */
 enum {
        SI_PARAM_RW_BUFFERS,
        SI_PARAM_CONST_BUFFERS,
        SI_PARAM_SAMPLERS,
        SI_PARAM_IMAGES,
        SI_PARAM_SHADER_BUFFERS,
        SI_NUM_RESOURCE_PARAMS,
@@ -210,20 +211,21 @@ enum {
        SI_PARAM_POS_Y_FLOAT,
        SI_PARAM_POS_Z_FLOAT,
        SI_PARAM_POS_W_FLOAT,
        SI_PARAM_FRONT_FACE,
        SI_PARAM_ANCILLARY,
        SI_PARAM_SAMPLE_COVERAGE,
        SI_PARAM_POS_FIXED_PT,

        /* CS only parameters */
        SI_PARAM_GRID_SIZE = SI_NUM_RESOURCE_PARAMS,
+       SI_PARAM_BLOCK_SIZE,
        SI_PARAM_BLOCK_ID,
        SI_PARAM_THREAD_ID,

        SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */
 };

 /* SI-specific system values. */
 enum {
        TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI = TGSI_SEMANTIC_COUNT,
        TGSI_SEMANTIC_DEFAULT_TESSINNER_SI,

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] radeonsi: support TGSI compute shaders with variable block size

Reply via email to