Re: [Mesa-dev] [PATCH] radeonsi: support TGSI compute shaders with variable block size

Nicolai Hähnle Fri, 09 Sep 2016 09:12:23 -0700

On 09.09.2016 18:07, Samuel Pitoiset wrote:



On 09/09/2016 02:37 PM, Ilia Mirkin wrote:

On Fri, Sep 9, 2016 at 8:29 AM, Marek Olšák <mar...@gmail.com> wrote:

On Fri, Sep 9, 2016 at 10:12 AM, Nicolai Hähnle <nhaeh...@gmail.com>
wrote:

From: Nicolai Hähnle <nicolai.haeh...@amd.com>

Not sure if it's possible to avoid programming the block size twice
(once for
the userdata and once for the dispatch).

Since the shaders are compiled with a pessimistic upper limit on the
number of
registers, asynchronously compiling variants may be worth
considering in the
future if we observe the shaders to be dispatched with small block
sizes.
---
I think this is sufficient to support variable group sizes on
radeonsi, but
it's completely untested. Do you keep the latest version of your
series in a
public repository somewhere?

 src/gallium/drivers/radeonsi/si_compute.c | 10 +++++++++-
 src/gallium/drivers/radeonsi/si_shader.c  | 29
++++++++++++++++++++---------
 src/gallium/drivers/radeonsi/si_shader.h  |  4 +++-
 3 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c
b/src/gallium/drivers/radeonsi/si_compute.c
index 5041761..26e096c 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -379,25 +379,33 @@ static void si_setup_tgsi_grid(struct
si_context *sctx,
                for (i = 0; i < 3; ++i) {
                        radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
                        radeon_emit(cs,
COPY_DATA_SRC_SEL(COPY_DATA_MEM) |

COPY_DATA_DST_SEL(COPY_DATA_REG));
                        radeon_emit(cs, (va +  4 * i));
                        radeon_emit(cs, (va + 4 * i) >> 32);
                        radeon_emit(cs, (grid_size_reg >> 2) + i);
                        radeon_emit(cs, 0);
                }
        } else {
+               struct si_compute *program =
sctx->cs_shader_state.program;
+               bool variable_group_size =
+
program->shader.selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH]
== 0;

-               radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
+               radeon_set_sh_reg_seq(cs, grid_size_reg,
variable_group_size ? 6 : 3);
                radeon_emit(cs, info->grid[0]);
                radeon_emit(cs, info->grid[1]);
                radeon_emit(cs, info->grid[2]);
+               if (variable_group_size) {
+                       radeon_emit(cs, info->block[0]);
+                       radeon_emit(cs, info->block[1]);
+                       radeon_emit(cs, info->block[2]);
+               }
        }
 }

 static void si_emit_dispatch_packets(struct si_context *sctx,
                                      const struct pipe_grid_info
*info)
 {
        struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
        bool render_cond_bit = sctx->b.render_cond &&
!sctx->b.render_cond_force_off;
        unsigned waves_per_threadgroup =
                DIV_ROUND_UP(info->block[0] * info->block[1] *
info->block[2], 64);
diff --git a/src/gallium/drivers/radeonsi/si_shader.c
b/src/gallium/drivers/radeonsi/si_shader.c
index 0b7de18..730ee21 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1783,30 +1783,35 @@ static void declare_system_value(

        case TGSI_SEMANTIC_GRID_SIZE:
                value = LLVMGetParam(radeon_bld->main_fn,
SI_PARAM_GRID_SIZE);
                break;

        case TGSI_SEMANTIC_BLOCK_SIZE:
        {
                LLVMValueRef values[3];
                unsigned i;
                unsigned *properties =
ctx->shader->selector->info.properties;
-               unsigned sizes[3] = {
-                       properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
-
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
-                       properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
-               };

-               for (i = 0; i < 3; ++i)
-                       values[i] = lp_build_const_int32(gallivm,
sizes[i]);
+               if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH]
!= 0) {
+                       unsigned sizes[3] = {
+
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
+
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
+
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
+                       };
+
+                       for (i = 0; i < 3; ++i)
+                               values[i] =
lp_build_const_int32(gallivm, sizes[i]);

-               value = lp_build_gather_values(gallivm, values, 3);
+                       value = lp_build_gather_values(gallivm,
values, 3);
+               } else {
+                       value = LLVMGetParam(radeon_bld->main_fn,
SI_PARAM_BLOCK_SIZE);
+               }
                break;
        }

        case TGSI_SEMANTIC_BLOCK_ID:
                value = LLVMGetParam(radeon_bld->main_fn,
SI_PARAM_BLOCK_ID);
                break;

        case TGSI_SEMANTIC_THREAD_ID:
                value = LLVMGetParam(radeon_bld->main_fn,
SI_PARAM_THREAD_ID);
                break;
@@ -5705,20 +5710,21 @@ static void create_function(struct
si_shader_context *ctx)

                        for (i = 0; i < num_return_sgprs; i++)
                                returns[i] = ctx->i32;
                        for (; i < num_returns; i++)
                                returns[i] = ctx->f32;
                }
                break;

        case PIPE_SHADER_COMPUTE:
                params[SI_PARAM_GRID_SIZE] = v3i32;
+               params[SI_PARAM_BLOCK_SIZE] = v3i32;
                params[SI_PARAM_BLOCK_ID] = v3i32;
                last_sgpr = SI_PARAM_BLOCK_ID;

                params[SI_PARAM_THREAD_ID] = v3i32;
                num_params = SI_PARAM_THREAD_ID + 1;
                break;
        default:
                assert(0 && "unimplemented shader");
                return;
        }
@@ -5741,21 +5747,26 @@ static void create_function(struct
si_shader_context *ctx)

S_0286D0_LINEAR_CENTROID_ENA(1) |
                                          S_0286D0_FRONT_FACE_ENA(1) |

S_0286D0_POS_FIXED_PT_ENA(1));
        } else if (ctx->type == PIPE_SHADER_COMPUTE) {
                const unsigned *properties =
shader->selector->info.properties;
                unsigned max_work_group_size =

properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *

properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *

properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];

-               assert(max_work_group_size);
+               if (!max_work_group_size) {
+                       /* This is a variable group size compute
shader,
+                        * compile it for the maximum possible group
size.
+                        */
+                       max_work_group_size = 2048;
+               }


The VGPR limit is 32. That's too limiting. I'd prefer it if
GL_MAX_COMPUTE_VARIABLE_WORK_GROUP_INVOCATIONS_ARB was set to 512 or
1024. That should move the limit to 128 or 64, respectively.


FWIW I'd definitely like to see that for nouveau eventually as well
(basically have the same issue, where we have to limit the # of regs
used artificially). But perhaps the initial iteration can just work
with the same maxes, and then be improved as a follow-on?


What about adding a new cap for MaxComputeVariableGroupSizeARB and
enable the ext based on that cap (suggested by Marek and I do agree) ?


Yes, please. That would solve all the issues nicely.

Nicolai


  -ilia
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] radeonsi: support TGSI compute shaders with variable block size

Reply via email to