On Thu, Mar 8, 2018 at 9:08 AM, Samuel Pitoiset <samuel.pitoi...@gmail.com> wrote:
> RadeonSI does something similar; the VGPR decrease is a win,
> but I am not sure if we really want to implement that.
>
> Polaris10:
> Totals from affected shaders:
> SGPRS: 116376 -> 116768 (0.34 %)
> VGPRS: 76556 -> 74868 (-2.20 %)
> Spilled SGPRs: 10347 -> 10466 (1.15 %)
> Code Size: 5555072 -> 5569024 (0.25 %) bytes
> Max Waves: 9854 -> 9951 (0.98 %)
>
> Signed-off-by: Samuel Pitoiset <samuel.pitoi...@gmail.com>
> ---
>  src/amd/common/ac_nir_to_llvm.c | 118 ++++++++++++++++++++++++++++++---------
>  src/amd/common/ac_shader_abi.h  |   7 +++
>  2 files changed, 98 insertions(+), 27 deletions(-)
>
> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> index 644c85e2eb..eb0935972d 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -3131,6 +3131,7 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
>                                     nir_intrinsic_instr *instr)
>  {
>         LLVMValueRef values[8];
> +       int location = instr->variables[0]->var->data.location;
>         int idx = instr->variables[0]->var->data.driver_location;
>         int ve = instr->dest.ssa.num_components;
>         unsigned comp = instr->variables[0]->var->data.location_frac;
> @@ -3167,6 +3168,19 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
>                                              instr->num_components, vertex_index, const_index, type);
>                 }
>
> +               LLVMValueRef inputs[4];
> +
> +               if (ctx->stage == MESA_SHADER_FRAGMENT) {
> +                       ctx->abi->load_fs_inputs(ctx->abi, location,
> +                                                indir_index, const_index,
> +                                                stride, inputs);

load_fs_inputs is NULL for radeonsi. Are you sure that radeonsi doesn't
get here?
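If it can, one defensive option would be to make the callback optional and
fall back to the preloaded-inputs path from the else branch below. An
untested sketch, assuming a driver without the callback still populates
abi->inputs up front; the NULL check is the only new part:

    if (ctx->stage == MESA_SHADER_FRAGMENT && ctx->abi->load_fs_inputs) {
            ctx->abi->load_fs_inputs(ctx->abi, location,
                                     indir_index, const_index,
                                     stride, inputs);
    } else {
            unsigned index = idx +
                    (indir_index ? 0 : const_index * stride);

            /* No per-use callback: read the values preloaded at the
             * beginning of the shader. */
            memcpy(inputs, &ctx->abi->inputs[index], sizeof(inputs));
    }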
Marek

> +               } else {
> +                       unsigned index = idx +
> +                               (indir_index ? 0 : const_index * stride);
> +
> +                       memcpy(inputs, &ctx->abi->inputs[index],
> +                              sizeof(inputs));
> +               }
> +
>                 for (unsigned chan = comp; chan < ve + comp; chan++) {
>                         if (indir_index) {
>                                 unsigned count = glsl_count_attribute_slots(
> @@ -3174,14 +3188,15 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
>                                         ctx->stage == MESA_SHADER_VERTEX);
>                                 count -= chan / 4;
>                                 LLVMValueRef tmp_vec = ac_build_gather_values_extended(
> -                                       &ctx->ac, ctx->abi->inputs + idx + chan, count,
> +                                       &ctx->ac, inputs + chan, count,
>                                         stride, false, true);
>
>                                 values[chan] = LLVMBuildExtractElement(ctx->ac.builder,
>                                                                        tmp_vec,
>                                                                        indir_index, "");
> -                       } else
> -                               values[chan] = ctx->abi->inputs[idx + chan + const_index * stride];
> +                       } else {
> +                               values[chan] = inputs[chan];
> +                       }
>                 }
>                 break;
>         case nir_var_local:
> @@ -5556,45 +5571,93 @@ prepare_interp_optimize(struct radv_shader_context *ctx,
>         }
>  }
>
> +static unsigned
> +get_input_hw_index(struct radv_shader_context *ctx, unsigned idx)
> +{
> +       struct ac_shader_info *info = &ctx->shader_info->info;
> +       uint64_t mask = info->input_mask & ((1ull << idx) - 1);
> +
> +       mask &= ~(1ull << VARYING_SLOT_POS);
> +
> +       return util_bitcount64(mask);
> +}
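Side note for other reviewers: get_input_hw_index() is a popcount of the
enabled input slots strictly below idx, with VARYING_SLOT_POS masked out
because the position comes in dedicated VGPRs rather than through an
interpolated attribute slot. A standalone sketch of the same computation,
with made-up mask values (not RADV code):

    #include <stdint.h>
    #include <stdio.h>

    /* Count enabled slots strictly below idx, ignoring pos_slot. */
    static unsigned hw_index(uint64_t input_mask, unsigned pos_slot,
                             unsigned idx)
    {
            uint64_t mask = input_mask & ((1ull << idx) - 1);

            mask &= ~(1ull << pos_slot);
            return __builtin_popcountll(mask); /* ~util_bitcount64() */
    }

    int main(void)
    {
            /* Slots 0 (POS), 4 and 7 enabled. */
            uint64_t mask = (1ull << 0) | (1ull << 4) | (1ull << 7);

            /* Prints 1: for slot 7, only slot 4 counts. */
            printf("%u\n", hw_index(mask, 0, 7));
            return 0;
    }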
> +
> +/* If this is true, preload FS inputs at the beginning of shaders. Otherwise,
> + * reload them at each use. This must be true if the shader is using
> + * derivatives and KILL, because KILL can leave the WQM and then a lazy
> + * input load isn't in the WQM anymore.
> + */
> +static bool
> +radv_preload_fs_inputs(struct radv_shader_context *ctx)
> +{
> +       return ctx->shader_info->info.ps.uses_derivatives &&
> +              ctx->shader_info->info.ps.uses_kill;
> +}
> +
>  static void
> -handle_fs_inputs(struct radv_shader_context *ctx,
> -                struct nir_shader *nir)
> +radv_load_fs_inputs(struct radv_shader_context *ctx, unsigned idx,
> +                   LLVMValueRef out[4])
>  {
>         struct ac_shader_info *info = &ctx->shader_info->info;
>
> +       if (idx >= VARYING_SLOT_VAR0 ||
> +           idx == VARYING_SLOT_PNTC ||
> +           idx == VARYING_SLOT_PRIMITIVE_ID ||
> +           idx == VARYING_SLOT_LAYER) {
> +               unsigned interp_mode = info->ps.input_interp_mode[idx];
> +               unsigned interp_loc = info->ps.input_interp_loc[idx];
> +               unsigned hw_index = get_input_hw_index(ctx, idx);
> +               LLVMValueRef interp_param =
> +                       lookup_interp_param(&ctx->abi, interp_mode, interp_loc);
> +
> +               interp_fs_input(ctx, hw_index, interp_param, ctx->abi.prim_mask,
> +                               &out[0]);
> +       } else if (idx == VARYING_SLOT_POS) {
> +               for (int i = 0; i < 3; ++i)
> +                       out[i] = ctx->abi.frag_pos[i];
> +
> +               out[3] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,
> +                                      ctx->abi.frag_pos[3]);
> +       }
> +}
> +
> +static void
> +load_fs_inputs(struct ac_shader_abi *abi,
> +              unsigned location,
> +              LLVMValueRef indir_index,
> +              unsigned const_index,
> +              unsigned stride,
> +              LLVMValueRef out[4])
> +{
> +       struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
> +
> +       if (!radv_preload_fs_inputs(ctx)) {
> +               radv_load_fs_inputs(ctx, location, out);
> +       } else {
> +               unsigned index = radeon_llvm_reg_index_soa(location, 0);
> +
> +               index += (indir_index ? 0 : const_index * stride);
> +
> +               memcpy(out, &abi->inputs[index], sizeof(out[0]) * 4);
> +       }
> +}
> +
> +static void
> +handle_fs_inputs(struct radv_shader_context *ctx,
> +                struct nir_shader *nir)
> +{
>         prepare_interp_optimize(ctx, nir);
>
>         nir_foreach_variable(variable, &nir->inputs)
>                 handle_fs_input_decl(ctx, variable);
>
> -       unsigned index = 0;
> -
>         for (unsigned i = 0; i < RADEON_LLVM_MAX_INPUTS; ++i) {
> -               LLVMValueRef interp_param;
>                 LLVMValueRef *inputs = ctx->inputs + radeon_llvm_reg_index_soa(i, 0);
>
>                 if (!(ctx->shader_info->info.input_mask & (1ull << i)))
>                         continue;
>
> -               if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC ||
> -                   i == VARYING_SLOT_PRIMITIVE_ID || i == VARYING_SLOT_LAYER) {
> -                       unsigned interp_mode = info->ps.input_interp_mode[i];
> -                       unsigned interp_loc = info->ps.input_interp_loc[i];
> -
> -                       interp_param = lookup_interp_param(&ctx->abi, interp_mode,
> -                                                          interp_loc);
> -
> -                       interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask,
> -                                       inputs);
> -
> -                       ++index;
> -               } else if (i == VARYING_SLOT_POS) {
> -                       for (int i = 0; i < 3; ++i)
> -                               inputs[i] = ctx->abi.frag_pos[i];
> -
> -                       inputs[3] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,
> -                                                 ctx->abi.frag_pos[3]);
> -               }
> +               radv_load_fs_inputs(ctx, i, inputs);
>         }
>
>         if (ctx->shader_info->info.needs_multiview_view_index)
> @@ -6924,6 +6987,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
>                         ctx.abi.load_base_vertex = radv_load_base_vertex;
>                 } else if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT) {
>                         shader_info->fs.can_discard = shaders[i]->info.fs.uses_discard;
> +                       ctx.abi.load_fs_inputs = load_fs_inputs;
>                         ctx.abi.lookup_interp_param = lookup_interp_param;
>                         ctx.abi.load_sample_position = load_sample_position;
>                         ctx.abi.load_sample_mask_in = load_sample_mask_in;
> diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h
> index 901e49b1f9..8e51ce9fdd 100644
> --- a/src/amd/common/ac_shader_abi.h
> +++ b/src/amd/common/ac_shader_abi.h
> @@ -97,6 +97,13 @@ struct ac_shader_abi {
>                                unsigned const_index,
>                                LLVMTypeRef type);
>
> +       void (*load_fs_inputs)(struct ac_shader_abi *abi,
> +                              unsigned location,
> +                              LLVMValueRef indir_index,
> +                              unsigned const_index,
> +                              unsigned stride,
> +                              LLVMValueRef out[4]);
> +
>         LLVMValueRef (*load_tess_varyings)(struct ac_shader_abi *abi,
>                                            LLVMTypeRef type,
>                                            LLVMValueRef vertex_index,
> --
> 2.16.2