On 2/5/19 11:29 AM, Bas Nieuwenhuizen wrote:
On Tue, Feb 5, 2019 at 11:07 AM Samuel Pitoiset
<samuel.pitoi...@gmail.com> wrote:

On 2/5/19 10:58 AM, Bas Nieuwenhuizen wrote:
On Fri, Jan 25, 2019 at 5:27 PM Samuel Pitoiset
<samuel.pitoi...@gmail.com> wrote:
This removes some scalar loads from shaders, but it increases
the number of SET_SH_REG packets. This is currently basic but
it could be improved if needed. Inlining dynamic offsets might
also help.

Original idea from Dave Airlie.

29164 shaders in 15096 tests
Totals:
SGPRS: 1336072 -> 1365241 (2.18 %)
VGPRS: 937784 -> 934592 (-0.34 %)
Spilled SGPRs: 24751 -> 24796 (0.18 %)
Code Size: 50001672 -> 49815524 (-0.37 %) bytes
Max Waves: 208755 -> 208830 (0.04 %)

Totals from affected shaders:
SGPRS: 295018 -> 324187 (9.89 %)
VGPRS: 243108 -> 239916 (-1.31 %)
Spilled SGPRs: 1464 -> 1509 (3.07 %)
Code Size: 8028188 -> 7842040 (-2.32 %) bytes
Max Waves: 69580 -> 69655 (0.11 %)

Signed-off-by: Samuel Pitoiset <samuel.pitoi...@gmail.com>
---
   src/amd/common/ac_nir_to_llvm.c   | 23 +++++++--
   src/amd/common/ac_shader_abi.h    |  4 ++
   src/amd/vulkan/radv_cmd_buffer.c  | 78 ++++++++++++++++++++++---------
   src/amd/vulkan/radv_nir_to_llvm.c | 54 +++++++++++++++++++++
   src/amd/vulkan/radv_shader.h      | 10 ++--
   src/amd/vulkan/radv_shader_info.c |  4 ++
   6 files changed, 145 insertions(+), 28 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index f509fc31dff..db1574b5b35 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1392,10 +1392,27 @@ static LLVMValueRef visit_load_push_constant(struct 
ac_nir_context *ctx,
                                                nir_intrinsic_instr *instr)
   {
          LLVMValueRef ptr, addr;
+       LLVMValueRef src0 = get_src(ctx, instr->src[0]);
+       unsigned index = nir_intrinsic_base(instr);

-       addr = LLVMConstInt(ctx->ac.i32, nir_intrinsic_base(instr), 0);
-       addr = LLVMBuildAdd(ctx->ac.builder, addr,
-                           get_src(ctx, instr->src[0]), "");
+       addr = LLVMConstInt(ctx->ac.i32, index, 0);
+       addr = LLVMBuildAdd(ctx->ac.builder, addr, src0, "");
+
+       /* Load constant values from user SGPRS when possible, otherwise
+        * fallback to the default path that loads directly from memory.
+        */
+       if (LLVMIsConstant(src0) &&
+           index == 0 &&
+           instr->dest.ssa.bit_size == 32) {
+               unsigned offset = LLVMConstIntGetZExtValue(src0) / 4;
+               unsigned count = instr->dest.ssa.num_components;
+
+               if (offset + count <= ctx->abi->num_inline_push_consts) {
+                       return ac_build_gather_values(&ctx->ac,
+                                                     
ctx->abi->inline_push_consts + offset,
+                                                     count);
+               }
+       }

          ptr = ac_build_gep0(&ctx->ac, ctx->abi->push_constants, addr);

diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h
index ee18e6c1923..704c3d107c2 100644
--- a/src/amd/common/ac_shader_abi.h
+++ b/src/amd/common/ac_shader_abi.h
@@ -32,6 +32,8 @@ struct nir_variable;

   #define AC_LLVM_MAX_OUTPUTS (VARYING_SLOT_VAR31 + 1)

+#define AC_MAX_INLINE_PUSH_CONSTS 8
+
   enum ac_descriptor_type {
          AC_DESC_IMAGE,
          AC_DESC_FMASK,
@@ -66,6 +68,8 @@ struct ac_shader_abi {

          /* Vulkan only */
          LLVMValueRef push_constants;
+       LLVMValueRef inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS];
+       unsigned num_inline_push_consts;
          LLVMValueRef view_index;

          LLVMValueRef outputs[AC_LLVM_MAX_OUTPUTS * 4];
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index aae90290841..f80e2078da0 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -627,6 +627,23 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer 
*cmd_buffer,
          }
   }

+static void
+radv_emit_inline_push_consts(struct radv_cmd_buffer *cmd_buffer,
+                            struct radv_pipeline *pipeline,
+                            gl_shader_stage stage,
+                            int idx, int count, uint32_t *values)
+{
+       struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, 
idx);
+       uint32_t base_reg = pipeline->user_data_0[stage];
+       if (loc->sgpr_idx == -1)
+               return;
+
+       assert(loc->num_sgprs == count);
+
+       radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 
count);
+       radeon_emit_array(cmd_buffer->cs, values, count);
+}
+
   static void
   radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
                                struct radv_pipeline *pipeline)
@@ -1900,6 +1917,7 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
                  radv_get_descriptors_state(cmd_buffer, bind_point);
          struct radv_pipeline_layout *layout = pipeline->layout;
          struct radv_shader_variant *shader, *prev_shader;
+       bool need_push_constants = false;
          unsigned offset;
          void *ptr;
          uint64_t va;
@@ -1909,37 +1927,55 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
              (!layout->push_constant_size && !layout->dynamic_offset_count))
                  return;

-       if (!radv_cmd_buffer_upload_alloc(cmd_buffer, 
layout->push_constant_size +
-                                         16 * layout->dynamic_offset_count,
-                                         256, &offset, &ptr))
-               return;
+       radv_foreach_stage(stage, stages) {
+               if (!pipeline->shaders[stage])
+                       continue;

-       memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
-       memcpy((char*)ptr + layout->push_constant_size,
-              descriptors_state->dynamic_buffers,
-              16 * layout->dynamic_offset_count);
+               need_push_constants |= 
pipeline->shaders[stage]->info.info.loads_push_constants;
+               need_push_constants |= 
pipeline->shaders[stage]->info.info.loads_dynamic_offsets;

-       va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
-       va += offset;
+               uint8_t count = 
pipeline->shaders[stage]->info.info.num_inline_push_consts;

-       MAYBE_UNUSED unsigned cdw_max = 
radeon_check_space(cmd_buffer->device->ws,
-                                                          cmd_buffer->cs, 
MESA_SHADER_STAGES * 4);
+               radv_emit_inline_push_consts(cmd_buffer, pipeline, stage,
+                                            AC_UD_INLINE_PUSH_CONSTANTS,
+                                            count,
+                                            (uint32_t 
*)&cmd_buffer->push_constants[0]);
+       }

-       prev_shader = NULL;
-       radv_foreach_stage(stage, stages) {
-               shader = radv_get_shader(pipeline, stage);
+       if (need_push_constants) {
+               if (!radv_cmd_buffer_upload_alloc(cmd_buffer, 
layout->push_constant_size +
+                                                 16 * 
layout->dynamic_offset_count,
+                                                 256, &offset, &ptr))
+                       return;
+
+               memcpy(ptr, cmd_buffer->push_constants, 
layout->push_constant_size);
+               memcpy((char*)ptr + layout->push_constant_size,
+                      descriptors_state->dynamic_buffers,
+                      16 * layout->dynamic_offset_count);
+
+               va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
+               va += offset;
+
+               MAYBE_UNUSED unsigned cdw_max =
+                       radeon_check_space(cmd_buffer->device->ws,
+                                          cmd_buffer->cs, MESA_SHADER_STAGES * 
4);
+
+               prev_shader = NULL;
+               radv_foreach_stage(stage, stages) {
+                       shader = radv_get_shader(pipeline, stage);

-               /* Avoid redundantly emitting the address for merged stages. */
-               if (shader && shader != prev_shader) {
-                       radv_emit_userdata_address(cmd_buffer, pipeline, stage,
-                                                  AC_UD_PUSH_CONSTANTS, va);
+                       /* Avoid redundantly emitting the address for merged 
stages. */
+                       if (shader && shader != prev_shader) {
+                               radv_emit_userdata_address(cmd_buffer, 
pipeline, stage,
+                                                          
AC_UD_PUSH_CONSTANTS, va);

-                       prev_shader = shader;
+                               prev_shader = shader;
+                       }
                  }
+               assert(cmd_buffer->cs->cdw <= cdw_max);
          }

          cmd_buffer->push_constant_stages &= ~stages;
-       assert(cmd_buffer->cs->cdw <= cdw_max);
   }

   static void
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
b/src/amd/vulkan/radv_nir_to_llvm.c
index 11417c5991b..1cffa748d15 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -627,6 +627,47 @@ count_vs_user_sgprs(struct radv_shader_context *ctx)
          return count;
   }

+static void allocate_inline_push_consts(struct radv_shader_context *ctx,
+                                       struct user_sgpr_info *user_sgpr_info)
+{
+       uint8_t remaining_sgprs = user_sgpr_info->remaining_sgprs;
+
+       /* Only supported if shaders don't have indirect push constants. */
+       if (ctx->shader_info->info.has_indirect_push_constants)
+               return;
+
+       /* Only supported for 32-bit push constants. */
+       if (!ctx->shader_info->info.has_32bit_push_constants)
+               return;
+
+       /* Only supported if base starts from 0. */
+       if (ctx->shader_info->info.min_push_constant_used != 0)
+               return;
+
+       uint8_t num_push_consts =
+               (ctx->shader_info->info.max_push_constant_used -
+                ctx->shader_info->info.min_push_constant_used) / 4;
I think you need +1? if min_push_constant_used ==
max_push_constant_used you still use 1 push constant. (The code in the
info pass adds range, but that does not include the size of the loaded
thing?)
No, as min is 'base' and max is 'base + range'.

Now that I think about it, the shader info pass does not take into
account the size of the variable at all right? What happens with a
vec4? The range does not include the object itself right?
If it's a vec4 starting from base=0, range should be 16, that works for
vec4.
hmm, so it includes the size of type. If I read the spir-v parser
correctly we set range very large though (looks like range is always
the total size of all push constants?)

Yes, it is. This implementation doesn't yet support holes. If the range is too large, the driver won't inline.

I assume this is something that can be improved.


+
+       /* Check if the number of user SGPRs is large enough. */
+       if (num_push_consts < remaining_sgprs) {
+               ctx->shader_info->info.num_inline_push_consts = num_push_consts;
+       } else {
+               ctx->shader_info->info.num_inline_push_consts = remaining_sgprs;
+       }
+
+       /* Clamp to the maximum number of allowed inlined push constants. */
+       if (ctx->shader_info->info.num_inline_push_consts > 
AC_MAX_INLINE_PUSH_CONSTS)
+               ctx->shader_info->info.num_inline_push_consts = 
AC_MAX_INLINE_PUSH_CONSTS;
+
+       if (ctx->shader_info->info.num_inline_push_consts == num_push_consts &&
+           !ctx->shader_info->info.loads_dynamic_offsets) {
This won't work, as we can reach here even if there are some accesses
which don't use inline push constants, e.g. some non 32-bit accesses.
Good catch! One possible fix is to change 'has_32bit_push_constants' to
'has_only_32bit_push_constants' and bail out if it's false.

What do you think?
That works.

+               /* Disable the default push constants path if all constants are
+                * inlined and if shaders don't use dynamic descriptors.
+                */
+               ctx->shader_info->info.loads_push_constants = false;
+       }
+}
+
   static void allocate_user_sgprs(struct radv_shader_context *ctx,
                                  gl_shader_stage stage,
                                  bool has_previous_stage,
@@ -706,6 +747,8 @@ static void allocate_user_sgprs(struct radv_shader_context 
*ctx,
          } else {
                  user_sgpr_info->remaining_sgprs = remaining_sgprs - 
num_desc_set;
          }
+
+       allocate_inline_push_consts(ctx, user_sgpr_info);
   }

   static void
@@ -735,6 +778,12 @@ declare_global_input_sgprs(struct radv_shader_context *ctx,
                  add_arg(args, ARG_SGPR, type, &ctx->abi.push_constants);
          }

+       for (unsigned i = 0; i < ctx->shader_info->info.num_inline_push_consts; 
i++) {
+               add_arg(args, ARG_SGPR, ctx->ac.i32,
+                       &ctx->abi.inline_push_consts[i]);
+       }
+       ctx->abi.num_inline_push_consts = 
ctx->shader_info->info.num_inline_push_consts;
+
          if (ctx->shader_info->info.so.num_outputs) {
                  add_arg(args, ARG_SGPR,
                          ac_array_in_const32_addr_space(ctx->ac.v4i32),
@@ -853,6 +902,11 @@ set_global_input_locs(struct radv_shader_context *ctx,
                  set_loc_shader_ptr(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx);
          }

+       if (ctx->shader_info->info.num_inline_push_consts) {
+               set_loc_shader(ctx, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx,
+                              ctx->shader_info->info.num_inline_push_consts);
+       }
+
          if (ctx->streamout_buffers) {
                  set_loc_shader_ptr(ctx, AC_UD_STREAMOUT_BUFFERS,
                                 user_sgpr_idx);
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index 09bc5c2d4a9..d1bc467c5d9 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -129,10 +129,11 @@ struct radv_nir_compiler_options {
   enum radv_ud_index {
          AC_UD_SCRATCH_RING_OFFSETS = 0,
          AC_UD_PUSH_CONSTANTS = 1,
-       AC_UD_INDIRECT_DESCRIPTOR_SETS = 2,
-       AC_UD_VIEW_INDEX = 3,
-       AC_UD_STREAMOUT_BUFFERS = 4,
-       AC_UD_SHADER_START = 5,
+       AC_UD_INLINE_PUSH_CONSTANTS = 2,
+       AC_UD_INDIRECT_DESCRIPTOR_SETS = 3,
+       AC_UD_VIEW_INDEX = 4,
+       AC_UD_STREAMOUT_BUFFERS = 5,
+       AC_UD_SHADER_START = 6,
          AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
          AC_UD_VS_BASE_VERTEX_START_INSTANCE,
          AC_UD_VS_MAX_UD,
@@ -167,6 +168,7 @@ struct radv_shader_info {
          uint8_t max_push_constant_used;
          bool has_32bit_push_constants;
          bool has_indirect_push_constants;
+       uint8_t num_inline_push_consts;
          uint32_t desc_set_used_mask;
          bool needs_multiview_view_index;
          bool uses_invocation_id;
diff --git a/src/amd/vulkan/radv_shader_info.c 
b/src/amd/vulkan/radv_shader_info.c
index cabe2a470a0..900bca1d097 100644
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@@ -211,6 +211,10 @@ gather_push_constant_info(const nir_shader *nir,
          if (base < info->min_push_constant_used)
                  info->min_push_constant_used = base;

+       /* TODO: handle base not 0. */
+       if (base != 0)
+               info->min_push_constant_used = -1;
+
          info->loads_push_constants = true;
   }

--
2.20.1

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to