On 01/11/2018 04:03 AM, Dave Airlie wrote:
From: Dave Airlie <airl...@redhat.com>

Instead of putting the push constants into the upload buffer,
if we have space in the sgprs we can upload the per-stage
constants into the shaders directly.

This saves a few reads from memory in the meta shaders;
we should also be able to inline other objects, like
descriptors.

Signed-off-by: Dave Airlie <airl...@redhat.com>
---
  src/amd/common/ac_nir_to_llvm.c  | 93 ++++++++++++++++++++++++++++++++++++----
  src/amd/common/ac_nir_to_llvm.h  |  4 ++
  src/amd/common/ac_shader_info.c  |  5 ++-
  src/amd/common/ac_shader_info.h  |  1 +
  src/amd/vulkan/radv_cmd_buffer.c | 74 ++++++++++++++++++++++++--------
  5 files changed, 150 insertions(+), 27 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index c00220a9c3..818ce40168 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -92,6 +92,7 @@ struct nir_to_llvm_context {
        LLVMValueRef descriptor_sets[AC_UD_MAX_SETS];
        LLVMValueRef ring_offsets;
        LLVMValueRef push_constants;
+       LLVMValueRef inline_push_consts[AC_UD_MAX_INLINE_PUSH_CONST];
        LLVMValueRef view_index;
        LLVMValueRef num_work_groups;
        LLVMValueRef workgroup_ids[3];
@@ -243,7 +244,7 @@ static void set_llvm_calling_convention(LLVMValueRef func,
        LLVMSetFunctionCallConv(func, calling_conv);
  }
-#define MAX_ARGS 23
+#define MAX_ARGS 32
  struct arg_info {
        LLVMTypeRef types[MAX_ARGS];
        LLVMValueRef *assign[MAX_ARGS];
@@ -538,6 +539,8 @@ struct user_sgpr_info {
        bool need_ring_offsets;
        uint8_t sgpr_count;
        bool indirect_all_descriptor_sets;
+       uint8_t base_inline_push_consts;
+       uint8_t num_inline_push_consts;
  };
static void allocate_user_sgprs(struct nir_to_llvm_context *ctx,
@@ -609,8 +612,45 @@ static void allocate_user_sgprs(struct nir_to_llvm_context 
*ctx,
        } else {
                user_sgpr_info->sgpr_count += 
util_bitcount(ctx->shader_info->info.desc_set_used_mask) * 2;
        }
+
+       if (ctx->shader_info->info.loads_push_constants) {
+               uint32_t remaining_sgprs = 16 - user_sgpr_info->sgpr_count;

This can be 32 on GFX9. Also, please have a look at the TODO above: the number of user SGPRs has to be fixed first. That should be quite trivial, but I haven't sent a patch yet.

+               if (!ctx->shader_info->info.has_indirect_push_constants &&
+                   !ctx->shader_info->info.loads_dynamic_offsets)
+                       remaining_sgprs += 2;
+
+               if (ctx->options->layout->push_constant_size) {
+                       uint8_t num_32bit_push_consts = 
(ctx->shader_info->info.max_push_constant_used -
+                                                        
ctx->shader_info->info.min_push_constant_used) / 4;
+                       user_sgpr_info->base_inline_push_consts = 
ctx->shader_info->info.min_push_constant_used / 4;
+
+                       if (num_32bit_push_consts < remaining_sgprs) {
+                               user_sgpr_info->num_inline_push_consts = 
num_32bit_push_consts;
+                               if 
(!ctx->shader_info->info.has_indirect_push_constants)
+                                       
ctx->shader_info->info.loads_push_constants = false;
+                       } else {
+                               user_sgpr_info->num_inline_push_consts = 
remaining_sgprs;
+                       }
+
+                       if (user_sgpr_info->num_inline_push_consts > 
AC_UD_MAX_INLINE_PUSH_CONST)
+                               user_sgpr_info->num_inline_push_consts = 
AC_UD_MAX_INLINE_PUSH_CONST;
+               }
+       }
  }
+static void
+declare_inline_push_consts(struct nir_to_llvm_context *ctx,
+                          gl_shader_stage stage,
+                          const struct user_sgpr_info *user_sgpr_info,
+                          struct arg_info *args)
+{
+       ctx->shader_info->inline_push_const_mask = (1 << 
user_sgpr_info->num_inline_push_consts) - 1;
+       ctx->shader_info->inline_push_const_mask <<= 
user_sgpr_info->base_inline_push_consts;
+
+       for (unsigned i = 0; i < user_sgpr_info->num_inline_push_consts; i++)
+               add_arg(args, ARG_SGPR, ctx->ac.i32, 
&ctx->inline_push_consts[i]);
+
+}
  static void
  declare_global_input_sgprs(struct nir_to_llvm_context *ctx,
                           gl_shader_stage stage,
@@ -644,6 +684,9 @@ declare_global_input_sgprs(struct nir_to_llvm_context *ctx,
                /* 1 for push constants and dynamic descriptors */
                add_array_arg(args, type, &ctx->push_constants);
        }
+
+       if (!((stage == MESA_SHADER_VERTEX) || (has_previous_stage && 
previous_stage == MESA_SHADER_VERTEX)))
+               declare_inline_push_consts(ctx, stage, user_sgpr_info, args);
  }
static void
@@ -651,6 +694,7 @@ declare_vs_specific_input_sgprs(struct nir_to_llvm_context 
*ctx,
                                gl_shader_stage stage,
                                bool has_previous_stage,
                                gl_shader_stage previous_stage,
+                               const struct user_sgpr_info *user_sgpr_info,
                                struct arg_info *args)
  {
        if (!ctx->is_gs_copy_shader &&
@@ -660,6 +704,7 @@ declare_vs_specific_input_sgprs(struct nir_to_llvm_context 
*ctx,
                        add_arg(args, ARG_SGPR, const_array(ctx->ac.v4i32, 16),
                                &ctx->vertex_buffers);
                }
+               declare_inline_push_consts(ctx, stage, user_sgpr_info, args);
                add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.base_vertex);
                add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.start_instance);
                if (ctx->shader_info->info.vs.needs_draw_id) {
@@ -693,6 +738,16 @@ declare_tes_input_vgprs(struct nir_to_llvm_context *ctx, 
struct arg_info *args)
        add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.tes_patch_id);
  }
+static void
+set_inline_pushconst_locs(struct nir_to_llvm_context *ctx,
+                         const struct user_sgpr_info *user_sgpr_info,
+                         uint8_t *user_sgpr_idx)
+{
+       ctx->shader_info->user_sgprs_locs.push_const_base = 
user_sgpr_info->base_inline_push_consts;
+       for (unsigned i = 0; i < user_sgpr_info->num_inline_push_consts; i++)
+               
set_loc(&ctx->shader_info->user_sgprs_locs.inline_push_consts[i], 
user_sgpr_idx, 1, 0);
+}
+
  static void
  set_global_input_locs(struct nir_to_llvm_context *ctx, gl_shader_stage stage,
                      bool has_previous_stage, gl_shader_stage previous_stage,
@@ -734,12 +789,17 @@ set_global_input_locs(struct nir_to_llvm_context *ctx, 
gl_shader_stage stage,
        if (ctx->shader_info->info.loads_push_constants) {
                set_loc_shader(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx, 2);
        }
+
+
+       if (!((stage == MESA_SHADER_VERTEX) || (has_previous_stage && 
previous_stage == MESA_SHADER_VERTEX)))
+               set_inline_pushconst_locs(ctx, user_sgpr_info, user_sgpr_idx);
  }
static void
  set_vs_specific_input_locs(struct nir_to_llvm_context *ctx,
                           gl_shader_stage stage, bool has_previous_stage,
                           gl_shader_stage previous_stage,
+                          const struct user_sgpr_info *user_sgpr_info,
                           uint8_t *user_sgpr_idx)
  {
        if (!ctx->is_gs_copy_shader &&
@@ -750,6 +810,7 @@ set_vs_specific_input_locs(struct nir_to_llvm_context *ctx,
                                       user_sgpr_idx, 2);
                }
+ set_inline_pushconst_locs(ctx, user_sgpr_info, user_sgpr_idx);
                unsigned vs_num = 2;
                if (ctx->shader_info->info.vs.needs_draw_id)
                        vs_num++;
@@ -805,7 +866,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
                                           previous_stage, &user_sgpr_info,
                                           &args, &desc_sets);
                declare_vs_specific_input_sgprs(ctx, stage, has_previous_stage,
-                                               previous_stage, &args);
+                                               previous_stage, &user_sgpr_info, 
&args);
if (ctx->shader_info->info.needs_multiview_view_index || (!ctx->options->key.vs.as_es && !ctx->options->key.vs.as_ls && ctx->options->key.has_multiview_view_index))
                        add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->view_index);
@@ -838,7 +899,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
                                                   &desc_sets);
                        declare_vs_specific_input_sgprs(ctx, stage,
                                                        has_previous_stage,
-                                                       previous_stage, &args);
+                                                       previous_stage, 
&user_sgpr_info, &args);
add_arg(&args, ARG_SGPR, ctx->ac.i32,
                                &ctx->ls_out_layout);
@@ -934,7 +995,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
                        } else {
                                declare_vs_specific_input_sgprs(ctx, stage,
                                                                
has_previous_stage,
-                                                               previous_stage,
+                                                               previous_stage, 
&user_sgpr_info,
                                                                &args);
                        }
@@ -1076,7 +1137,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
                break;
        case MESA_SHADER_VERTEX:
                set_vs_specific_input_locs(ctx, stage, has_previous_stage,
-                                          previous_stage, &user_sgpr_idx);
+                                          previous_stage, &user_sgpr_info, 
&user_sgpr_idx);
                if (ctx->view_index)
                        set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 
1);
                if (ctx->options->key.vs.as_ls) {
@@ -1088,7 +1149,7 @@ static void create_function(struct nir_to_llvm_context 
*ctx,
                break;
        case MESA_SHADER_TESS_CTRL:
                set_vs_specific_input_locs(ctx, stage, has_previous_stage,
-                                          previous_stage, &user_sgpr_idx);
+                                          previous_stage, &user_sgpr_info, 
&user_sgpr_idx);
                if (has_previous_stage)
                        set_loc_shader(ctx, AC_UD_VS_LS_TCS_IN_LAYOUT,
                                       &user_sgpr_idx, 1);
@@ -1108,6 +1169,7 @@ static void create_function(struct nir_to_llvm_context 
*ctx,
                                set_vs_specific_input_locs(ctx, stage,
                                                           has_previous_stage,
                                                           previous_stage,
+                                                          &user_sgpr_info,
                                                           &user_sgpr_idx);
                        else
                                set_loc_shader(ctx, AC_UD_TES_OFFCHIP_LAYOUT,
@@ -2357,9 +2419,24 @@ static LLVMValueRef visit_load_push_constant(struct 
nir_to_llvm_context *ctx,
                                               nir_intrinsic_instr *instr)
  {
        LLVMValueRef ptr, addr;
+       LLVMValueRef src0 = get_src(ctx->nir, instr->src[0]);
+       unsigned index = nir_intrinsic_base(instr);
+
+       if (LLVMIsConstant(src0)) {
+               unsigned array_index = index;
+               array_index += LLVMConstIntGetZExtValue(src0);
+               array_index /= 4;
+
+               uint32_t bits = ((1 << instr->num_components) - 1) << 
array_index;
+
+               if ((bits & ctx->shader_info->inline_push_const_mask) == bits) {
+                       array_index -= 
ctx->shader_info->user_sgprs_locs.push_const_base;
+                       return ac_build_gather_values(&ctx->ac, 
&ctx->inline_push_consts[array_index], instr->num_components);
+               }
+       }
- addr = LLVMConstInt(ctx->ac.i32, nir_intrinsic_base(instr), 0);
-       addr = LLVMBuildAdd(ctx->builder, addr, get_src(ctx->nir, instr->src[0]), 
"");
+       addr = LLVMConstInt(ctx->ac.i32, index, 0);
+       addr = LLVMBuildAdd(ctx->builder, addr, src0, "");
ptr = ac_build_gep0(&ctx->ac, ctx->push_constants, addr);
        ptr = cast_ptr(ctx, ptr, get_def_type(ctx->nir, &instr->dest.ssa));
diff --git a/src/amd/common/ac_nir_to_llvm.h b/src/amd/common/ac_nir_to_llvm.h
index b3ad0a0985..9f9230d3e6 100644
--- a/src/amd/common/ac_nir_to_llvm.h
+++ b/src/amd/common/ac_nir_to_llvm.h
@@ -127,10 +127,13 @@ enum ac_ud_index {
// Match MAX_SETS from radv_descriptor_set.h
  #define AC_UD_MAX_SETS MAX_SETS
+#define AC_UD_MAX_INLINE_PUSH_CONST 8
struct ac_userdata_locations {
        struct ac_userdata_info descriptor_sets[AC_UD_MAX_SETS];
        struct ac_userdata_info shader_data[AC_UD_MAX_UD];
+       struct ac_userdata_info inline_push_consts[AC_UD_MAX_INLINE_PUSH_CONST];
+       uint8_t push_const_base;
  };
struct ac_vs_output_info {
@@ -156,6 +159,7 @@ struct ac_shader_variant_info {
        unsigned num_user_sgprs;
        unsigned num_input_sgprs;
        unsigned num_input_vgprs;
+       uint32_t inline_push_const_mask;
        bool need_indirect_descriptor_sets;
        struct {
                struct {
diff --git a/src/amd/common/ac_shader_info.c b/src/amd/common/ac_shader_info.c
index 18fa9e1c94..fbb46684ae 100644
--- a/src/amd/common/ac_shader_info.c
+++ b/src/amd/common/ac_shader_info.c
@@ -179,9 +179,10 @@ ac_nir_shader_info_pass(struct nir_shader *nir,
  {
        struct nir_function *func = (struct nir_function 
*)exec_list_get_head(&nir->functions);
-
-       if (options->layout->dynamic_offset_count)
+       if (options->layout->dynamic_offset_count) {
                info->loads_push_constants = true;
+               info->loads_dynamic_offsets = true;
+       }
nir_foreach_variable(variable, &nir->inputs)
                gather_info_input_decl(nir, options, variable, info);
diff --git a/src/amd/common/ac_shader_info.h b/src/amd/common/ac_shader_info.h
index e35cde0ca9..e8ea33f2e3 100644
--- a/src/amd/common/ac_shader_info.h
+++ b/src/amd/common/ac_shader_info.h
@@ -32,6 +32,7 @@ struct ac_shader_info {
        uint8_t min_push_constant_used;
        uint8_t max_push_constant_used;
        bool has_indirect_push_constants;
+       bool loads_dynamic_offsets;
        bool loads_push_constants;
        bool needs_multiview_view_index;
        bool uses_invocation_id;
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 60f19fb12b..17306eeaf8 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -1807,6 +1807,27 @@ radv_flush_descriptors(struct radv_cmd_buffer 
*cmd_buffer,
        assert(cmd_buffer->cs->cdw <= cdw_max);
  }
+static struct ac_userdata_info *
+radv_lookup_push_const_sgpr(struct radv_shader_variant *shader,
+                           int idx)
+{
+       idx -= shader->info.user_sgprs_locs.push_const_base;
+       return &shader->info.user_sgprs_locs.inline_push_consts[idx];
+}
+
+static void
+radv_emit_inline_pushconsts(struct radv_cmd_buffer *cmd_buffer,
+                           struct radv_shader_variant *shader,
+                           unsigned base_reg,
+                           int idx, int count, uint32_t *values)
+{
+       struct ac_userdata_info *loc = radv_lookup_push_const_sgpr(shader, idx);
+       assert (loc->sgpr_idx == -1);
+       assert (!loc->indirect);
+       radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 
count);
+       radeon_emit_array(cmd_buffer->cs, values, count);
+}
+
  static void
  radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
                     struct radv_pipeline *pipeline,
@@ -1816,36 +1837,55 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
        unsigned offset;
        void *ptr;
        uint64_t va;
+       bool need_push_constants = false;
stages &= cmd_buffer->push_constant_stages;
        if (!stages ||
            (!layout->push_constant_size && !layout->dynamic_offset_count))
                return;
- if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
-                                         16 * layout->dynamic_offset_count,
-                                         256, &offset, &ptr))
-               return;
+       radv_foreach_stage(stage, stages) {
+               if (!pipeline->shaders[stage])
+                       continue;
+
+               need_push_constants |= 
pipeline->shaders[stage]->info.info.loads_push_constants;
+               need_push_constants |= 
pipeline->shaders[stage]->info.info.loads_dynamic_offsets;
- memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
-       if (layout->dynamic_offset_count) {
-               memcpy((char*)ptr + layout->push_constant_size, 
cmd_buffer->dynamic_buffers,
-                      16 * layout->dynamic_offset_count);
+               uint32_t mask = 
pipeline->shaders[stage]->info.inline_push_const_mask;
+               uint32_t base_reg = pipeline->user_data_0[stage];
+               while (mask) {
+                       int start, count;
+                       u_bit_scan_consecutive_range(&mask, &start, &count);
+                       radv_emit_inline_pushconsts(cmd_buffer, 
pipeline->shaders[stage], base_reg,
+                                                   start, count, (uint32_t 
*)&cmd_buffer->push_constants[start * 4]);
+               }
        }
- va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
-       va += offset;
+       if (need_push_constants) {
+               if (!radv_cmd_buffer_upload_alloc(cmd_buffer, 
layout->push_constant_size +
+                                                 16 * 
layout->dynamic_offset_count,
+                                                 256, &offset, &ptr))
+                       return;
- MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
-                                                          cmd_buffer->cs, 
MESA_SHADER_STAGES * 4);
+               memcpy(ptr, cmd_buffer->push_constants, 
layout->push_constant_size);
+               if (layout->dynamic_offset_count) {
+                       memcpy((char*)ptr + layout->push_constant_size, 
cmd_buffer->dynamic_buffers,
+                              16 * layout->dynamic_offset_count);
+               }
- radv_foreach_stage(stage, stages) {
-               if (pipeline->shaders[stage]) {
-                       radv_emit_userdata_address(cmd_buffer, pipeline, stage,
-                                                  AC_UD_PUSH_CONSTANTS, va);
+               va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
+               va += offset;
+
+               MAYBE_UNUSED unsigned cdw_max = 
radeon_check_space(cmd_buffer->device->ws,
+                                                                  
cmd_buffer->cs, MESA_SHADER_STAGES * 4);
+
+               radv_foreach_stage(stage, stages) {
+                       if (pipeline->shaders[stage]) {
+                               radv_emit_userdata_address(cmd_buffer, 
pipeline, stage,
+                                                          
AC_UD_PUSH_CONSTANTS, va);
+                       }
                }
        }
-
        cmd_buffer->push_constant_stages &= ~stages;
        assert(cmd_buffer->cs->cdw <= cdw_max);
  }

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to