Looks like it's working fine here now. One comment inline below. On 12 January 2018 at 02:43, Dave Airlie <airl...@gmail.com> wrote:
> From: Dave Airlie <airl...@redhat.com>
>
> Instead of putting the push constants into the upload buffer,
> if we have space in the sgprs we can upload the per-stage
> constants into the shaders directly.
>
> This saves a few reads from memory in the meta shaders,
> we should also be able to inline other objects like
> descriptors.
>
> v2: fixup 16->available_sgprs (Samuel)
>     fixup dynamic offsets. (Alex)
>     bump to 12.
>     handle push consts > 32 better, avoid F1 2017 crash
>
> TODO: proper vega support (Samuel)
>
> Signed-off-by: Dave Airlie <airl...@redhat.com>
> ---
>  src/amd/common/ac_nir_to_llvm.c  | 102 ++++++++++++++++++++++++++++++++++----
>  src/amd/common/ac_nir_to_llvm.h  |   5 ++
>  src/amd/common/ac_shader_info.c  |   5 +-
>  src/amd/common/ac_shader_info.h  |   1 +
>  src/amd/vulkan/radv_cmd_buffer.c |  75 +++++++++++++++++++++-------
>  5 files changed, 159 insertions(+), 29 deletions(-)
>
> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> index 6c578de3aca..00ad76a82f7 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -92,6 +92,7 @@ struct nir_to_llvm_context {
>  	LLVMValueRef descriptor_sets[AC_UD_MAX_SETS];
>  	LLVMValueRef ring_offsets;
>  	LLVMValueRef push_constants;
> +	LLVMValueRef inline_push_consts[AC_UD_MAX_INLINE_PUSH_CONST];
>  	LLVMValueRef view_index;
>  	LLVMValueRef num_work_groups;
>  	LLVMValueRef workgroup_ids[3];
> @@ -243,7 +244,7 @@ static void set_llvm_calling_convention(LLVMValueRef func,
>  	LLVMSetFunctionCallConv(func, calling_conv);
>  }
>
> -#define MAX_ARGS 23
> +#define MAX_ARGS 32
>  struct arg_info {
>  	LLVMTypeRef types[MAX_ARGS];
>  	LLVMValueRef *assign[MAX_ARGS];
> @@ -538,6 +539,8 @@ struct user_sgpr_info {
>  	bool need_ring_offsets;
>  	uint8_t sgpr_count;
>  	bool indirect_all_descriptor_sets;
> +	uint8_t base_inline_push_consts;
> +	uint8_t num_inline_push_consts;
>  };
>
>  static void allocate_user_sgprs(struct nir_to_llvm_context *ctx,
> @@ -609,8 +612,49 @@ static void allocate_user_sgprs(struct nir_to_llvm_context *ctx,
>  	} else {
>  		user_sgpr_info->sgpr_count += util_bitcount(ctx->shader_info->info.desc_set_used_mask) * 2;
>  	}
> +
> +	if (ctx->shader_info->info.loads_push_constants) {
> +		uint32_t remaining_sgprs = available_sgprs - user_sgpr_info->sgpr_count;
> +		if (!ctx->shader_info->info.has_indirect_push_constants &&
> +		    !ctx->shader_info->info.loads_dynamic_offsets)
> +			remaining_sgprs += 2;
> +
> +		if (ctx->options->layout->push_constant_size) {
> +			uint8_t num_32bit_push_consts = (ctx->shader_info->info.max_push_constant_used -
> +							 ctx->shader_info->info.min_push_constant_used) / 4;
> +
> +			if ((ctx->shader_info->info.min_push_constant_used / 4) <= 63 &&
> +			    (ctx->shader_info->info.max_push_constant_used / 4) <= 63) {
> +				user_sgpr_info->base_inline_push_consts = ctx->shader_info->info.min_push_constant_used / 4;
> +
> +				if (num_32bit_push_consts < remaining_sgprs) {
> +					user_sgpr_info->num_inline_push_consts = num_32bit_push_consts;
> +					if (!ctx->shader_info->info.has_indirect_push_constants)
> +						ctx->shader_info->info.loads_push_constants = false;
> +				} else {
> +					user_sgpr_info->num_inline_push_consts = remaining_sgprs;
> +				}
> +
> +				if (user_sgpr_info->num_inline_push_consts > AC_UD_MAX_INLINE_PUSH_CONST)
> +					user_sgpr_info->num_inline_push_consts = AC_UD_MAX_INLINE_PUSH_CONST;

This is done after possibly setting loads_push_constants to false; if this
happens, shouldn't that still be true?

With that fixed, for the series:

Reviewed-by: Alex Smith <asm...@feralinteractive.com>

Thanks,
Alex

> +			}
> +		}
> +	}
>  }
>
> +static void
> +declare_inline_push_consts(struct nir_to_llvm_context *ctx,
> +			   gl_shader_stage stage,
> +			   const struct user_sgpr_info *user_sgpr_info,
> +			   struct arg_info *args)
> +{
> +	ctx->shader_info->inline_push_const_mask = (1ULL << user_sgpr_info->num_inline_push_consts) - 1;
> +	ctx->shader_info->inline_push_const_base = user_sgpr_info->base_inline_push_consts;
> +
> +	for (unsigned i = 0; i < user_sgpr_info->num_inline_push_consts; i++)
> +		add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->inline_push_consts[i]);
> +
> +}
>  static void
>  declare_global_input_sgprs(struct nir_to_llvm_context *ctx,
>  			   gl_shader_stage stage,
> @@ -640,10 +684,14 @@ declare_global_input_sgprs(struct nir_to_llvm_context *ctx,
>  		add_array_arg(args, const_array(type, 32), desc_sets);
>  	}
>
> -	if (ctx->shader_info->info.loads_push_constants) {
> +	if (ctx->shader_info->info.loads_push_constants ||
> +	    ctx->shader_info->info.loads_dynamic_offsets) {
>  		/* 1 for push constants and dynamic descriptors */
>  		add_array_arg(args, type, &ctx->push_constants);
>  	}
> +
> +	if (!((stage == MESA_SHADER_VERTEX) || (has_previous_stage && previous_stage == MESA_SHADER_VERTEX)))
> +		declare_inline_push_consts(ctx, stage, user_sgpr_info, args);
>  }
>
>  static void
> @@ -651,6 +699,7 @@ declare_vs_specific_input_sgprs(struct nir_to_llvm_context *ctx,
>  				gl_shader_stage stage,
>  				bool has_previous_stage,
>  				gl_shader_stage previous_stage,
> +				const struct user_sgpr_info *user_sgpr_info,
>  				struct arg_info *args)
>  {
>  	if (!ctx->is_gs_copy_shader &&
> @@ -660,6 +709,7 @@ declare_vs_specific_input_sgprs(struct nir_to_llvm_context *ctx,
>  		add_arg(args, ARG_SGPR, const_array(ctx->ac.v4i32, 16),
>  			&ctx->vertex_buffers);
>  	}
> +	declare_inline_push_consts(ctx, stage, user_sgpr_info, args);
>  	add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.base_vertex);
>  	add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.start_instance);
>  	if (ctx->shader_info->info.vs.needs_draw_id) {
> @@ -693,6 +743,16 @@ declare_tes_input_vgprs(struct nir_to_llvm_context *ctx, struct arg_info *args)
>  	add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.tes_patch_id);
>  }
>
> +static void
> +set_inline_pushconst_locs(struct nir_to_llvm_context *ctx,
> +			  const struct user_sgpr_info *user_sgpr_info,
> +			  uint8_t *user_sgpr_idx)
> +{
> +	ctx->shader_info->user_sgprs_locs.push_const_base = user_sgpr_info->base_inline_push_consts;
> +	for (unsigned i = 0; i < user_sgpr_info->num_inline_push_consts; i++)
> +		set_loc(&ctx->shader_info->user_sgprs_locs.inline_push_consts[i], user_sgpr_idx, 1, 0);
> +}
> +
>  static void
>  set_global_input_locs(struct nir_to_llvm_context *ctx, gl_shader_stage stage,
>  		      bool has_previous_stage, gl_shader_stage previous_stage,
> @@ -731,15 +791,21 @@ set_global_input_locs(struct nir_to_llvm_context *ctx, gl_shader_stage stage,
>  		ctx->shader_info->need_indirect_descriptor_sets = true;
>  	}
>
> -	if (ctx->shader_info->info.loads_push_constants) {
> +	if (ctx->shader_info->info.loads_push_constants ||
> +	    ctx->shader_info->info.loads_dynamic_offsets) {
>  		set_loc_shader(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx, 2);
>  	}
> +
> +
> +	if (!((stage == MESA_SHADER_VERTEX) || (has_previous_stage && previous_stage == MESA_SHADER_VERTEX)))
> +		set_inline_pushconst_locs(ctx, user_sgpr_info, user_sgpr_idx);
>  }
>
>  static void
>  set_vs_specific_input_locs(struct nir_to_llvm_context *ctx,
>  			   gl_shader_stage stage, bool has_previous_stage,
>  			   gl_shader_stage previous_stage,
> +			   const struct user_sgpr_info *user_sgpr_info,
>  			   uint8_t *user_sgpr_idx)
>  {
>  	if (!ctx->is_gs_copy_shader &&
> @@ -750,6 +816,7 @@ set_vs_specific_input_locs(struct nir_to_llvm_context *ctx,
>  			       user_sgpr_idx, 2);
>  	}
>
> +	set_inline_pushconst_locs(ctx, user_sgpr_info, user_sgpr_idx);
>  	unsigned vs_num = 2;
>  	if (ctx->shader_info->info.vs.needs_draw_id)
>  		vs_num++;
> @@ -805,7 +872,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
>  					   previous_stage, &user_sgpr_info,
>  					   &args, &desc_sets);
>  		declare_vs_specific_input_sgprs(ctx, stage, has_previous_stage,
> -						previous_stage, &args);
> +						previous_stage, &user_sgpr_info, &args);
>
>  		if (ctx->shader_info->info.needs_multiview_view_index || (!ctx->options->key.vs.as_es && !ctx->options->key.vs.as_ls && ctx->options->key.has_multiview_view_index))
>  			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->view_index);
> @@ -838,7 +905,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
>  						   &desc_sets);
>  			declare_vs_specific_input_sgprs(ctx, stage, has_previous_stage,
> -							previous_stage, &args);
> +							previous_stage, &user_sgpr_info, &args);
>
>  			add_arg(&args, ARG_SGPR, ctx->ac.i32,
>  				&ctx->ls_out_layout);
> @@ -934,7 +1001,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
>  			} else {
>  				declare_vs_specific_input_sgprs(ctx, stage,
>  								has_previous_stage,
> -								previous_stage,
> +								previous_stage, &user_sgpr_info,
>  								&args);
>  			}
>
> @@ -1076,7 +1143,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
>  		break;
>  	case MESA_SHADER_VERTEX:
>  		set_vs_specific_input_locs(ctx, stage, has_previous_stage,
> -					   previous_stage, &user_sgpr_idx);
> +					   previous_stage, &user_sgpr_info, &user_sgpr_idx);
>  		if (ctx->view_index)
>  			set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
>  		if (ctx->options->key.vs.as_ls) {
> @@ -1088,7 +1155,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
>  		break;
>  	case MESA_SHADER_TESS_CTRL:
>  		set_vs_specific_input_locs(ctx, stage, has_previous_stage,
> -					   previous_stage, &user_sgpr_idx);
> +					   previous_stage, &user_sgpr_info, &user_sgpr_idx);
>  		if (has_previous_stage)
>  			set_loc_shader(ctx, AC_UD_VS_LS_TCS_IN_LAYOUT,
>  				       &user_sgpr_idx, 1);
> @@ -1108,6 +1175,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
>  			set_vs_specific_input_locs(ctx, stage,
>  						   has_previous_stage,
>  						   previous_stage,
> +						   &user_sgpr_info,
>  						   &user_sgpr_idx);
>  		else
>  			set_loc_shader(ctx, AC_UD_TES_OFFCHIP_LAYOUT,
> @@ -2371,9 +2439,23 @@ static LLVMValueRef visit_load_push_constant(struct nir_to_llvm_context *ctx,
>  					     nir_intrinsic_instr *instr)
>  {
>  	LLVMValueRef ptr, addr;
> +	LLVMValueRef src0 = get_src(ctx->nir, instr->src[0]);
> +	unsigned index = nir_intrinsic_base(instr);
> +
> +	if (LLVMIsConstant(src0)) {
> +		unsigned array_index = index;
> +		array_index += LLVMConstIntGetZExtValue(src0);
> +		array_index /= 4;
> +
> +		array_index -= ctx->shader_info->inline_push_const_base;
> +		uint64_t bits = (((1ULL << instr->num_components) - 1) << array_index);
> +		if ((bits & ctx->shader_info->inline_push_const_mask) == bits) {
> +			return ac_build_gather_values(&ctx->ac, &ctx->inline_push_consts[array_index], instr->num_components);
> +		}
> +	}
>
> -	addr = LLVMConstInt(ctx->ac.i32, nir_intrinsic_base(instr), 0);
> -	addr = LLVMBuildAdd(ctx->builder, addr, get_src(ctx->nir, instr->src[0]), "");
> +	addr = LLVMConstInt(ctx->ac.i32, index, 0);
> +	addr = LLVMBuildAdd(ctx->builder, addr, src0, "");
>
>  	ptr = ac_build_gep0(&ctx->ac, ctx->push_constants, addr);
>  	ptr = cast_ptr(ctx, ptr, get_def_type(ctx->nir, &instr->dest.ssa));
> diff --git a/src/amd/common/ac_nir_to_llvm.h b/src/amd/common/ac_nir_to_llvm.h
> index b3ad0a09857..22330fdfbc4 100644
> --- a/src/amd/common/ac_nir_to_llvm.h
> +++ b/src/amd/common/ac_nir_to_llvm.h
> @@ -127,10 +127,13 @@ enum ac_ud_index {
>
>  // Match MAX_SETS from radv_descriptor_set.h
>  #define AC_UD_MAX_SETS MAX_SETS
> +#define AC_UD_MAX_INLINE_PUSH_CONST 12
>
>  struct ac_userdata_locations {
>  	struct ac_userdata_info descriptor_sets[AC_UD_MAX_SETS];
>  	struct ac_userdata_info shader_data[AC_UD_MAX_UD];
> +	struct ac_userdata_info inline_push_consts[AC_UD_MAX_INLINE_PUSH_CONST];
> +	uint8_t push_const_base;
>  };
>
>  struct ac_vs_output_info {
> @@ -156,6 +159,8 @@ struct ac_shader_variant_info {
>  	unsigned num_user_sgprs;
>  	unsigned num_input_sgprs;
>  	unsigned num_input_vgprs;
> +	uint64_t inline_push_const_mask;
> +	uint32_t inline_push_const_base;
>  	bool need_indirect_descriptor_sets;
>  	struct {
>  		struct {
> diff --git a/src/amd/common/ac_shader_info.c b/src/amd/common/ac_shader_info.c
> index 18fa9e1c94c..fbb46684ae3 100644
> --- a/src/amd/common/ac_shader_info.c
> +++ b/src/amd/common/ac_shader_info.c
> @@ -179,9 +179,10 @@ ac_nir_shader_info_pass(struct nir_shader *nir,
>  {
>  	struct nir_function *func = (struct nir_function *)exec_list_get_head(&nir->functions);
>
> -
> -	if (options->layout->dynamic_offset_count)
> +	if (options->layout->dynamic_offset_count) {
>  		info->loads_push_constants = true;
> +		info->loads_dynamic_offsets = true;
> +	}
>
>  	nir_foreach_variable(variable, &nir->inputs)
>  		gather_info_input_decl(nir, options, variable, info);
> diff --git a/src/amd/common/ac_shader_info.h b/src/amd/common/ac_shader_info.h
> index e35cde0ca97..e8ea33f2e3a 100644
> --- a/src/amd/common/ac_shader_info.h
> +++ b/src/amd/common/ac_shader_info.h
> @@ -32,6 +32,7 @@ struct ac_shader_info {
>  	uint8_t min_push_constant_used;
>  	uint8_t max_push_constant_used;
>  	bool has_indirect_push_constants;
> +	bool loads_dynamic_offsets;
>  	bool loads_push_constants;
>  	bool needs_multiview_view_index;
>  	bool uses_invocation_id;
> diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
> index d870a9bedb3..e1953a3a095 100644
> --- a/src/amd/vulkan/radv_cmd_buffer.c
> +++ b/src/amd/vulkan/radv_cmd_buffer.c
> @@ -1807,6 +1807,27 @@ radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
>  	assert(cmd_buffer->cs->cdw <= cdw_max);
>  }
>
> +static struct ac_userdata_info *
> +radv_lookup_push_const_sgpr(struct radv_shader_variant *shader,
> +			    int idx)
> +{
> +	idx -= shader->info.user_sgprs_locs.push_const_base;
> +	return &shader->info.user_sgprs_locs.inline_push_consts[idx];
> +}
> +
> +static void
> +radv_emit_inline_pushconsts(struct radv_cmd_buffer *cmd_buffer,
> +			    struct radv_shader_variant *shader,
> +			    unsigned base_reg,
> +			    int idx, int count, uint32_t *values)
> +{
> +	struct ac_userdata_info *loc = radv_lookup_push_const_sgpr(shader, idx);
> +	assert (loc->sgpr_idx == -1);
> +	assert (!loc->indirect);
> +	radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, count);
> +	radeon_emit_array(cmd_buffer->cs, values, count);
> +}
> +
>  static void
>  radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
>  		     struct radv_pipeline *pipeline,
> @@ -1816,36 +1837,56 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
>  	unsigned offset;
>  	void *ptr;
>  	uint64_t va;
> +	bool need_push_constants = false;
>
>  	stages &= cmd_buffer->push_constant_stages;
>  	if (!stages ||
>  	    (!layout->push_constant_size && !layout->dynamic_offset_count))
>  		return;
>
> -	if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
> -					  16 * layout->dynamic_offset_count,
> -					  256, &offset, &ptr))
> -		return;
> +	radv_foreach_stage(stage, stages) {
> +		if (!pipeline->shaders[stage])
> +			continue;
> +
> +		need_push_constants |= pipeline->shaders[stage]->info.info.loads_push_constants;
> +		need_push_constants |= pipeline->shaders[stage]->info.info.loads_dynamic_offsets;
>
> -	memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
> -	if (layout->dynamic_offset_count) {
> -		memcpy((char*)ptr + layout->push_constant_size, cmd_buffer->dynamic_buffers,
> -		       16 * layout->dynamic_offset_count);
> +		uint32_t mask = pipeline->shaders[stage]->info.inline_push_const_mask;
> +		uint32_t base_reg = pipeline->user_data_0[stage];
> +		while (mask) {
> +			int start, count;
> +			u_bit_scan_consecutive_range(&mask, &start, &count);
> +			start += pipeline->shaders[stage]->info.inline_push_const_base;
> +			radv_emit_inline_pushconsts(cmd_buffer, pipeline->shaders[stage], base_reg,
> +						    start, count, (uint32_t *)&cmd_buffer->push_constants[start * 4]);
> +		}
>  	}
>
> -	va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
> -	va += offset;
> +	if (need_push_constants) {
> +		if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
> +						  16 * layout->dynamic_offset_count,
> +						  256, &offset, &ptr))
> +			return;
>
> -	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
> -							   cmd_buffer->cs, MESA_SHADER_STAGES * 4);
> +		memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
> +		if (layout->dynamic_offset_count) {
> +			memcpy((char*)ptr + layout->push_constant_size, cmd_buffer->dynamic_buffers,
> +			       16 * layout->dynamic_offset_count);
> +		}
>
> -	radv_foreach_stage(stage, stages) {
> -		if (pipeline->shaders[stage]) {
> -			radv_emit_userdata_address(cmd_buffer, pipeline, stage,
> -						   AC_UD_PUSH_CONSTANTS, va);
> +		va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
> +		va += offset;
> +
> +		MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
> +								   cmd_buffer->cs, MESA_SHADER_STAGES * 4);
> +
> +		radv_foreach_stage(stage, stages) {
> +			if (pipeline->shaders[stage]) {
> +				radv_emit_userdata_address(cmd_buffer, pipeline, stage,
> +							   AC_UD_PUSH_CONSTANTS, va);
> +			}
> +		}
>  	}
> -
>  	cmd_buffer->push_constant_stages &= ~stages;
>  	assert(cmd_buffer->cs->cdw <= cdw_max);
>  }
> --
> 2.14.3
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev