On Thu, Mar 8, 2018 at 9:08 AM, Samuel Pitoiset <samuel.pitoi...@gmail.com> wrote:
> RadeonSI does something similar; the VGPR decrease is a win,
> but I am not sure if we really want to implement that.
>
> Polaris10:
> Totals from affected shaders:
> SGPRS: 116376 -> 116768 (0.34 %)
> VGPRS: 76556 -> 74868 (-2.20 %)
> Spilled SGPRs: 10347 -> 10466 (1.15 %)
> Code Size: 5555072 -> 5569024 (0.25 %) bytes
> Max Waves: 9854 -> 9951 (0.98 %)
>
> Signed-off-by: Samuel Pitoiset <samuel.pitoi...@gmail.com>
> ---
>  src/amd/common/ac_nir_to_llvm.c | 118 ++++++++++++++++++++++++++++++---------
>  src/amd/common/ac_shader_abi.h  |   7 +++
>  2 files changed, 98 insertions(+), 27 deletions(-)
>
> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> index 644c85e2eb..eb0935972d 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -3131,6 +3131,7 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
>                                     nir_intrinsic_instr *instr)
>  {
>         LLVMValueRef values[8];
> +       int location = instr->variables[0]->var->data.location;
>         int idx = instr->variables[0]->var->data.driver_location;
>         int ve = instr->dest.ssa.num_components;
>         unsigned comp = instr->variables[0]->var->data.location_frac;
> @@ -3167,6 +3168,19 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
>                                              instr->num_components, vertex_index, const_index, type);
>                 }
>
> +               LLVMValueRef inputs[4];
> +
> +               if (ctx->stage == MESA_SHADER_FRAGMENT) {
> +                       ctx->abi->load_fs_inputs(ctx->abi, location,
> +                                                indir_index, const_index,
> +                                                stride, inputs);

load_fs_inputs is NULL for radeonsi. Are you sure that radeonsi doesn't
get here?
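If it can, one defensive option would be to make the callback optional and
fall back to the preloaded-inputs path from the else branch below. An
untested sketch, assuming a driver without the callback still populates
abi->inputs up front; the NULL check is the only new part:

    if (ctx->stage == MESA_SHADER_FRAGMENT && ctx->abi->load_fs_inputs) {
            ctx->abi->load_fs_inputs(ctx->abi, location,
                                     indir_index, const_index,
                                     stride, inputs);
    } else {
            unsigned index = idx +
                    (indir_index ? 0 : const_index * stride);

            /* No per-use callback: read the values preloaded at the
             * beginning of the shader. */
            memcpy(inputs, &ctx->abi->inputs[index], sizeof(inputs));
    }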
Marek

> +               } else {
> +                       unsigned index = idx +
> +                               (indir_index ? 0 : const_index * stride);
> +
> +                       memcpy(inputs, &ctx->abi->inputs[index],
> +                              sizeof(inputs));
> +               }
> +
>                 for (unsigned chan = comp; chan < ve + comp; chan++) {
>                         if (indir_index) {
>                                 unsigned count = glsl_count_attribute_slots(
> @@ -3174,14 +3188,15 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
>                                         ctx->stage == MESA_SHADER_VERTEX);
>                                 count -= chan / 4;
>                                 LLVMValueRef tmp_vec = ac_build_gather_values_extended(
> -                                       &ctx->ac, ctx->abi->inputs + idx + chan, count,
> +                                       &ctx->ac, inputs + chan, count,
>                                         stride, false, true);
>
>                                 values[chan] = LLVMBuildExtractElement(ctx->ac.builder,
>                                                                        tmp_vec,
>                                                                        indir_index, "");
> -                       } else
> -                               values[chan] = ctx->abi->inputs[idx + chan + const_index * stride];
> +                       } else {
> +                               values[chan] = inputs[chan];
> +                       }
>                 }
>                 break;
>         case nir_var_local:
> @@ -5556,45 +5571,93 @@ prepare_interp_optimize(struct radv_shader_context *ctx,
>         }
>  }
>
> +static unsigned
> +get_input_hw_index(struct radv_shader_context *ctx, unsigned idx)
> +{
> +       struct ac_shader_info *info = &ctx->shader_info->info;
> +       uint64_t mask = info->input_mask & ((1ull << idx) - 1);
> +
> +       mask &= ~(1ull << VARYING_SLOT_POS);
> +
> +       return util_bitcount64(mask);
> +}
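Side note for other reviewers: get_input_hw_index() is a popcount of the
enabled input slots strictly below idx, with VARYING_SLOT_POS masked out
because the position comes in dedicated VGPRs rather than through an
interpolated attribute slot. A standalone sketch of the same computation,
with made-up mask values (not RADV code):

    #include <stdint.h>
    #include <stdio.h>

    /* Count enabled slots strictly below idx, ignoring pos_slot. */
    static unsigned hw_index(uint64_t input_mask, unsigned pos_slot,
                             unsigned idx)
    {
            uint64_t mask = input_mask & ((1ull << idx) - 1);

            mask &= ~(1ull << pos_slot);
            return __builtin_popcountll(mask); /* ~util_bitcount64() */
    }

    int main(void)
    {
            /* Slots 0 (POS), 4 and 7 enabled. */
            uint64_t mask = (1ull << 0) | (1ull << 4) | (1ull << 7);

            /* Prints 1: for slot 7, only slot 4 counts. */
            printf("%u\n", hw_index(mask, 0, 7));
            return 0;
    }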
> +
> +/* If this is true, preload FS inputs at the beginning of shaders. Otherwise,
> + * reload them at each use. This must be true if the shader is using
> + * derivatives and KILL, because KILL can leave the WQM and then a lazy
> + * input load isn't in the WQM anymore.
> + */
> +static bool
> +radv_preload_fs_inputs(struct radv_shader_context *ctx)
> +{
> +       return ctx->shader_info->info.ps.uses_derivatives &&
> +              ctx->shader_info->info.ps.uses_kill;
> +}
> +
>  static void
> -handle_fs_inputs(struct radv_shader_context *ctx,
> -                struct nir_shader *nir)
> +radv_load_fs_inputs(struct radv_shader_context *ctx, unsigned idx,
> +                   LLVMValueRef out[4])
>  {
>         struct ac_shader_info *info = &ctx->shader_info->info;
>
> +       if (idx >= VARYING_SLOT_VAR0 ||
> +           idx == VARYING_SLOT_PNTC ||
> +           idx == VARYING_SLOT_PRIMITIVE_ID ||
> +           idx == VARYING_SLOT_LAYER) {
> +               unsigned interp_mode = info->ps.input_interp_mode[idx];
> +               unsigned interp_loc = info->ps.input_interp_loc[idx];
> +               unsigned hw_index = get_input_hw_index(ctx, idx);
> +               LLVMValueRef interp_param =
> +                       lookup_interp_param(&ctx->abi, interp_mode, interp_loc);
> +
> +               interp_fs_input(ctx, hw_index, interp_param, ctx->abi.prim_mask,
> +                               &out[0]);
> +       } else if (idx == VARYING_SLOT_POS) {
> +               for (int i = 0; i < 3; ++i)
> +                       out[i] = ctx->abi.frag_pos[i];
> +
> +               out[3] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,
> +                                      ctx->abi.frag_pos[3]);
> +       }
> +}
> +
> +static void
> +load_fs_inputs(struct ac_shader_abi *abi,
> +              unsigned location,
> +              LLVMValueRef indir_index,
> +              unsigned const_index,
> +              unsigned stride,
> +              LLVMValueRef out[4])
> +{
> +       struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
> +
> +       if (!radv_preload_fs_inputs(ctx)) {
> +               radv_load_fs_inputs(ctx, location, out);
> +       } else {
> +               unsigned index = radeon_llvm_reg_index_soa(location, 0);
> +
> +               index += (indir_index ? 0 : const_index * stride);
> +
> +               memcpy(out, &abi->inputs[index], sizeof(out[0]) * 4);
> +       }
> +}
> +
> +static void
> +handle_fs_inputs(struct radv_shader_context *ctx,
> +                struct nir_shader *nir)
> +{
>         prepare_interp_optimize(ctx, nir);
>
>         nir_foreach_variable(variable, &nir->inputs)
>                 handle_fs_input_decl(ctx, variable);
>
> -       unsigned index = 0;
> -
>         for (unsigned i = 0; i < RADEON_LLVM_MAX_INPUTS; ++i) {
> -               LLVMValueRef interp_param;
>                 LLVMValueRef *inputs = ctx->inputs + radeon_llvm_reg_index_soa(i, 0);
>
>                 if (!(ctx->shader_info->info.input_mask & (1ull << i)))
>                         continue;
>
> -               if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC ||
> -                   i == VARYING_SLOT_PRIMITIVE_ID || i == VARYING_SLOT_LAYER) {
> -                       unsigned interp_mode = info->ps.input_interp_mode[i];
> -                       unsigned interp_loc = info->ps.input_interp_loc[i];
> -
> -                       interp_param = lookup_interp_param(&ctx->abi, interp_mode,
> -                                                          interp_loc);
> -
> -                       interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask,
> -                                       inputs);
> -
> -                       ++index;
> -               } else if (i == VARYING_SLOT_POS) {
> -                       for (int i = 0; i < 3; ++i)
> -                               inputs[i] = ctx->abi.frag_pos[i];
> -
> -                       inputs[3] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,
> -                                                 ctx->abi.frag_pos[3]);
> -               }
> +               radv_load_fs_inputs(ctx, i, inputs);
>         }
>
>         if (ctx->shader_info->info.needs_multiview_view_index)
> @@ -6924,6 +6987,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
>                         ctx.abi.load_base_vertex = radv_load_base_vertex;
>                 } else if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT) {
>                         shader_info->fs.can_discard = shaders[i]->info.fs.uses_discard;
> +                       ctx.abi.load_fs_inputs = load_fs_inputs;
>                         ctx.abi.lookup_interp_param = lookup_interp_param;
>                         ctx.abi.load_sample_position = load_sample_position;
>                         ctx.abi.load_sample_mask_in = load_sample_mask_in;
> diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h
> index 901e49b1f9..8e51ce9fdd 100644
> --- a/src/amd/common/ac_shader_abi.h
> +++ b/src/amd/common/ac_shader_abi.h
> @@ -97,6 +97,13 @@ struct ac_shader_abi {
>                                unsigned const_index,
>                                LLVMTypeRef type);
>
> +       void (*load_fs_inputs)(struct ac_shader_abi *abi,
> +                              unsigned location,
> +                              LLVMValueRef indir_index,
> +                              unsigned const_index,
> +                              unsigned stride,
> +                              LLVMValueRef out[4]);
> +
>         LLVMValueRef (*load_tess_varyings)(struct ac_shader_abi *abi,
>                                            LLVMTypeRef type,
>                                            LLVMValueRef vertex_index,
> --
> 2.16.2