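From: Marek Olšák <marek.ol...@amd.com>

The LLVM compiler can CSE (common-subexpression-eliminate) interp intrinsics thanks to LLVMReadNoneAttribute, so fragment shader inputs are now loaded at each use (in emit_fetch) instead of once at declaration time.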
26011 shaders in 14651 tests Totals: SGPRS: 1146340 -> 1132676 (-1.19 %) VGPRS: 727371 -> 711730 (-2.15 %) Spilled SGPRs: 2218 -> 2078 (-6.31 %) Spilled VGPRs: 369 -> 369 (0.00 %) Scratch VGPRs: 1344 -> 1344 (0.00 %) dwords per thread Code Size: 35841268 -> 36009732 (0.47 %) bytes LDS: 767 -> 767 (0.00 %) blocks Max Waves: 222559 -> 224779 (1.00 %) Wait states: 0 -> 0 (0.00 %) v2: don't call load_input for fragment shaders in emit_declaration --- src/gallium/drivers/radeon/radeon_llvm.h | 6 ++++- .../drivers/radeon/radeon_setup_tgsi_llvm.c | 30 ++++++++++++++++++---- src/gallium/drivers/radeonsi/si_shader.c | 27 ++++++++----------- 3 files changed, 41 insertions(+), 22 deletions(-) diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h index da5b7f5..f508d32 100644 --- a/src/gallium/drivers/radeon/radeon_llvm.h +++ b/src/gallium/drivers/radeon/radeon_llvm.h @@ -23,21 +23,23 @@ * Authors: Tom Stellard <thomas.stell...@amd.com> * */ #ifndef RADEON_LLVM_H #define RADEON_LLVM_H #include <llvm-c/Core.h> #include "gallivm/lp_bld_init.h" #include "gallivm/lp_bld_tgsi.h" +#include "tgsi/tgsi_parse.h" +#define RADEON_LLVM_MAX_INPUT_SLOTS 32 #define RADEON_LLVM_MAX_INPUTS 32 * 4 #define RADEON_LLVM_MAX_OUTPUTS 32 * 4 #define RADEON_LLVM_INITIAL_CF_DEPTH 4 #define RADEON_LLVM_MAX_SYSTEM_VALUES 4 struct radeon_llvm_branch { LLVMBasicBlockRef endif_block; LLVMBasicBlockRef if_block; @@ -55,33 +57,35 @@ struct radeon_llvm_context { /*=== Front end configuration ===*/ /* Instructions that are not described by any of the TGSI opcodes. */ /** This function is responsible for initilizing the inputs array and will be * called once for each input declared in the TGSI shader. 
*/ void (*load_input)(struct radeon_llvm_context *, unsigned input_index, - const struct tgsi_full_declaration *decl); + const struct tgsi_full_declaration *decl, + LLVMValueRef out[4]); void (*load_system_value)(struct radeon_llvm_context *, unsigned index, const struct tgsi_full_declaration *decl); void (*declare_memory_region)(struct radeon_llvm_context *, const struct tgsi_full_declaration *decl); /** This array contains the input values for the shader. Typically these * values will be in the form of a target intrinsic that will inform the * backend how to load the actual inputs to the shader. */ + struct tgsi_full_declaration input_decls[RADEON_LLVM_MAX_INPUT_SLOTS]; LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS]; LLVMValueRef outputs[RADEON_LLVM_MAX_OUTPUTS][TGSI_NUM_CHANNELS]; /** This pointer is used to contain the temporary values. * The amount of temporary used in tgsi can't be bound to a max value and * thus we must allocate this array at runtime. */ LLVMValueRef *temps; unsigned temps_count; LLVMValueRef system_values[RADEON_LLVM_MAX_SYSTEM_VALUES]; diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c index 4643e6d..4fa43cd 100644 --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c @@ -439,28 +439,43 @@ LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base, bld_base->int_bld.zero); result = LLVMConstInsertElement(result, bld->immediates[reg->Register.Index][swizzle + 1], bld_base->int_bld.one); return LLVMConstBitCast(result, ctype); } else { return LLVMConstBitCast(bld->immediates[reg->Register.Index][swizzle], ctype); } } - case TGSI_FILE_INPUT: - result = ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle)]; + case TGSI_FILE_INPUT: { + unsigned index = reg->Register.Index; + LLVMValueRef input[4]; + + /* I don't think doing this for vertex shaders is beneficial. 
+ * For those, we want to make sure the VMEM loads are executed + * only once. Fragment shaders don't care much, because + * v_interp instructions are much cheaper than VMEM loads. + */ + if (ctx->soa.bld_base.info->processor == PIPE_SHADER_FRAGMENT) + ctx->load_input(ctx, index, &ctx->input_decls[index], input); + else + memcpy(input, &ctx->inputs[index * 4], sizeof(input)); + + result = input[swizzle]; + if (tgsi_type_is_64bit(type)) { ptr = result; - ptr2 = ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle + 1)]; + ptr2 = input[swizzle + 1]; return radeon_llvm_emit_fetch_64bit(bld_base, type, ptr, ptr2); } break; + } case TGSI_FILE_TEMPORARY: if (reg->Register.Index >= ctx->temps_count) return LLVMGetUndef(tgsi2llvmtype(bld_base, type)); ptr = ctx->temps[reg->Register.Index * TGSI_NUM_CHANNELS + swizzle]; if (tgsi_type_is_64bit(type)) { ptr2 = ctx->temps[reg->Register.Index * TGSI_NUM_CHANNELS + swizzle + 1]; return radeon_llvm_emit_fetch_64bit(bld_base, type, LLVMBuildLoad(builder, ptr, ""), LLVMBuildLoad(builder, ptr2, "")); @@ -619,22 +634,27 @@ static void emit_declaration(struct lp_build_tgsi_context *bld_base, } ctx->temps[first * TGSI_NUM_CHANNELS + i] = ptr; } } break; } case TGSI_FILE_INPUT: { unsigned idx; for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) { - if (ctx->load_input) - ctx->load_input(ctx, idx, decl); + if (ctx->load_input) { + ctx->input_decls[idx] = *decl; + + if (bld_base->info->processor != PIPE_SHADER_FRAGMENT) + ctx->load_input(ctx, idx, decl, + &ctx->inputs[idx * 4]); + } } } break; case TGSI_FILE_SYSTEM_VALUE: { unsigned idx; for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) { ctx->load_system_value(ctx, idx, decl); } diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 8432e45..38e2e0e 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -433,21 +433,22 @@ static LLVMValueRef 
get_instance_index_for_fetch( result = LLVMBuildUDiv(gallivm->builder, result, lp_build_const_int32(gallivm, divisor), ""); return LLVMBuildAdd(gallivm->builder, result, LLVMGetParam(radeon_bld->main_fn, param_start_instance), ""); } static void declare_input_vs( struct radeon_llvm_context *radeon_bld, unsigned input_index, - const struct tgsi_full_declaration *decl) + const struct tgsi_full_declaration *decl, + LLVMValueRef out[4]) { struct lp_build_context *base = &radeon_bld->soa.bld_base.base; struct gallivm_state *gallivm = base->gallivm; struct si_shader_context *ctx = si_shader_context(&radeon_bld->soa.bld_base); unsigned divisor = ctx->shader->key.vs.prolog.instance_divisors[input_index]; unsigned chan; @@ -491,25 +492,22 @@ static void declare_input_vs( args[0] = t_list; args[1] = attribute_offset; args[2] = buffer_index; input = lp_build_intrinsic(gallivm->builder, "llvm.SI.vs.load.input", ctx->v4f32, args, 3, LLVMReadNoneAttribute); /* Break up the vec4 into individual components */ for (chan = 0; chan < 4; chan++) { LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan); - /* XXX: Use a helper function for this. There is one in - * tgsi_llvm.c. 
*/ - ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] = - LLVMBuildExtractElement(gallivm->builder, - input, llvm_chan, ""); + out[chan] = LLVMBuildExtractElement(gallivm->builder, + input, llvm_chan, ""); } } static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base, unsigned swizzle) { struct si_shader_context *ctx = si_shader_context(bld_base); if (swizzle > 0) return bld_base->uint_bld.zero; @@ -1456,47 +1454,44 @@ static LLVMValueRef get_interp_param(struct si_shader_context *ctx, } if (!param) param = LLVMGetParam(main_fn, interp_param_idx); return param; } static void declare_input_fs( struct radeon_llvm_context *radeon_bld, unsigned input_index, - const struct tgsi_full_declaration *decl) + const struct tgsi_full_declaration *decl, + LLVMValueRef out[4]) { struct lp_build_context *base = &radeon_bld->soa.bld_base.base; struct si_shader_context *ctx = si_shader_context(&radeon_bld->soa.bld_base); struct si_shader *shader = ctx->shader; LLVMValueRef main_fn = radeon_bld->main_fn; LLVMValueRef interp_param = NULL; int interp_param_idx; /* Get colors from input VGPRs (set by the prolog). */ if (!ctx->is_monolithic && decl->Semantic.Name == TGSI_SEMANTIC_COLOR) { unsigned i = decl->Semantic.Index; unsigned colors_read = shader->selector->info.colors_read; unsigned mask = colors_read >> (i * 4); unsigned offset = SI_PARAM_POS_FIXED_PT + 1 + (i ? util_bitcount(colors_read & 0xf) : 0); - radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] = - mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef; - radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] = - mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef; - radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] = - mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef; - radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] = - mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef; + out[0] = mask & 0x1 ? 
LLVMGetParam(main_fn, offset++) : base->undef; + out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef; + out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef; + out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef; return; } interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate, decl->Interp.Location); if (interp_param_idx == -1) return; else if (interp_param_idx) { interp_param_idx = select_interp_param(ctx, interp_param_idx); @@ -1506,21 +1501,21 @@ static void declare_input_fs( if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR && decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR && ctx->shader->key.ps.prolog.flatshade_colors) interp_param = NULL; /* load the constant color */ interp_fs_input(ctx, input_index, decl->Semantic.Name, decl->Semantic.Index, shader->selector->info.num_inputs, shader->selector->info.colors_read, interp_param, LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK), LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE), - &radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]); + &out[0]); } static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld) { return unpack_param(si_shader_context(&radeon_bld->soa.bld_base), SI_PARAM_ANCILLARY, 8, 4); } /** * Set range metadata on an instruction. This can only be used on load and -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev