[Mesa-dev] [PATCH] radeonsi: reload PS inputs with direct indexing at each use (v2)

Marek Olšák Tue, 13 Sep 2016 13:21:01 -0700

From: Marek Olšák <marek.ol...@amd.com>

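Previously, every fragment shader input was loaded once, when it was
declared, and the loaded values were kept in ctx->inputs for all later
uses. Now the load is re-emitted at each use of a directly indexed
fragment shader input; other shader types keep the old behavior.

Redundant loads can still be merged, because the LLVM compiler can CSE
interp intrinsics thanks to LLVMReadNoneAttribute.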


26011 shaders in 14651 tests
Totals:
SGPRS: 1146340 -> 1132676 (-1.19 %)
VGPRS: 727371 -> 711730 (-2.15 %)
Spilled SGPRs: 2218 -> 2078 (-6.31 %)
Spilled VGPRs: 369 -> 369 (0.00 %)
Scratch VGPRs: 1344 -> 1344 (0.00 %) dwords per thread
Code Size: 35841268 -> 36009732 (0.47 %) bytes
LDS: 767 -> 767 (0.00 %) blocks
Max Waves: 222559 -> 224779 (1.00 %)
Wait states: 0 -> 0 (0.00 %)

v2: don't call load_input for fragment shaders in emit_declaration;
    their inputs are loaded at each use in radeon_llvm_emit_fetch instead
---
 src/gallium/drivers/radeon/radeon_llvm.h           |  6 ++++-
 .../drivers/radeon/radeon_setup_tgsi_llvm.c        | 30 ++++++++++++++++++----
 src/gallium/drivers/radeonsi/si_shader.c           | 27 ++++++++-----------
 3 files changed, 41 insertions(+), 22 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_llvm.h 
b/src/gallium/drivers/radeon/radeon_llvm.h
index da5b7f5..f508d32 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -23,21 +23,23 @@
  * Authors: Tom Stellard <thomas.stell...@amd.com>
  *
  */
 
 #ifndef RADEON_LLVM_H
 #define RADEON_LLVM_H
 
 #include <llvm-c/Core.h>
 #include "gallivm/lp_bld_init.h"
 #include "gallivm/lp_bld_tgsi.h"
+#include "tgsi/tgsi_parse.h"
 
+#define RADEON_LLVM_MAX_INPUT_SLOTS 32
 #define RADEON_LLVM_MAX_INPUTS 32 * 4
 #define RADEON_LLVM_MAX_OUTPUTS 32 * 4
 
 #define RADEON_LLVM_INITIAL_CF_DEPTH 4
 
 #define RADEON_LLVM_MAX_SYSTEM_VALUES 4
 
 struct radeon_llvm_branch {
        LLVMBasicBlockRef endif_block;
        LLVMBasicBlockRef if_block;
@@ -55,33 +57,35 @@ struct radeon_llvm_context {
 
        /*=== Front end configuration ===*/
 
        /* Instructions that are not described by any of the TGSI opcodes. */
 
        /** This function is responsible for initilizing the inputs array and 
will be
          * called once for each input declared in the TGSI shader.
          */
        void (*load_input)(struct radeon_llvm_context *,
                           unsigned input_index,
-                          const struct tgsi_full_declaration *decl);
+                          const struct tgsi_full_declaration *decl,
+                          LLVMValueRef out[4]);
 
        void (*load_system_value)(struct radeon_llvm_context *,
                                  unsigned index,
                                  const struct tgsi_full_declaration *decl);
 
        void (*declare_memory_region)(struct radeon_llvm_context *,
                                      const struct tgsi_full_declaration *decl);
 
        /** This array contains the input values for the shader.  Typically 
these
          * values will be in the form of a target intrinsic that will inform 
the
          * backend how to load the actual inputs to the shader. 
          */
+       struct tgsi_full_declaration input_decls[RADEON_LLVM_MAX_INPUT_SLOTS];
        LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS];
        LLVMValueRef outputs[RADEON_LLVM_MAX_OUTPUTS][TGSI_NUM_CHANNELS];
 
        /** This pointer is used to contain the temporary values.
          * The amount of temporary used in tgsi can't be bound to a max value 
and
          * thus we must allocate this array at runtime.
          */
        LLVMValueRef *temps;
        unsigned temps_count;
        LLVMValueRef system_values[RADEON_LLVM_MAX_SYSTEM_VALUES];
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c 
b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index 4643e6d..4fa43cd 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -439,28 +439,43 @@ LLVMValueRef radeon_llvm_emit_fetch(struct 
lp_build_tgsi_context *bld_base,
                                                        bld_base->int_bld.zero);
                        result = LLVMConstInsertElement(result,
                                                        
bld->immediates[reg->Register.Index][swizzle + 1],
                                                        bld_base->int_bld.one);
                        return LLVMConstBitCast(result, ctype);
                } else {
                        return 
LLVMConstBitCast(bld->immediates[reg->Register.Index][swizzle], ctype);
                }
        }
 
-       case TGSI_FILE_INPUT:
-               result = 
ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle)];
+       case TGSI_FILE_INPUT: {
+               unsigned index = reg->Register.Index;
+               LLVMValueRef input[4];
+
+               /* I don't think doing this for vertex shaders is beneficial.
+                * For those, we want to make sure the VMEM loads are executed
+                * only once. Fragment shaders don't care much, because
+                * v_interp instructions are much cheaper than VMEM loads.
+                */
+               if (ctx->soa.bld_base.info->processor == PIPE_SHADER_FRAGMENT)
+                       ctx->load_input(ctx, index, &ctx->input_decls[index], 
input);
+               else
+                       memcpy(input, &ctx->inputs[index * 4], sizeof(input));
+
+               result = input[swizzle];
+
                if (tgsi_type_is_64bit(type)) {
                        ptr = result;
-                       ptr2 = 
ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle + 1)];
+                       ptr2 = input[swizzle + 1];
                        return radeon_llvm_emit_fetch_64bit(bld_base, type, 
ptr, ptr2);
                }
                break;
+       }
 
        case TGSI_FILE_TEMPORARY:
                if (reg->Register.Index >= ctx->temps_count)
                        return LLVMGetUndef(tgsi2llvmtype(bld_base, type));
                ptr = ctx->temps[reg->Register.Index * TGSI_NUM_CHANNELS + 
swizzle];
                if (tgsi_type_is_64bit(type)) {
                        ptr2 = ctx->temps[reg->Register.Index * 
TGSI_NUM_CHANNELS + swizzle + 1];
                        return radeon_llvm_emit_fetch_64bit(bld_base, type,
                                                 LLVMBuildLoad(builder, ptr, 
""),
                                                 LLVMBuildLoad(builder, ptr2, 
""));
@@ -619,22 +634,27 @@ static void emit_declaration(struct lp_build_tgsi_context 
*bld_base,
                                }
                                ctx->temps[first * TGSI_NUM_CHANNELS + i] = ptr;
                        }
                }
                break;
        }
        case TGSI_FILE_INPUT:
        {
                unsigned idx;
                for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) {
-                       if (ctx->load_input)
-                               ctx->load_input(ctx, idx, decl);
+                       if (ctx->load_input) {
+                               ctx->input_decls[idx] = *decl;
+
+                               if (bld_base->info->processor != 
PIPE_SHADER_FRAGMENT)
+                                       ctx->load_input(ctx, idx, decl,
+                                                       &ctx->inputs[idx * 4]);
+                       }
                }
        }
        break;
 
        case TGSI_FILE_SYSTEM_VALUE:
        {
                unsigned idx;
                for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) {
                        ctx->load_system_value(ctx, idx, decl);
                }
diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index 8432e45..38e2e0e 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -433,21 +433,22 @@ static LLVMValueRef get_instance_index_for_fetch(
                result = LLVMBuildUDiv(gallivm->builder, result,
                                lp_build_const_int32(gallivm, divisor), "");
 
        return LLVMBuildAdd(gallivm->builder, result,
                            LLVMGetParam(radeon_bld->main_fn, 
param_start_instance), "");
 }
 
 static void declare_input_vs(
        struct radeon_llvm_context *radeon_bld,
        unsigned input_index,
-       const struct tgsi_full_declaration *decl)
+       const struct tgsi_full_declaration *decl,
+       LLVMValueRef out[4])
 {
        struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
        struct gallivm_state *gallivm = base->gallivm;
        struct si_shader_context *ctx =
                si_shader_context(&radeon_bld->soa.bld_base);
        unsigned divisor =
                ctx->shader->key.vs.prolog.instance_divisors[input_index];
 
        unsigned chan;
 
@@ -491,25 +492,22 @@ static void declare_input_vs(
        args[0] = t_list;
        args[1] = attribute_offset;
        args[2] = buffer_index;
        input = lp_build_intrinsic(gallivm->builder,
                "llvm.SI.vs.load.input", ctx->v4f32, args, 3,
                LLVMReadNoneAttribute);
 
        /* Break up the vec4 into individual components */
        for (chan = 0; chan < 4; chan++) {
                LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
-               /* XXX: Use a helper function for this.  There is one in
-                * tgsi_llvm.c. */
-               ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, 
chan)] =
-                               LLVMBuildExtractElement(gallivm->builder,
-                               input, llvm_chan, "");
+               out[chan] = LLVMBuildExtractElement(gallivm->builder,
+                                                   input, llvm_chan, "");
        }
 }
 
 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
                                     unsigned swizzle)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
 
        if (swizzle > 0)
                return bld_base->uint_bld.zero;
@@ -1456,47 +1454,44 @@ static LLVMValueRef get_interp_param(struct 
si_shader_context *ctx,
        }
 
        if (!param)
                param = LLVMGetParam(main_fn, interp_param_idx);
        return param;
 }
 
 static void declare_input_fs(
        struct radeon_llvm_context *radeon_bld,
        unsigned input_index,
-       const struct tgsi_full_declaration *decl)
+       const struct tgsi_full_declaration *decl,
+       LLVMValueRef out[4])
 {
        struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
        struct si_shader_context *ctx =
                si_shader_context(&radeon_bld->soa.bld_base);
        struct si_shader *shader = ctx->shader;
        LLVMValueRef main_fn = radeon_bld->main_fn;
        LLVMValueRef interp_param = NULL;
        int interp_param_idx;
 
        /* Get colors from input VGPRs (set by the prolog). */
        if (!ctx->is_monolithic &&
            decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
                unsigned i = decl->Semantic.Index;
                unsigned colors_read = shader->selector->info.colors_read;
                unsigned mask = colors_read >> (i * 4);
                unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
                                  (i ? util_bitcount(colors_read & 0xf) : 0);
 
-               radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
-                       mask & 0x1 ? LLVMGetParam(main_fn, offset++) : 
base->undef;
-               radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
-                       mask & 0x2 ? LLVMGetParam(main_fn, offset++) : 
base->undef;
-               radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
-                       mask & 0x4 ? LLVMGetParam(main_fn, offset++) : 
base->undef;
-               radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
-                       mask & 0x8 ? LLVMGetParam(main_fn, offset++) : 
base->undef;
+               out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : 
base->undef;
+               out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : 
base->undef;
+               out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : 
base->undef;
+               out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : 
base->undef;
                return;
        }
 
        interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
                                                     decl->Interp.Location);
        if (interp_param_idx == -1)
                return;
        else if (interp_param_idx) {
                interp_param_idx = select_interp_param(ctx,
                                                       interp_param_idx);
@@ -1506,21 +1501,21 @@ static void declare_input_fs(
        if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
            decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
            ctx->shader->key.ps.prolog.flatshade_colors)
                interp_param = NULL; /* load the constant color */
 
        interp_fs_input(ctx, input_index, decl->Semantic.Name,
                        decl->Semantic.Index, shader->selector->info.num_inputs,
                        shader->selector->info.colors_read, interp_param,
                        LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
                        LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
-                       
&radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]);
+                       &out[0]);
 }
 
 static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
 {
        return unpack_param(si_shader_context(&radeon_bld->soa.bld_base),
                            SI_PARAM_ANCILLARY, 8, 4);
 }
 
 /**
  * Set range metadata on an instruction.  This can only be used on load and
-- 
2.7.4

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] radeonsi: reload PS inputs with direct indexing at each use (v2)

Reply via email to