[Mesa-dev] [PATCH 2/5] radeonsi: load the right number of components for VS inputs and TBOs

Marek Olšák Tue, 30 Jan 2018 13:46:40 -0800

From: Marek Olšák <marek.ol...@amd.com>

The supported component counts are 1, 2, and 4; a 3-component load is done as a 4-component load (3=4).


The following snippet loads float, vec2, vec3, and vec4:

Before:
    buffer_load_format_x v9, v4, s[0:3], 0 idxen          ; E0002000 80000904
    buffer_load_format_xyzw v[0:3], v5, s[8:11], 0 idxen  ; E00C2000 80020005
    s_waitcnt vmcnt(0)                                    ; BF8C0F70
    buffer_load_format_xyzw v[2:5], v6, s[12:15], 0 idxen ; E00C2000 80030206
    s_waitcnt vmcnt(0)                                    ; BF8C0F70
    buffer_load_format_xyzw v[5:8], v7, s[4:7], 0 idxen   ; E00C2000 80010507

After:
    buffer_load_format_x v10, v4, s[0:3], 0 idxen         ; E0002000 80000A04
    buffer_load_format_xy v[8:9], v5, s[8:11], 0 idxen    ; E0042000 80020805
    buffer_load_format_xyzw v[0:3], v6, s[12:15], 0 idxen ; E00C2000 80030006
    s_waitcnt vmcnt(0)                                    ; BF8C0F70
    buffer_load_format_xyzw v[3:6], v7, s[4:7], 0 idxen   ; E00C2000 80010307
---
 src/amd/common/ac_llvm_build.c                    | 35 +++++++++++++++++++++++
 src/amd/common/ac_llvm_build.h                    |  3 ++
 src/gallium/drivers/radeonsi/si_shader.c          | 13 +++++++--
 src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c |  8 ++++--
 4 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 6afe7f9..a5cb72d 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -454,20 +454,55 @@ ac_build_gather_values_extended(struct ac_llvm_context *ctx,
 }
 
 LLVMValueRef
 ac_build_gather_values(struct ac_llvm_context *ctx,
                       LLVMValueRef *values,
                       unsigned value_count)
 {
        return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
 }
 
+/* Expand a scalar or vector to <4 x type>. At most num_channels components
+ * are taken from the input; the remaining channels are filled with undef.
+ */
+LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx,
+                                    LLVMValueRef value,
+                                    unsigned num_channels)
+{
+       LLVMTypeRef elemtype;
+       LLVMValueRef chan[4];
+
+       if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
+               unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
+               num_channels = MIN2(num_channels, vec_size); /* can't take more than the vector has */
+
+               if (num_channels >= 4)
+                       return value; /* nothing to pad; input is returned unchanged */
+
+               for (unsigned i = 0; i < num_channels; i++)
+                       chan[i] = ac_llvm_extract_elem(ctx, value, i);
+
+               elemtype = LLVMGetElementType(LLVMTypeOf(value));
+       } else {
+               if (num_channels) { /* 0 is allowed: all four channels end up undef */
+                       assert(num_channels == 1); /* a scalar supplies one channel at most */
+                       chan[0] = value;
+               }
+               elemtype = LLVMTypeOf(value);
+       }
+
+       while (num_channels < 4) /* pad the unused channels */
+               chan[num_channels++] = LLVMGetUndef(elemtype);
+
+       return ac_build_gather_values(ctx, chan, 4);
+}
+
 LLVMValueRef
 ac_build_fdiv(struct ac_llvm_context *ctx,
              LLVMValueRef num,
              LLVMValueRef den)
 {
        LLVMValueRef ret = LLVMBuildFDiv(ctx->builder, num, den, "");
 
        /* Use v_rcp_f32 instead of precise division. */
        if (!LLVMIsConstant(ret))
                LLVMSetMetadata(ret, ctx->fpmath_md_kind, ctx->fpmath_md_2p5_ulp);
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index 78437d6..3ae9678 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -134,20 +134,23 @@ LLVMValueRef
 ac_build_gather_values_extended(struct ac_llvm_context *ctx,
                                LLVMValueRef *values,
                                unsigned value_count,
                                unsigned value_stride,
                                bool load,
                                bool always_vector);
 LLVMValueRef
 ac_build_gather_values(struct ac_llvm_context *ctx,
                       LLVMValueRef *values,
                       unsigned value_count);
+LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx,
+                                    LLVMValueRef value,
+                                    unsigned num_channels);
 
 LLVMValueRef
 ac_build_fdiv(struct ac_llvm_context *ctx,
              LLVMValueRef num,
              LLVMValueRef den);
 
 void
 ac_prepare_cube_coords(struct ac_llvm_context *ctx,
                       bool is_deriv, bool is_array, bool is_lod,
                       LLVMValueRef *coords_arg,
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index de1f725..bc621af 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -473,22 +473,22 @@ static LLVMValueRef unpack_sint16(struct si_shader_context *ctx,
                             LLVMBuildTrunc(ctx->ac.builder, i32,
                                            ctx->ac.i16, ""),
                             ctx->i32, "");
 }
 
 void si_llvm_load_input_vs(
        struct si_shader_context *ctx,
        unsigned input_index,
        LLVMValueRef out[4])
 {
-       unsigned vs_blit_property =
-               ctx->shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS];
+       const struct tgsi_shader_info *info = &ctx->shader->selector->info;
+       unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS];
 
        if (vs_blit_property) {
                LLVMValueRef vertex_id = ctx->abi.vertex_id;
                LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder,
                                                    LLVMIntULE, vertex_id,
                                                    ctx->i32_1, "");
                /* Use LLVMIntNE, because we have 3 vertices and only
                 * the middle one should use y2.
                 */
                LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder,
@@ -548,20 +548,21 @@ void si_llvm_load_input_vs(
                        out[3] = LLVMGetParam(ctx->main_fn,
                                              ctx->param_vs_blit_inputs + 8);
                }
                return;
        }
 
        unsigned chan;
        unsigned fix_fetch;
        unsigned num_fetches;
        unsigned fetch_stride;
+       unsigned num_channels;
 
        LLVMValueRef t_list_ptr;
        LLVMValueRef t_offset;
        LLVMValueRef t_list;
        LLVMValueRef vertex_index;
        LLVMValueRef input[3];
 
        /* Load the T list */
        t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);
 
@@ -573,46 +574,52 @@ void si_llvm_load_input_vs(
                                    ctx->param_vertex_index0 +
                                    input_index);
 
        fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];
 
        /* Do multiple loads for special formats. */
        switch (fix_fetch) {
        case SI_FIX_FETCH_RGB_64_FLOAT:
                num_fetches = 3; /* 3 2-dword loads */
                fetch_stride = 8;
+               num_channels = 2;
                break;
        case SI_FIX_FETCH_RGBA_64_FLOAT:
                num_fetches = 2; /* 2 4-dword loads */
                fetch_stride = 16;
+               num_channels = 4;
                break;
        case SI_FIX_FETCH_RGB_8:
        case SI_FIX_FETCH_RGB_8_INT:
                num_fetches = 3;
                fetch_stride = 1;
+               num_channels = 1;
                break;
        case SI_FIX_FETCH_RGB_16:
        case SI_FIX_FETCH_RGB_16_INT:
                num_fetches = 3;
                fetch_stride = 2;
+               num_channels = 1;
                break;
        default:
                num_fetches = 1;
                fetch_stride = 0;
+               num_channels = util_last_bit(info->input_usage_mask[input_index]);
        }
 
        for (unsigned i = 0; i < num_fetches; i++) {
                LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
 
                input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
                                                       vertex_index, voffset,
-                                                      4, true);
+                                                      num_channels, true);
+               input[i] = ac_build_expand_to_vec4(&ctx->ac, input[i], num_channels);
        }
 
        /* Break up the vec4 into individual components */
        for (chan = 0; chan < 4; chan++) {
                LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
                out[chan] = LLVMBuildExtractElement(ctx->ac.builder,
                                                    input[0], llvm_chan, "");
        }
 
        switch (fix_fetch) {
diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
index 58f3bda..cdd7c16 100644
--- a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
+++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
@@ -1814,26 +1814,30 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
                                struct lp_build_tgsi_context *bld_base,
                                struct lp_build_emit_data *emit_data)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        const struct tgsi_full_instruction *inst = emit_data->inst;
        struct ac_image_args args;
        unsigned opcode = inst->Instruction.Opcode;
        unsigned target = inst->Texture.Texture;
 
        if (target == TGSI_TEXTURE_BUFFER) {
-               emit_data->output[emit_data->chan] =
+               unsigned num_channels =
+                       util_last_bit(inst->Dst[0].Register.WriteMask);
+               LLVMValueRef result =
                        ac_build_buffer_load_format(&ctx->ac,
                                                    emit_data->args[0],
                                                    emit_data->args[2],
                                                    emit_data->args[1],
-                                                   4, true);
+                                                   num_channels, true);
+               emit_data->output[emit_data->chan] =
+                       ac_build_expand_to_vec4(&ctx->ac, result, num_channels);
                return;
        }
 
        memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
 
        args.opcode = ac_image_sample;
        args.compare = tgsi_is_shadow_target(target);
        args.offset = inst->Texture.NumOffsets > 0;
 
        switch (opcode) {
-- 
2.7.4

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 2/5] radeonsi: load the right number of components for VS inputs and TBOs

Reply via email to