From: Marek Olšák <marek.ol...@amd.com> The supported channel counts are 1, 2, and 4; a count of 3 is loaded as 4.
The following snippet loads float, vec2, vec3, and vec4: Before: buffer_load_format_x v9, v4, s[0:3], 0 idxen ; E0002000 80000904 buffer_load_format_xyzw v[0:3], v5, s[8:11], 0 idxen ; E00C2000 80020005 s_waitcnt vmcnt(0) ; BF8C0F70 buffer_load_format_xyzw v[2:5], v6, s[12:15], 0 idxen ; E00C2000 80030206 s_waitcnt vmcnt(0) ; BF8C0F70 buffer_load_format_xyzw v[5:8], v7, s[4:7], 0 idxen ; E00C2000 80010507 After: buffer_load_format_x v10, v4, s[0:3], 0 idxen ; E0002000 80000A04 buffer_load_format_xy v[8:9], v5, s[8:11], 0 idxen ; E0042000 80020805 buffer_load_format_xyzw v[0:3], v6, s[12:15], 0 idxen ; E00C2000 80030006 s_waitcnt vmcnt(0) ; BF8C0F70 buffer_load_format_xyzw v[3:6], v7, s[4:7], 0 idxen ; E00C2000 80010307 --- src/amd/common/ac_llvm_build.c | 35 +++++++++++++++++++++++ src/amd/common/ac_llvm_build.h | 3 ++ src/gallium/drivers/radeonsi/si_shader.c | 13 +++++++-- src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c | 8 ++++-- 4 files changed, 54 insertions(+), 5 deletions(-) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 6afe7f9..a5cb72d 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -454,20 +454,55 @@ ac_build_gather_values_extended(struct ac_llvm_context *ctx, } LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values, unsigned value_count) { return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false); } +/* Expand a scalar or vector to <4 x type> by filling the remaining channels + * with undef. Extract at most num_channels components from the input. 
+ */ +LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, + LLVMValueRef value, + unsigned num_channels) +{ + LLVMTypeRef elemtype; + LLVMValueRef chan[4]; + + if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) { + unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value)); + num_channels = MIN2(num_channels, vec_size); + + if (num_channels >= 4) + return value; + + for (unsigned i = 0; i < num_channels; i++) + chan[i] = ac_llvm_extract_elem(ctx, value, i); + + elemtype = LLVMGetElementType(LLVMTypeOf(value)); + } else { + if (num_channels) { + assert(num_channels == 1); + chan[0] = value; + } + elemtype = LLVMTypeOf(value); + } + + while (num_channels < 4) + chan[num_channels++] = LLVMGetUndef(elemtype); + + return ac_build_gather_values(ctx, chan, 4); +} + LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den) { LLVMValueRef ret = LLVMBuildFDiv(ctx->builder, num, den, ""); /* Use v_rcp_f32 instead of precise division. */ if (!LLVMIsConstant(ret)) LLVMSetMetadata(ret, ctx->fpmath_md_kind, ctx->fpmath_md_2p5_ulp); diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h index 78437d6..3ae9678 100644 --- a/src/amd/common/ac_llvm_build.h +++ b/src/amd/common/ac_llvm_build.h @@ -134,20 +134,23 @@ LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values, unsigned value_count, unsigned value_stride, bool load, bool always_vector); LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values, unsigned value_count); +LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, + LLVMValueRef value, + unsigned num_channels); LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den); void ac_prepare_cube_coords(struct ac_llvm_context *ctx, bool is_deriv, bool is_array, bool is_lod, LLVMValueRef *coords_arg, diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c index de1f725..bc621af 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -473,22 +473,22 @@ static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, LLVMBuildTrunc(ctx->ac.builder, i32, ctx->ac.i16, ""), ctx->i32, ""); } void si_llvm_load_input_vs( struct si_shader_context *ctx, unsigned input_index, LLVMValueRef out[4]) { - unsigned vs_blit_property = - ctx->shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS]; + const struct tgsi_shader_info *info = &ctx->shader->selector->info; + unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS]; if (vs_blit_property) { LLVMValueRef vertex_id = ctx->abi.vertex_id; LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder, LLVMIntULE, vertex_id, ctx->i32_1, ""); /* Use LLVMIntNE, because we have 3 vertices and only * the middle one should use y2. */ LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, @@ -548,20 +548,21 @@ void si_llvm_load_input_vs( out[3] = LLVMGetParam(ctx->main_fn, ctx->param_vs_blit_inputs + 8); } return; } unsigned chan; unsigned fix_fetch; unsigned num_fetches; unsigned fetch_stride; + unsigned num_channels; LLVMValueRef t_list_ptr; LLVMValueRef t_offset; LLVMValueRef t_list; LLVMValueRef vertex_index; LLVMValueRef input[3]; /* Load the T list */ t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers); @@ -573,46 +574,52 @@ void si_llvm_load_input_vs( ctx->param_vertex_index0 + input_index); fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index]; /* Do multiple loads for special formats. 
*/ switch (fix_fetch) { case SI_FIX_FETCH_RGB_64_FLOAT: num_fetches = 3; /* 3 2-dword loads */ fetch_stride = 8; + num_channels = 2; break; case SI_FIX_FETCH_RGBA_64_FLOAT: num_fetches = 2; /* 2 4-dword loads */ fetch_stride = 16; + num_channels = 4; break; case SI_FIX_FETCH_RGB_8: case SI_FIX_FETCH_RGB_8_INT: num_fetches = 3; fetch_stride = 1; + num_channels = 1; break; case SI_FIX_FETCH_RGB_16: case SI_FIX_FETCH_RGB_16_INT: num_fetches = 3; fetch_stride = 2; + num_channels = 1; break; default: num_fetches = 1; fetch_stride = 0; + num_channels = util_last_bit(info->input_usage_mask[input_index]); } for (unsigned i = 0; i < num_fetches; i++) { LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0); input[i] = ac_build_buffer_load_format(&ctx->ac, t_list, vertex_index, voffset, - 4, true); + num_channels, true); + input[i] = ac_build_expand_to_vec4(&ctx->ac, input[i], num_channels); } /* Break up the vec4 into individual components */ for (chan = 0; chan < 4; chan++) { LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0); out[chan] = LLVMBuildExtractElement(ctx->ac.builder, input[0], llvm_chan, ""); } switch (fix_fetch) { diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c index 58f3bda..cdd7c16 100644 --- a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c +++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c @@ -1814,26 +1814,30 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action *action, struct lp_build_tgsi_context *bld_base, struct lp_build_emit_data *emit_data) { struct si_shader_context *ctx = si_shader_context(bld_base); const struct tgsi_full_instruction *inst = emit_data->inst; struct ac_image_args args; unsigned opcode = inst->Instruction.Opcode; unsigned target = inst->Texture.Texture; if (target == TGSI_TEXTURE_BUFFER) { - emit_data->output[emit_data->chan] = + unsigned num_channels = + util_last_bit(inst->Dst[0].Register.WriteMask); + LLVMValueRef result = 
ac_build_buffer_load_format(&ctx->ac, emit_data->args[0], emit_data->args[2], emit_data->args[1], - 4, true); + num_channels, true); + emit_data->output[emit_data->chan] = + ac_build_expand_to_vec4(&ctx->ac, result, num_channels); return; } memcpy(&args, emit_data->args, sizeof(args)); /* ugly */ args.opcode = ac_image_sample; args.compare = tgsi_is_shadow_target(target); args.offset = inst->Texture.NumOffsets > 0; switch (opcode) { -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev