From: Marek Olšák <marek.ol...@amd.com> --- src/amd/common/ac_llvm_build.c | 52 +++++++++++++++++-- src/amd/common/ac_llvm_build.h | 4 ++ .../drivers/radeonsi/si_shader_internal.h | 3 +- src/gallium/drivers/radeonsi/si_shader_nir.c | 12 ++++- .../drivers/radeonsi/si_shader_tgsi_mem.c | 23 ++++++-- 5 files changed, 84 insertions(+), 10 deletions(-)
diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 1c8d944db74..1f5112e9929 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -828,70 +828,112 @@ ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index) { LLVMValueRef indices[2] = { ctx->i32_0, index, }; return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, ""); } +LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, + LLVMValueRef index) +{ + return LLVMBuildPointerCast(ctx->builder, + ac_build_gep0(ctx, ptr, index), + LLVMTypeOf(ptr), ""); +} + void ac_build_indexed_store(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index, LLVMValueRef value) { LLVMBuildStore(ctx->builder, value, ac_build_gep0(ctx, base_ptr, index)); } /** * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad. * It's equivalent to doing a load from &base_ptr[index]. * * \param base_ptr Where the array starts. * \param index The element index into the array. * \param uniform Whether the base_ptr and index can be assumed to be * dynamically uniform (i.e. load to an SGPR) * \param invariant Whether the load is invariant (no other opcodes affect it) + * \param no_unsigned_wraparound + * For all possible re-associations and re-distributions of an expression + * "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs + * without inbounds in base_ptr), this parameter is true if "addr + offset" + * does not result in an unsigned integer wraparound. This is used for + * optimal code generation of 32-bit pointer arithmetic. + * + * For example, a 32-bit immediate offset that causes a 32-bit unsigned + * integer wraparound can't be an imm offset in s_load_dword, because + * the instruction performs "addr + offset" in 64 bits. + * + * Expected usage for bindless textures by chaining GEPs: + * // possible unsigned wraparound, don't use InBounds: + * ptr1 = LLVMBuildGEP(base_ptr, index); + * image = load(ptr1); // becomes "s_load ptr1, 0" + * + * ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize); + * sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds */ static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, - LLVMValueRef index, bool uniform, bool invariant) + LLVMValueRef index, bool uniform, bool invariant, + bool no_unsigned_wraparound) { LLVMValueRef pointer, result; + LLVMValueRef indices[2] = {ctx->i32_0, index}; + + if (no_unsigned_wraparound && + LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_CONST_32BIT_ADDR_SPACE) + pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, indices, 2, ""); + else + pointer = LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, ""); - pointer = ac_build_gep0(ctx, base_ptr, index); if (uniform) LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md); result = LLVMBuildLoad(ctx->builder, pointer, ""); if (invariant) LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md); return result; } LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index) { - return ac_build_load_custom(ctx, base_ptr, index, false, false); + return ac_build_load_custom(ctx, base_ptr, index, false, false, false); } LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index) { - return ac_build_load_custom(ctx, base_ptr, index, false, true); + return ac_build_load_custom(ctx, base_ptr, index, false, true, false); } +/* This assumes that there is no unsigned integer wraparound during the address + * computation, excluding all GEPs within base_ptr. */ LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index) { - return ac_build_load_custom(ctx, base_ptr, index, true, true); + return ac_build_load_custom(ctx, base_ptr, index, true, true, true); +} + +/* See ac_build_load_custom() documentation. */ +LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, LLVMValueRef index) +{ + return ac_build_load_custom(ctx, base_ptr, index, true, true, false); } /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4. * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2), * or v4i32 (num_channels=3,4). */ void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata, diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h index b080cca4cb7..0d261bae097 100644 --- a/src/amd/common/ac_llvm_build.h +++ b/src/amd/common/ac_llvm_build.h @@ -196,32 +196,36 @@ LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter, LLVMValueRef llvm_chan, LLVMValueRef attr_number, LLVMValueRef params); LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index); +LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, + LLVMValueRef index); void ac_build_indexed_store(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index, LLVMValueRef value); LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index); LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index); LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index); +LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, LLVMValueRef index); void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata, unsigned num_channels, LLVMValueRef voffset, LLVMValueRef soffset, unsigned inst_offset, bool glc, diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index a638dbf8f1a..235c46ecf92 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -298,21 +298,22 @@ LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx, LLVMValueRef si_get_sample_id(struct si_shader_context *ctx); void si_shader_context_init_alu(struct lp_build_tgsi_context *bld_base); void si_shader_context_init_mem(struct si_shader_context *ctx); LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, LLVMValueRef list, LLVMValueRef index, enum ac_descriptor_type type); LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, LLVMValueRef list, LLVMValueRef index, - enum ac_descriptor_type desc_type, bool dcc_off); + enum ac_descriptor_type desc_type, bool dcc_off, + bool bindless); void si_load_system_value(struct si_shader_context *ctx, unsigned index, const struct tgsi_full_declaration *decl); void si_declare_compute_memory(struct si_shader_context *ctx); void si_tgsi_declare_compute_memory(struct si_shader_context *ctx, const struct tgsi_full_declaration *decl); void si_llvm_load_input_vs( struct si_shader_context *ctx, diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 0aefca22385..5d6280b80f7 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -913,31 +913,41 @@ si_nir_load_sampler_desc(struct ac_shader_abi *abi, assert(base_index + constant_index < ctx->num_images); if (dynamic_index) index = si_llvm_bound_index(ctx, index, ctx->num_images); index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->i32, SI_NUM_IMAGES - 1, 0), index, ""); /* TODO: be smarter about when we use dcc_off */ - return si_load_image_desc(ctx, list, index, desc_type, write); + return si_load_image_desc(ctx, list, index, desc_type, write, bindless); } assert(base_index + constant_index < ctx->num_samplers); if (dynamic_index) index = si_llvm_bound_index(ctx, index, ctx->num_samplers); index = LLVMBuildAdd(ctx->ac.builder, index, LLVMConstInt(ctx->i32, SI_NUM_IMAGES / 2, 0), ""); + if (bindless) { + /* Since bindless handle arithmetic can contain an unsigned integer + * wraparound and si_load_sampler_desc assumes there isn't any, + * use GEP without "inbounds" (inside ac_build_pointer_add) + * to prevent incorrect code generation and hangs. + */ + index = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->i32, 2, 0), ""); + list = ac_build_pointer_add(&ctx->ac, list, index); + index = ctx->i32_0; + } return si_load_sampler_desc(ctx, list, index, desc_type); } static void bitcast_inputs(struct si_shader_context *ctx, LLVMValueRef data[4], unsigned input_idx) { for (unsigned chan = 0; chan < 4; chan++) { ctx->inputs[input_idx + chan] = LLVMBuildBitCast(ctx->ac.builder, data[chan], ctx->ac.i32, ""); diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c index eaa200a95d6..cabc448a082 100644 --- a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c +++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c @@ -169,35 +169,40 @@ static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, LLVMValueRef tmp; tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, ""); tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, ""); return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, ""); } } LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, LLVMValueRef list, LLVMValueRef index, - enum ac_descriptor_type desc_type, bool dcc_off) + enum ac_descriptor_type desc_type, bool dcc_off, + bool bindless) { LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef rsrc; if (desc_type == AC_DESC_BUFFER) { index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 2, 0), ctx->i32_1); list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->v4i32), ""); } else { assert(desc_type == AC_DESC_IMAGE); } - rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index); + if (bindless) + rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index); + else + rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index); + if (desc_type == AC_DESC_IMAGE && dcc_off) rsrc = force_dcc_off(ctx, rsrc); return rsrc; } /** * Load the resource descriptor for \p image. */ static void image_fetch_rsrc( @@ -233,39 +238,42 @@ image_fetch_rsrc( * not lead to termination. */ index = si_get_bounded_indirect_index(ctx, &image->Indirect, image->Register.Index, ctx->num_images); index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->i32, SI_NUM_IMAGES - 1, 0), index, ""); } + bool bindless = false; + if (image->Register.File != TGSI_FILE_IMAGE) { /* Bindless descriptors are accessible from a different pair of * user SGPR indices. */ rsrc_ptr = LLVMGetParam(ctx->main_fn, ctx->param_bindless_samplers_and_images); index = lp_build_emit_fetch_src(bld_base, image, TGSI_TYPE_UNSIGNED, 0); /* For simplicity, bindless image descriptors use fixed * 16-dword slots for now. */ index = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->i32, 2, 0), ""); + bindless = true; } *rsrc = si_load_image_desc(ctx, rsrc_ptr, index, target == TGSI_TEXTURE_BUFFER ? AC_DESC_BUFFER : AC_DESC_IMAGE, - dcc_off); + dcc_off, bindless); } static void image_fetch_coords( struct lp_build_tgsi_context *bld_base, const struct tgsi_full_instruction *inst, unsigned src, LLVMValueRef desc, LLVMValueRef *coords) { struct si_shader_context *ctx = si_shader_context(bld_base); LLVMBuilderRef builder = ctx->ac.builder; @@ -1061,20 +1069,29 @@ static void tex_fetch_ptrs(struct lp_build_tgsi_context *bld_base, } if (reg->Register.File != TGSI_FILE_SAMPLER) { /* Bindless descriptors are accessible from a different pair of * user SGPR indices. */ list = LLVMGetParam(ctx->main_fn, ctx->param_bindless_samplers_and_images); index = lp_build_emit_fetch_src(bld_base, reg, TGSI_TYPE_UNSIGNED, 0); + + /* Since bindless handle arithmetic can contain an unsigned integer + * wraparound and si_load_sampler_desc assumes there isn't any, + * use GEP without "inbounds" (inside ac_build_pointer_add) + * to prevent incorrect code generation and hangs. + */ + index = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->i32, 2, 0), ""); + list = ac_build_pointer_add(&ctx->ac, list, index); + index = ctx->i32_0; } if (target == TGSI_TEXTURE_BUFFER) *res_ptr = si_load_sampler_desc(ctx, list, index, AC_DESC_BUFFER); else *res_ptr = si_load_sampler_desc(ctx, list, index, AC_DESC_IMAGE); if (samp_ptr) *samp_ptr = NULL; if (fmask_ptr) -- 2.17.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev