From: Connor Abbott <cwabbo...@gmail.com> Using the new WWM and DPP intrinsics introduced in LLVM 6.0, this adds everything needed to implement SPV_AMD_shader_ballot, including the Groups capability, to ac_llvm_build.c. That way, it can be shared by a potential GL_AMD_shader_ballot implementation in the future. Currently, the implementation only uses the DPP instructions that are available on VI+, so SI and CI won't be able to use the extension, but it should be possible (albeit a little tricky) to use ds_swizzle to get support for SI and CI. --- src/amd/common/ac_llvm_build.c | 703 +++++++++++++++++++++++++++++++++++++++++ src/amd/common/ac_llvm_build.h | 115 +++++++ 2 files changed, 818 insertions(+)
diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index d4b48d1..c75bf00 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -347,6 +347,709 @@ ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value) return LLVMBuildOr(ctx->builder, all, none, ""); } +LLVMValueRef ac_reduce_iadd(struct ac_llvm_context *ctx, LLVMValueRef lhs, + LLVMValueRef rhs) +{ + return LLVMBuildAdd(ctx->builder, lhs, rhs, ""); +} + +LLVMValueRef ac_reduce_fadd(struct ac_llvm_context *ctx, LLVMValueRef lhs, + LLVMValueRef rhs) +{ + return LLVMBuildFAdd(ctx->builder, lhs, rhs, ""); +} + +LLVMValueRef ac_reduce_fmin(struct ac_llvm_context *ctx, LLVMValueRef lhs, + LLVMValueRef rhs) +{ + char name[32], type[8]; + ac_build_type_name_for_intr(LLVMTypeOf(lhs), type, sizeof(type)); + snprintf(name, sizeof(name), "llvm.minnum.%s", type); + return ac_build_intrinsic(ctx, name, LLVMTypeOf(lhs), + (LLVMValueRef []) { lhs, rhs }, 2, + AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef ac_reduce_fmax(struct ac_llvm_context *ctx, LLVMValueRef lhs, + LLVMValueRef rhs) +{ + char name[32], type[8]; + ac_build_type_name_for_intr(LLVMTypeOf(lhs), type, sizeof(type)); + snprintf(name, sizeof(name), "llvm.maxnum.%s", type); + return ac_build_intrinsic(ctx, name, LLVMTypeOf(lhs), + (LLVMValueRef []) { lhs, rhs }, 2, + AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef ac_reduce_imin(struct ac_llvm_context *ctx, LLVMValueRef lhs, + LLVMValueRef rhs) +{ + return LLVMBuildSelect(ctx->builder, + LLVMBuildICmp(ctx->builder, LLVMIntSLT, + lhs, rhs, ""), + lhs, rhs, ""); +} + +LLVMValueRef ac_reduce_imax(struct ac_llvm_context *ctx, LLVMValueRef lhs, + LLVMValueRef rhs) +{ + return LLVMBuildSelect(ctx->builder, + LLVMBuildICmp(ctx->builder, LLVMIntSGT, + lhs, rhs, ""), + lhs, rhs, ""); +} + +LLVMValueRef ac_reduce_umin(struct ac_llvm_context *ctx, LLVMValueRef lhs, + LLVMValueRef rhs) +{ + return 
LLVMBuildSelect(ctx->builder, + LLVMBuildICmp(ctx->builder, LLVMIntULT, + lhs, rhs, ""), + lhs, rhs, ""); +} + +LLVMValueRef ac_reduce_umax(struct ac_llvm_context *ctx, LLVMValueRef lhs, + LLVMValueRef rhs) +{ + return LLVMBuildSelect(ctx->builder, + LLVMBuildICmp(ctx->builder, LLVMIntUGT, + lhs, rhs, ""), + lhs, rhs, ""); +} + +enum dpp_ctrl { + _dpp_quad_perm = 0x000, + _dpp_row_sl = 0x100, + _dpp_row_sr = 0x110, + _dpp_row_rr = 0x120, + dpp_wf_sl1 = 0x130, + dpp_wf_rl1 = 0x134, + dpp_wf_sr1 = 0x138, + dpp_wf_rr1 = 0x13C, + dpp_row_mirror = 0x140, + dpp_row_half_mirror = 0x141, + dpp_row_bcast15 = 0x142, + dpp_row_bcast31 = 0x143 +}; + +static inline enum dpp_ctrl +dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3) +{ + assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4); + return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6); +} + +static inline enum dpp_ctrl +dpp_row_sl(unsigned amount) +{ + assert(amount > 0 && amount < 16); + return _dpp_row_sl | amount; +} + +static inline enum dpp_ctrl +dpp_row_sr(unsigned amount) +{ + assert(amount > 0 && amount < 16); + return _dpp_row_sr | amount; +} + +static LLVMValueRef +_ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src, + enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask, + bool bound_ctrl) +{ + return ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32", + LLVMTypeOf(old), (LLVMValueRef[]) { + old, src, + LLVMConstInt(ctx->i32, dpp_ctrl, 0), + LLVMConstInt(ctx->i32, row_mask, 0), + LLVMConstInt(ctx->i32, bank_mask, 0), + LLVMConstInt(ctx->i1, bound_ctrl, 0) }, + 6, AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); +} + +static LLVMValueRef +ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src, + enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask, + bool bound_ctrl) +{ + LLVMTypeRef src_type = LLVMTypeOf(src); + src = ac_to_integer(ctx, src); + old = 
ac_to_integer(ctx, old); + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); + LLVMValueRef ret; + if (bits == 32) { + ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, + bank_mask, bound_ctrl); + } else { + assert(bits % 32 == 0); + LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); + LLVMValueRef src_vector = + LLVMBuildBitCast(ctx->builder, src, vec_type, ""); + LLVMValueRef old_vector = + LLVMBuildBitCast(ctx->builder, old, vec_type, ""); + ret = LLVMGetUndef(vec_type); + for (unsigned i = 0; i < bits / 32; i++) { + src = LLVMBuildExtractElement(ctx->builder, src_vector, + LLVMConstInt(ctx->i32, i, + 0), ""); + old = LLVMBuildExtractElement(ctx->builder, old_vector, + LLVMConstInt(ctx->i32, i, + 0), ""); + LLVMValueRef ret_comp = _ac_build_dpp(ctx, old, src, + dpp_ctrl, + row_mask, + bank_mask, + bound_ctrl); + ret = LLVMBuildInsertElement(ctx->builder, ret, + ret_comp, + LLVMConstInt(ctx->i32, i, + 0), ""); + } + } + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); +} + +static LLVMValueRef +_ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, + LLVMValueRef lane) +{ + return ac_build_intrinsic(ctx, "llvm.amdgcn.readlane", + LLVMTypeOf(src), (LLVMValueRef []) { + src, lane }, + 2, AC_FUNC_ATTR_NOUNWIND | + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); +} + +static LLVMValueRef +ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, + LLVMValueRef lane) +{ + LLVMTypeRef src_type = LLVMTypeOf(src); + src = ac_to_integer(ctx, src); + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); + LLVMValueRef ret; + if (bits == 32) { + ret = _ac_build_readlane(ctx, src, lane); + } else { + assert(bits % 32 == 0); + LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); + LLVMValueRef src_vector = + LLVMBuildBitCast(ctx->builder, src, vec_type, ""); + ret = LLVMGetUndef(vec_type); + for (unsigned i = 0; i < bits / 32; i++) { + src = LLVMBuildExtractElement(ctx->builder, src_vector, + LLVMConstInt(ctx->i32, i, + 
0), ""); + LLVMValueRef ret_comp = _ac_build_readlane(ctx, src, + lane); + ret = LLVMBuildInsertElement(ctx->builder, ret, + ret_comp, + LLVMConstInt(ctx->i32, i, + 0), ""); + } + } + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); +} + +static LLVMValueRef +_ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, + unsigned mask) +{ + return ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", + LLVMTypeOf(src), (LLVMValueRef []) { + src, LLVMConstInt(ctx->i32, mask, 0) }, + 2, AC_FUNC_ATTR_NOUNWIND | + AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); +} + +static LLVMValueRef +ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, + unsigned mask) +{ + LLVMTypeRef src_type = LLVMTypeOf(src); + src = ac_to_integer(ctx, src); + unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src)); + LLVMValueRef ret; + if (bits == 32) { + ret = _ac_build_ds_swizzle(ctx, src, mask); + } else { + assert(bits % 32 == 0); + LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); + LLVMValueRef src_vector = + LLVMBuildBitCast(ctx->builder, src, vec_type, ""); + ret = LLVMGetUndef(vec_type); + for (unsigned i = 0; i < bits / 32; i++) { + src = LLVMBuildExtractElement(ctx->builder, src_vector, + LLVMConstInt(ctx->i32, i, + 0), ""); + LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src, + mask); + ret = LLVMBuildInsertElement(ctx->builder, ret, + ret_comp, + LLVMConstInt(ctx->i32, i, + 0), ""); + } + } + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); +} + +static LLVMValueRef +ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src, + LLVMValueRef inactive) +{ + char name[32], type[8]; + LLVMTypeRef src_type = LLVMTypeOf(src); + src = ac_to_integer(ctx, src); + inactive = ac_to_integer(ctx, inactive); + ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type)); + snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type); + LLVMValueRef ret = + ac_build_intrinsic(ctx, name, + LLVMTypeOf(src), (LLVMValueRef []) { + 
src, inactive }, 2, + AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | + AC_FUNC_ATTR_CONVERGENT); + return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); +} + +static LLVMValueRef +ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src) +{ + char name[32], type[8]; + ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type)); + snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type); + return ac_build_intrinsic(ctx, name, LLVMTypeOf(src), + (LLVMValueRef []) { src }, 1, + AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE); +} + +LLVMValueRef +ac_build_subgroup_inclusive_scan(struct ac_llvm_context *ctx, + LLVMValueRef src, + ac_reduce_op reduce, + LLVMValueRef identity) +{ + /* See http://gpuopen.com/amd-gcn-assembly-cross-lane-operations/ + * + * Note that each dpp/reduce pair is supposed to be compiled down to + * one instruction by LLVM, at least for 32-bit values. + * + * TODO: use @llvm.amdgcn.ds.swizzle on SI and CI + */ + LLVMValueRef value = src; + value = reduce(ctx, value, + ac_build_dpp(ctx, identity, src, + dpp_row_sr(1), 0xf, 0xf, false)); + value = reduce(ctx, value, + ac_build_dpp(ctx, identity, src, + dpp_row_sr(2), 0xf, 0xf, false)); + value = reduce(ctx, value, + ac_build_dpp(ctx, identity, src, + dpp_row_sr(3), 0xf, 0xf, false)); + value = reduce(ctx, value, + ac_build_dpp(ctx, identity, value, + dpp_row_sr(4), 0xf, 0xe, false)); + value = reduce(ctx, value, + ac_build_dpp(ctx, identity, value, + dpp_row_sr(8), 0xf, 0xc, false)); + value = reduce(ctx, value, + ac_build_dpp(ctx, identity, value, + dpp_row_bcast15, 0xa, 0xf, false)); + value = reduce(ctx, value, + ac_build_dpp(ctx, identity, value, + dpp_row_bcast31, 0xc, 0xf, false)); + return value; +} + +LLVMValueRef +ac_build_subgroup_inclusive_scan_nonuniform(struct ac_llvm_context *ctx, + LLVMValueRef value, + ac_reduce_op reduce, + LLVMValueRef identity) +{ + ac_build_optimization_barrier(ctx, &value); + value = ac_build_set_inactive(ctx, value, identity); + value = 
ac_build_subgroup_inclusive_scan(ctx, value, reduce, identity); + return ac_build_wwm(ctx, value); +} + + +LLVMValueRef +ac_build_subgroup_reduce(struct ac_llvm_context *ctx, LLVMValueRef value, + ac_reduce_op reduce, LLVMValueRef identity) +{ + + value = ac_build_set_inactive(ctx, value, identity); + value = ac_build_subgroup_inclusive_scan(ctx, value, reduce, identity); + value = ac_build_readlane(ctx, value, LLVMConstInt(ctx->i32, 63, 0)); + return ac_build_wwm(ctx, value); +} + +LLVMValueRef +ac_build_subgroup_reduce_nonuniform(struct ac_llvm_context *ctx, + LLVMValueRef value, + ac_reduce_op reduce, + LLVMValueRef identity) +{ + ac_build_optimization_barrier(ctx, &value); + return ac_build_subgroup_reduce(ctx, value, reduce, identity); +} + +LLVMValueRef +ac_build_subgroup_exclusive_scan(struct ac_llvm_context *ctx, + LLVMValueRef value, + ac_reduce_op reduce, + LLVMValueRef identity) +{ + value = ac_build_dpp(ctx, identity, value, dpp_wf_sr1, 0xf, 0xf, false); + return ac_build_subgroup_inclusive_scan(ctx, value, reduce, identity); +} + +LLVMValueRef +ac_build_subgroup_exclusive_scan_nonuniform(struct ac_llvm_context *ctx, + LLVMValueRef value, + ac_reduce_op reduce, + LLVMValueRef identity) +{ + ac_build_optimization_barrier(ctx, &value); + value = ac_build_set_inactive(ctx, value, identity); + value = ac_build_subgroup_exclusive_scan(ctx, value, reduce, identity); + return ac_build_wwm(ctx, value); +} + +LLVMValueRef +ac_build_swizzle_quad(struct ac_llvm_context *ctx, LLVMValueRef src, + unsigned swizzle_mask) +{ + ac_build_optimization_barrier(ctx, &src); + /* TODO: use @llvm.amdgcn.ds.swizzle on SI and CI */ + return ac_build_dpp(ctx, LLVMGetUndef(LLVMTypeOf(src)), src, + dpp_quad_perm(swizzle_mask & 0x3, + (swizzle_mask >> 2) & 0x3, + (swizzle_mask >> 4) & 0x3, + (swizzle_mask >> 6) & 0x3), + 0xf, 0xf, /*bound_ctrl:0*/ true); +} + +LLVMValueRef +ac_build_swizzle_masked(struct ac_llvm_context *ctx, LLVMValueRef src, + unsigned swizzle_mask) +{ + 
ac_build_optimization_barrier(ctx, &src); + /* TODO: For some special mask values, we could use DPP instead on VI+. + * We might be able to use DPP entirely, but it would be a little + * tricky. + */ + return ac_build_ds_swizzle(ctx, src, swizzle_mask); +} + +LLVMValueRef +ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, + LLVMValueRef write, LLVMValueRef lane) +{ + /* TODO: Use the actual instruction when LLVM adds an intrinsic for it. + */ + LLVMValueRef pred = LLVMBuildICmp(ctx->builder, LLVMIntEQ, lane, + ac_get_thread_id(ctx), ""); + return LLVMBuildSelect(ctx->builder, pred, write, src, ""); +} + +LLVMValueRef +ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask) +{ + LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, + LLVMVectorType(ctx->i32, 2), + ""); + LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, + ctx->i32_0, ""); + LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, + ctx->i32_1, ""); + LLVMValueRef val = + ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, + (LLVMValueRef []) { mask_lo, ctx->i32_0 }, + 2, AC_FUNC_ATTR_READNONE); + val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, + (LLVMValueRef []) { mask_hi, val }, + 2, AC_FUNC_ATTR_READNONE); + return val; +} + +/* return true for exactly one thread in the subgroup/wavefront */ + +static LLVMValueRef +ac_build_subgroup_elect(struct ac_llvm_context *ctx) +{ + LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1); + /* mbcnt(EXEC) returns the number of active threads with ID less than + * ours, so the lowest thread will return 0. 
+ */ + LLVMValueRef active_tid = ac_build_mbcnt(ctx, active_set); + return LLVMBuildICmp(ctx->builder, LLVMIntEQ, active_tid, ctx->i32_0, + ""); +} + +static LLVMValueRef +ac_build_subgroup_elect_uniform(struct ac_llvm_context *ctx) +{ + return LLVMBuildICmp(ctx->builder, LLVMIntEQ, ac_get_thread_id(ctx), + ctx->i32_0, ""); +} + +#define LOCAL_ADDR_SPACE 3 + +static LLVMValueRef +get_shared_temp(struct ac_llvm_context *ctx, + LLVMTypeRef type, + unsigned max_workgroup_size) +{ + /* TODO only make one variable and share it */ + return LLVMAddGlobalInAddressSpace( + ctx->module, + LLVMArrayType(type, DIV_ROUND_UP(max_workgroup_size, 64)), + "reduce_temp", LOCAL_ADDR_SPACE); +} + +/* given an array of values, emit code to reduce them to a single value using a + * given operator. Note that this isn't cross-thread at all; it's just normal + * LLVM code. + */ +static LLVMValueRef +reduce_array(struct ac_llvm_context *ctx, LLVMValueRef array, + ac_reduce_op reduce) +{ + unsigned size = LLVMGetArrayLength(LLVMTypeOf(array)); + assert(size > 0); + if (size == 1) + return LLVMBuildExtractValue(ctx->builder, array, 0, ""); + + LLVMTypeRef elem_type = LLVMGetElementType(LLVMTypeOf(array)); + + unsigned left_size = size / 2; + LLVMValueRef left = LLVMGetUndef(LLVMArrayType(elem_type, left_size)); + for (unsigned i = 0; i < left_size; i++) { + LLVMValueRef val = LLVMBuildExtractValue(ctx->builder, array, + i, ""); + left = LLVMBuildInsertValue(ctx->builder, left, val, i, ""); + } + left = reduce_array(ctx, left, reduce); + + unsigned right_size = size - left_size; + LLVMValueRef right = LLVMGetUndef(LLVMArrayType(elem_type, right_size)); + for (unsigned i = 0; i < right_size; i++) { + LLVMValueRef val = LLVMBuildExtractValue(ctx->builder, array, + i + left_size, ""); + right = LLVMBuildInsertValue(ctx->builder, right, val, i, ""); + } + right = reduce_array(ctx, right, reduce); + + return reduce(ctx, left, right); +} + +static LLVMValueRef +_ac_build_group_reduce(struct 
ac_llvm_context *ctx, + LLVMValueRef value, ac_reduce_op reduce, + LLVMValueRef identity, bool exclusive_scan, + bool uniform, + unsigned max_workgroup_size, + LLVMValueRef wavefront_id) +{ + if (max_workgroup_size <= 64) { + if (exclusive_scan) + return identity; + else + return value; + } + + /* Allocate some temporary storage, one value for each wavefront. */ + LLVMValueRef shared = get_shared_temp(ctx, LLVMTypeOf(value), + max_workgroup_size); + + LLVMValueRef func = + LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder)); + LLVMBasicBlockRef if_block = + LLVMAppendBasicBlockInContext(ctx->context, func, ""); + LLVMBasicBlockRef merge_block = + LLVMAppendBasicBlockInContext(ctx->context, func, ""); + + /* gather the subgroup-reduced values from each buffer into shared */ + + LLVMBuildCondBr(ctx->builder, + (uniform ? ac_build_subgroup_elect_uniform : + ac_build_subgroup_elect)(ctx), + if_block, merge_block); + /* if (subgroup_elect()) */ + { + LLVMPositionBuilderAtEnd(ctx->builder, if_block); + LLVMValueRef ptr = ac_build_gep0(ctx, shared, wavefront_id); + LLVMBuildStore(ctx->builder, value, ptr); + LLVMBuildBr(ctx->builder, merge_block); + } + + LLVMPositionBuilderAtEnd(ctx->builder, merge_block); + + ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, + AC_FUNC_ATTR_CONVERGENT); + + /* For each wavefront, load every other wavefront's values from the + * previous stage. 
+ */ + LLVMValueRef array = LLVMBuildLoad(ctx->builder, shared, ""); + + if (exclusive_scan) { + /* mask out values from wavefronts greater than or equal to + * ours, to implement exclusive scan + */ + for (unsigned i = 0; 64 * i < max_workgroup_size; i++) { + LLVMValueRef wf_value = + LLVMBuildExtractValue(ctx->builder, array, i, + ""); + LLVMValueRef pred = + LLVMBuildICmp(ctx->builder, LLVMIntULT, + LLVMConstInt(ctx->i32, i, 0), + wavefront_id, + ""); + wf_value = LLVMBuildSelect(ctx->builder, pred, + wf_value, identity, ""); + array = LLVMBuildInsertValue(ctx->builder, array, + wf_value, i, ""); + } + } + + /* finally, manually reduce the values from each wavefront without any + * cross-thread tricks. + */ + return reduce_array(ctx, array, reduce); +} + +LLVMValueRef +ac_build_group_reduce(struct ac_llvm_context *ctx, + LLVMValueRef value, ac_reduce_op reduce, + LLVMValueRef identity, + unsigned max_workgroup_size, + LLVMValueRef wavefront_id) +{ + value = ac_build_subgroup_reduce(ctx, value, reduce, identity); + return _ac_build_group_reduce(ctx, value, reduce, identity, false, + true, max_workgroup_size, wavefront_id); +} + +LLVMValueRef +ac_build_group_reduce_nonuniform(struct ac_llvm_context *ctx, + LLVMValueRef value, ac_reduce_op reduce, + LLVMValueRef identity, + unsigned max_workgroup_size, + LLVMValueRef wavefront_id) +{ + value = ac_build_subgroup_reduce_nonuniform(ctx, value, reduce, + identity); + return _ac_build_group_reduce(ctx, value, reduce, identity, false, + false, max_workgroup_size, wavefront_id); +} + +LLVMValueRef +ac_build_group_exclusive_scan(struct ac_llvm_context *ctx, + LLVMValueRef value, ac_reduce_op reduce, + LLVMValueRef identity, + unsigned max_workgroup_size, + LLVMValueRef wavefront_id) +{ + /* Do the exclusive scan per-wavefront, and at the same time calculate + * the fully-reduced value for doing the overall exclusive scan. 
+ */ + value = ac_build_set_inactive(ctx, value, identity); + value = ac_build_subgroup_inclusive_scan(ctx, value, reduce, identity); + LLVMValueRef reduced = ac_build_readlane(ctx, value, + LLVMConstInt(ctx->i32, 63, + 0)); + value = ac_build_dpp(ctx, identity, value, dpp_wf_sr1, 0xf, 0xf, + false); + reduced = ac_build_wwm(ctx, reduced); + value = ac_build_wwm(ctx, value); + reduced = _ac_build_group_reduce(ctx, reduced, reduce, identity, true, + true, max_workgroup_size, + wavefront_id); + return reduce(ctx, value, reduced); +} + +LLVMValueRef +ac_build_group_exclusive_scan_nonuniform(struct ac_llvm_context *ctx, + LLVMValueRef value, + ac_reduce_op reduce, + LLVMValueRef identity, + unsigned max_workgroup_size, + LLVMValueRef wavefront_id) +{ + ac_build_optimization_barrier(ctx, &value); + /* Do the exclusive scan per-wavefront, and at the same time calculate + * the fully-reduced value for doing the overall exclusive scan. + */ + value = ac_build_set_inactive(ctx, value, identity); + value = ac_build_subgroup_inclusive_scan(ctx, value, reduce, identity); + LLVMValueRef reduced = ac_build_readlane(ctx, value, + LLVMConstInt(ctx->i32, 63, + 0)); + value = ac_build_dpp(ctx, identity, value, dpp_wf_sr1, 0xf, 0xf, + false); + reduced = ac_build_wwm(ctx, reduced); + value = ac_build_wwm(ctx, value); + reduced = _ac_build_group_reduce(ctx, reduced, reduce, identity, true, + false, max_workgroup_size, + wavefront_id); + return reduce(ctx, value, reduced); +} + +LLVMValueRef +ac_build_group_inclusive_scan(struct ac_llvm_context *ctx, + LLVMValueRef value, ac_reduce_op reduce, + LLVMValueRef identity, + unsigned max_workgroup_size, + LLVMValueRef wavefront_id) +{ + /* Do the inclusive scan per-wavefront, and at the same time calculate + * the fully-reduced value for doing the overall exclusive scan. 
+ */ + value = ac_build_set_inactive(ctx, value, identity); + value = ac_build_subgroup_inclusive_scan(ctx, value, reduce, identity); + LLVMValueRef reduced = ac_build_readlane(ctx, value, + LLVMConstInt(ctx->i32, 63, + 0)); + reduced = ac_build_wwm(ctx, reduced); + value = ac_build_wwm(ctx, value); + reduced = _ac_build_group_reduce(ctx, reduced, reduce, identity, true, + true, max_workgroup_size, + wavefront_id); + return reduce(ctx, value, reduced); +} + +LLVMValueRef +ac_build_group_inclusive_scan_nonuniform(struct ac_llvm_context *ctx, + LLVMValueRef value, + ac_reduce_op reduce, + LLVMValueRef identity, + unsigned max_workgroup_size, + LLVMValueRef wavefront_id) +{ + ac_build_optimization_barrier(ctx, &value); + /* Do the inclusive scan per-wavefront, and at the same time calculate + * the fully-reduced value for doing the overall exclusive scan. + */ + value = ac_build_set_inactive(ctx, value, identity); + value = ac_build_subgroup_inclusive_scan(ctx, value, reduce, identity); + LLVMValueRef reduced = ac_build_readlane(ctx, value, + LLVMConstInt(ctx->i32, 63, + 0)); + reduced = ac_build_wwm(ctx, reduced); + value = ac_build_wwm(ctx, value); + reduced = _ac_build_group_reduce(ctx, reduced, reduce, identity, true, + false, max_workgroup_size, + wavefront_id); + return reduce(ctx, value, reduced); +} + LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values, diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h index 1d9850b..463f3a9 100644 --- a/src/amd/common/ac_llvm_build.h +++ b/src/amd/common/ac_llvm_build.h @@ -84,6 +84,19 @@ void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pvgpr); +LLVMValueRef +ac_build_swizzle_quad(struct ac_llvm_context *ctx, LLVMValueRef src, + unsigned swizzle_mask); + +LLVMValueRef +ac_build_swizzle_masked(struct ac_llvm_context *ctx, LLVMValueRef src, + unsigned swizzle_mask); + +LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, 
LLVMValueRef src, + LLVMValueRef write, LLVMValueRef lane); + +LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask); + LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value); LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value); @@ -92,6 +105,108 @@ LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value); LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value); +typedef LLVMValueRef (*ac_reduce_op)(struct ac_llvm_context *ctx, LLVMValueRef lhs, + LLVMValueRef rhs); + +LLVMValueRef ac_reduce_iadd(struct ac_llvm_context *ctx, LLVMValueRef lhs, + LLVMValueRef rhs); + +LLVMValueRef ac_reduce_fadd(struct ac_llvm_context *ctx, LLVMValueRef lhs, + LLVMValueRef rhs); + +LLVMValueRef ac_reduce_fmin(struct ac_llvm_context *ctx, LLVMValueRef lhs, + LLVMValueRef rhs); + +LLVMValueRef ac_reduce_fmax(struct ac_llvm_context *ctx, LLVMValueRef lhs, + LLVMValueRef rhs); + +LLVMValueRef ac_reduce_imax(struct ac_llvm_context *ctx, LLVMValueRef lhs, + LLVMValueRef rhs); + +LLVMValueRef ac_reduce_umax(struct ac_llvm_context *ctx, LLVMValueRef lhs, + LLVMValueRef rhs); + +LLVMValueRef ac_reduce_fmin(struct ac_llvm_context *ctx, LLVMValueRef lhs, + LLVMValueRef rhs); + +LLVMValueRef ac_reduce_imin(struct ac_llvm_context *ctx, LLVMValueRef lhs, + LLVMValueRef rhs); + +LLVMValueRef ac_reduce_umin(struct ac_llvm_context *ctx, LLVMValueRef lhs, + LLVMValueRef rhs); + +LLVMValueRef ac_build_subgroup_reduce(struct ac_llvm_context *ctx, + LLVMValueRef value, + ac_reduce_op reduce, + LLVMValueRef identity); + +LLVMValueRef ac_build_subgroup_inclusive_scan(struct ac_llvm_context *ctx, + LLVMValueRef value, + ac_reduce_op reduce, + LLVMValueRef identity); + +LLVMValueRef ac_build_subgroup_exclusive_scan(struct ac_llvm_context *ctx, + LLVMValueRef value, + ac_reduce_op reduce, + LLVMValueRef identity); + +LLVMValueRef ac_build_subgroup_reduce_nonuniform(struct ac_llvm_context 
*ctx, + LLVMValueRef value, + ac_reduce_op reduce, + LLVMValueRef identity); + +LLVMValueRef ac_build_subgroup_inclusive_scan_nonuniform(struct ac_llvm_context *ctx, + LLVMValueRef value, + ac_reduce_op reduce, + LLVMValueRef identity); + +LLVMValueRef ac_build_subgroup_exclusive_scan_nonuniform(struct ac_llvm_context *ctx, + LLVMValueRef value, + ac_reduce_op reduce, + LLVMValueRef identity); + +LLVMValueRef ac_build_group_reduce(struct ac_llvm_context *ctx, + LLVMValueRef value, + ac_reduce_op reduce, + LLVMValueRef identity, + unsigned max_workgroup_size, + LLVMValueRef wavefront_id); + +LLVMValueRef ac_build_group_inclusive_scan(struct ac_llvm_context *ctx, + LLVMValueRef value, + ac_reduce_op reduce, + LLVMValueRef identity, + unsigned max_workgroup_size, + LLVMValueRef wavefront_id); + +LLVMValueRef ac_build_group_exclusive_scan(struct ac_llvm_context *ctx, + LLVMValueRef value, + ac_reduce_op reduce, + LLVMValueRef identity, + unsigned max_workgroup_size, + LLVMValueRef wavefront_id); + +LLVMValueRef ac_build_group_reduce_nonuniform(struct ac_llvm_context *ctx, + LLVMValueRef value, + ac_reduce_op reduce, + LLVMValueRef identity, + unsigned max_workgroup_size, + LLVMValueRef wavefront_id); + +LLVMValueRef ac_build_group_inclusive_scan_nonuniform(struct ac_llvm_context *ctx, + LLVMValueRef value, + ac_reduce_op reduce, + LLVMValueRef identity, + unsigned max_workgroup_size, + LLVMValueRef wavefront_id); + +LLVMValueRef ac_build_group_exclusive_scan_nonuniform(struct ac_llvm_context *ctx, + LLVMValueRef value, + ac_reduce_op reduce, + LLVMValueRef identity, + unsigned max_workgroup_size, + LLVMValueRef wavefront_id); + LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values, -- 2.9.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev