+ result = ac_build_scan(ctx, op, result, identity);
+
+ return ac_build_wwm(ctx, result);
+}
+
+LLVMValueRef
+ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op
op)
+{
+ ac_build_optimization_barrier(ctx, &src);
+ LLVMValueRef result;
+ LLVMValueRef identity = get_reduction_identity(ctx, op,
+
ac_get_type_size(LLVMTypeOf(src)));
+ result = LLVMBuildBitCast(ctx->builder,
+
ac_build_set_inactive(ctx, src, identity),
+ LLVMTypeOf(identity),
"");
+ result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf,
false);
+ result = ac_build_scan(ctx, op, result, identity);
+
+ return ac_build_wwm(ctx, result);
+}
+
+LLVMValueRef
+ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op,
unsigned cluster_size)
+{
+ if (cluster_size == 1) return src;
+ swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
+ result = ac_build_alu_op(ctx, result, swap, op);
+ if (cluster_size == 2) return ac_build_wwm(ctx, result);
+
+ swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
+ result = ac_build_alu_op(ctx, result, swap, op);
+ if (cluster_size == 4) return ac_build_wwm(ctx, result);
+
+ if (ctx->chip_class >= VI)
+ swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror,
0xf, 0xf, false);
+ else
+ swap = ac_build_ds_swizzle(ctx, result,
ds_pattern_bitmode(0x1f, 0, 0x04));
+ result = ac_build_alu_op(ctx, result, swap, op);
+ if (cluster_size == 8) return ac_build_wwm(ctx, result);
+
+ if (ctx->chip_class >= VI)
+ swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf,
0xf, false);
+ else
+ swap = ac_build_ds_swizzle(ctx, result,
ds_pattern_bitmode(0x1f, 0, 0x08));
+ result = ac_build_alu_op(ctx, result, swap, op);
+ if (cluster_size == 16) return ac_build_wwm(ctx, result);
+
+ if (ctx->chip_class >= VI && cluster_size != 32)
+ swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15,
0xa, 0xf, false);
+ else
+ swap = ac_build_ds_swizzle(ctx, result,
ds_pattern_bitmode(0x1f, 0, 0x10));
+ result = ac_build_alu_op(ctx, result, swap, op);
+ if (cluster_size == 32) return ac_build_wwm(ctx, result);
+
+ if (ctx->chip_class >= VI) {
+ swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31,
0xc, 0xf, false);
+ result = ac_build_alu_op(ctx, result, swap, op);
+ result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32,
63, 0));
+ return ac_build_wwm(ctx, result);
+ } else {
+ swap = ac_build_readlane(ctx, result, ctx->i32_0);
+ result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32,
32, 0));
+ result = ac_build_alu_op(ctx, result, swap, op);
+ return ac_build_wwm(ctx, result);
+ }
+}
+
+LLVMValueRef
+ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
+ unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
+{
+ unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
+ if (ctx->chip_class >= VI && HAVE_LLVM >= 0x0600) {
+ return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
+ } else {
+ return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
+ }
+}
+
+LLVMValueRef
+ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef
index)
+{
+ index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0),
"");
+ return ac_build_intrinsic(ctx,
+ "llvm.amdgcn.ds.bpermute", ctx->i32,
+ (LLVMValueRef []) {index, src}, 2,
+ AC_FUNC_ATTR_READNONE |
+ AC_FUNC_ATTR_CONVERGENT);
+}
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index 8b35028a31..44811ce6d9 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -27,7 +27,7 @@
#include <stdbool.h>
#include <llvm-c/TargetMachine.h>
-
+#include <nir/nir.h>
#include "amd_family.h"
#ifdef __cplusplus
@@ -417,6 +417,34 @@ LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx,
LLVMValueRef param,
void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask,
LLVMValueRef *addr, bool is_array_tex);
+LLVMValueRef
+ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned
mask);
+
+LLVMValueRef
+ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef
lane);
+
+LLVMValueRef
+ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef
value, LLVMValueRef lane);
+
+LLVMValueRef
+ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask);
+
+LLVMValueRef
+ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op
op);
+
+LLVMValueRef
+ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op
op);
+
+LLVMValueRef
+ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op,
unsigned cluster_size);
+
+LLVMValueRef
+ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
+ unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3);
+
+LLVMValueRef
+ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef
index);
+
#ifdef __cplusplus
}
#endif