On 10.06.2017 21:52, Bas Nieuwenhuizen wrote:
Slightly faster than bpermute, and it seems to be supported since at
least LLVM 3.9.

Signed-off-by: Bas Nieuwenhuizen <ba...@google.com>
---
  src/amd/common/ac_llvm_build.c | 78 +++++++++++++++++++++++++++++-------------
  1 file changed, 54 insertions(+), 24 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 237e9291d41..62a00f214de 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -789,44 +789,74 @@ ac_build_ddxy(struct ac_llvm_context *ctx,
              LLVMValueRef lds,
              LLVMValueRef val)
  {
-       LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, args[2];
+       LLVMValueRef thread_id, tl, trbl, args[5];
        LLVMValueRef result;
- thread_id = ac_get_thread_id(ctx);
-
-       tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
-                             LLVMConstInt(ctx->i32, mask, false), "");
+       /* bpermute is VI+, mov_dpp is VI+ too */
+       if (has_ds_bpermute) {
+               uint32_t tl_ctrl = 0, trbl_ctrl = 0;
- trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
-                               LLVMConstInt(ctx->i32, idx, false), "");
+               for (unsigned i = 0; i < 4; ++i) {
+                       tl_ctrl |= (i & mask) << (2 * i);
+                       trbl_ctrl |= ((i & mask) + idx) << (2 * i);
+               }
- if (has_ds_bpermute) {
-               args[0] = LLVMBuildMul(ctx->builder, tl_tid,
-                                      LLVMConstInt(ctx->i32, 4, false), "");
-               args[1] = val;
+               args[0] = val;
+               args[1] = LLVMConstInt(ctx->i32, tl_ctrl, false);
+               args[2] = LLVMConstInt(ctx->i32, 0xf, false);
+               args[3] = LLVMConstInt(ctx->i32, 0xf, false);
+               args[4] = LLVMConstInt(ctx->i1, 1, false);
                tl = ac_build_intrinsic(ctx,
-                                       "llvm.amdgcn.ds.bpermute", ctx->i32,
-                                       args, 2,
+                                       "llvm.amdgcn.mov.dpp.i32", ctx->i32,
+                                       args, 5,
                                        AC_FUNC_ATTR_READNONE |
                                        AC_FUNC_ATTR_CONVERGENT);
- args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
-                                      LLVMConstInt(ctx->i32, 4, false), "");
+               args[1] = LLVMConstInt(ctx->i32, trbl_ctrl, false);
                trbl = ac_build_intrinsic(ctx,
-                                         "llvm.amdgcn.ds.bpermute", ctx->i32,
-                                         args, 2,
+                                         "llvm.amdgcn.mov.dpp.i32", ctx->i32,
+                                         args, 5,
                                          AC_FUNC_ATTR_READNONE |
                                          AC_FUNC_ATTR_CONVERGENT);
        } else {
-               LLVMValueRef store_ptr, load_ptr0, load_ptr1;
+               LLVMValueRef tl_tid, trbl_tid;
+
+               thread_id = ac_get_thread_id(ctx);
+
+               tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
+                               LLVMConstInt(ctx->i32, mask, false), "");
+
+               trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
+                                       LLVMConstInt(ctx->i32, idx, false), "");
+
+               if (has_ds_bpermute) {

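This inner `if (has_ds_bpermute)` branch is dead now: we are already in the `else` branch of the outer `has_ds_bpermute` check, so the condition can never be true here.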

Apart from this, the new code looks good. Does LLVM already optimize that down to two VALU instructions by pulling the DPP into the v_add?

With the dead code removed:

Reviewed-by: Nicolai Hähnle <nicolai.haeh...@amd.com>


+                       args[0] = LLVMBuildMul(ctx->builder, tl_tid,
+                                       LLVMConstInt(ctx->i32, 4, false), "");
+                       args[1] = val;
+                       tl = ac_build_intrinsic(ctx,
+                                               "llvm.amdgcn.ds.bpermute", ctx->i32,
+                                               args, 2,
+                                               AC_FUNC_ATTR_READNONE |
+                                               AC_FUNC_ATTR_CONVERGENT);
+
+                       args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
+                                       LLVMConstInt(ctx->i32, 4, false), "");
+                       trbl = ac_build_intrinsic(ctx,
+                                               "llvm.amdgcn.ds.bpermute", ctx->i32,
+                                               args, 2,
+                                               AC_FUNC_ATTR_READNONE |
+                                               AC_FUNC_ATTR_CONVERGENT);
+               } else {
+                       LLVMValueRef store_ptr, load_ptr0, load_ptr1;
- store_ptr = ac_build_gep0(ctx, lds, thread_id);
-               load_ptr0 = ac_build_gep0(ctx, lds, tl_tid);
-               load_ptr1 = ac_build_gep0(ctx, lds, trbl_tid);
+                       store_ptr = ac_build_gep0(ctx, lds, thread_id);
+                       load_ptr0 = ac_build_gep0(ctx, lds, tl_tid);
+                       load_ptr1 = ac_build_gep0(ctx, lds, trbl_tid);
- LLVMBuildStore(ctx->builder, val, store_ptr);
-               tl = LLVMBuildLoad(ctx->builder, load_ptr0, "");
-               trbl = LLVMBuildLoad(ctx->builder, load_ptr1, "");
+                       LLVMBuildStore(ctx->builder, val, store_ptr);
+                       tl = LLVMBuildLoad(ctx->builder, load_ptr0, "");
+                       trbl = LLVMBuildLoad(ctx->builder, load_ptr1, "");
+               }
        }
tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");



--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to