arsenm updated this revision to Diff 493566.
arsenm added a comment.

Revert the implicitarg.ptr changes, since non-HSA uses a different alignment for no
real reason. Also, given the size differences between amdhsa and the different code
object versions, we're already wrong to emit 256 unconditionally.
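
For reference, a minimal sketch (drawn from the TableGen changes in the patch below) of the net effect on the intrinsic declarations: llvm.amdgcn.dispatch.ptr picks up the new Dereferenceable property, while llvm.amdgcn.implicitarg.ptr keeps only align 4 for now.

  // New intrinsic property (Intrinsics.td): attaches dereferenceable(bytes)
  // to the given argument/return index.
  class Dereferenceable<AttrIndex idx, int bytes> : IntrinsicProperty {
    int ArgNo = idx.Value;
    int Bytes = bytes;
  }

  // Use on dispatch.ptr (IntrinsicsAMDGPU.td): the amdhsa dispatch packet is
  // 64 bytes, so the returned pointer is dereferenceable(64) at the declaration.
  def int_amdgcn_dispatch_ptr :
    ClangBuiltin<"__builtin_amdgcn_dispatch_ptr">,
    DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
    [Align<RetIndex, 4>, Dereferenceable<RetIndex, 64>, IntrNoMem,
     IntrSpeculatable]>;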


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D142823/new/

https://reviews.llvm.org/D142823

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
  clang/test/CodeGenCUDA/builtins-amdgcn.cu
  clang/test/CodeGenOpenCL/builtins-amdgcn.cl
  llvm/include/llvm/IR/Intrinsics.td
  llvm/include/llvm/IR/IntrinsicsAMDGPU.td
  llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
  llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
  llvm/utils/TableGen/CodeGenIntrinsics.h
  llvm/utils/TableGen/CodeGenTarget.cpp
  llvm/utils/TableGen/IntrinsicEmitter.cpp

Index: llvm/utils/TableGen/IntrinsicEmitter.cpp
===================================================================
--- llvm/utils/TableGen/IntrinsicEmitter.cpp
+++ llvm/utils/TableGen/IntrinsicEmitter.cpp
@@ -726,6 +726,10 @@
           OS << "      Attribute::get(C, Attribute::Alignment, "
              << Attr.Value << "),\n";
           break;
+        case CodeGenIntrinsic::Dereferenceable:
+          OS << "      Attribute::get(C, Attribute::Dereferenceable, "
+             << Attr.Value << "),\n";
+          break;
         }
       }
       OS << "    });\n";
Index: llvm/utils/TableGen/CodeGenTarget.cpp
===================================================================
--- llvm/utils/TableGen/CodeGenTarget.cpp
+++ llvm/utils/TableGen/CodeGenTarget.cpp
@@ -923,6 +923,10 @@
     unsigned ArgNo = R->getValueAsInt("ArgNo");
     uint64_t Align = R->getValueAsInt("Align");
     addArgAttribute(ArgNo, Alignment, Align);
+  } else if (R->isSubClassOf("Dereferenceable")) {
+    unsigned ArgNo = R->getValueAsInt("ArgNo");
+    uint64_t Bytes = R->getValueAsInt("Bytes");
+    addArgAttribute(ArgNo, Dereferenceable, Bytes);
   } else
     llvm_unreachable("Unknown property!");
 }
Index: llvm/utils/TableGen/CodeGenIntrinsics.h
===================================================================
--- llvm/utils/TableGen/CodeGenIntrinsics.h
+++ llvm/utils/TableGen/CodeGenIntrinsics.h
@@ -119,7 +119,8 @@
     WriteOnly,
     ReadNone,
     ImmArg,
-    Alignment
+    Alignment,
+    Dereferenceable
   };
 
   struct ArgAttribute {
Index: llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
===================================================================
--- llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
+++ llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
@@ -1012,8 +1012,8 @@
 
   GISelKnownBits Info(*MF);
 
-  EXPECT_EQ(Align(4), Info.computeKnownAlignment(CopyDispatchPtr));
-  EXPECT_EQ(Align(4), Info.computeKnownAlignment(CopyQueuePtr));
+  EXPECT_EQ(Align(8), Info.computeKnownAlignment(CopyDispatchPtr));
+  EXPECT_EQ(Align(8), Info.computeKnownAlignment(CopyQueuePtr));
   EXPECT_EQ(Align(4), Info.computeKnownAlignment(CopyKernargSegmentPtr));
   EXPECT_EQ(Align(4), Info.computeKnownAlignment(CopyImplicitArgPtr));
   EXPECT_EQ(Align(4), Info.computeKnownAlignment(CopyImplicitBufferPtr));
Index: llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
+++ llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
@@ -47,7 +47,7 @@
 ; GCN-LABEL: @get_local_size_z(
 ; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 ; GCN-NEXT:    [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 16
-; GCN-NEXT:    [[LOCAL_SIZE:%.*]] = load i16, ptr addrspace(4) [[GEP_LOCAL_SIZE]], align 4
+; GCN-NEXT:    [[LOCAL_SIZE:%.*]] = load i16, ptr addrspace(4) [[GEP_LOCAL_SIZE]], align 8
 ; GCN-NEXT:    store i16 [[LOCAL_SIZE]], ptr addrspace(1) [[OUT:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
@@ -139,7 +139,7 @@
 ; GCN-LABEL: @get_work_group_size_z(
 ; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 ; GCN-NEXT:    [[GEP_Z:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 16
-; GCN-NEXT:    [[GROUP_SIZE_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 4
+; GCN-NEXT:    [[GROUP_SIZE_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 8
 ; GCN-NEXT:    store i16 [[GROUP_SIZE_Z]], ptr addrspace(1) [[OUT:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -141,8 +141,10 @@
                                <"__builtin_amdgcn_workgroup_id">;
 
 def int_amdgcn_dispatch_ptr :
+  ClangBuiltin<"__builtin_amdgcn_dispatch_ptr">,
   DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
-  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;
+  [Align<RetIndex, 4>, Dereferenceable<RetIndex, 64>, IntrNoMem,
+   IntrSpeculatable]>;
 
 def int_amdgcn_queue_ptr :
   ClangBuiltin<"__builtin_amdgcn_queue_ptr">,
@@ -154,6 +156,8 @@
   DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
   [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;
 
+// TODO: This is 8 for amdhsa. For others it's 4 for no real reason.
+// This should also be dereferenceable(256) for amdhsa COV5.
 def int_amdgcn_implicitarg_ptr :
   ClangBuiltin<"__builtin_amdgcn_implicitarg_ptr">,
   DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -94,6 +94,11 @@
   int Align = align;
 }
 
+class Dereferenceable<AttrIndex idx, int bytes> : IntrinsicProperty {
+  int ArgNo = idx.Value;
+  int Bytes = bytes;
+}
+
 // Returned - The specified argument is always the return value of the
 // intrinsic.
 class Returned<AttrIndex idx> : IntrinsicProperty {
Index: clang/test/CodeGenOpenCL/builtins-amdgcn.cl
===================================================================
--- clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -516,12 +516,15 @@
 }
 
 // CHECK-LABEL: @test_dispatch_ptr
-// CHECK: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// CHECK: call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
 void test_dispatch_ptr(__constant unsigned char ** out)
 {
   *out = __builtin_amdgcn_dispatch_ptr();
 }
 
+// CHECK: declare align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+
+
 // CHECK-LABEL: @test_queue_ptr
 // CHECK: call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
 void test_queue_ptr(__constant unsigned char ** out)
@@ -543,6 +546,9 @@
   *out = __builtin_amdgcn_implicitarg_ptr();
 }
 
+// CHECK: declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+
+
 // CHECK-LABEL: @test_get_group_id(
 // CHECK: tail call i32 @llvm.amdgcn.workgroup.id.x()
 // CHECK: tail call i32 @llvm.amdgcn.workgroup.id.y()
@@ -583,7 +589,7 @@
 }
 
 // CHECK-LABEL: @test_get_workgroup_size(
-// CHECK: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// CHECK: call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
 // CHECK: getelementptr i8, ptr addrspace(4) %{{.*}}, i64 4
 // CHECK: load i16, ptr addrspace(4) %{{.*}}, align 4, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
 // CHECK: getelementptr i8, ptr addrspace(4) %{{.*}}, i64 6
@@ -601,7 +607,7 @@
 }
 
 // CHECK-LABEL: @test_get_grid_size(
-// CHECK: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// CHECK: call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
 // CHECK: getelementptr i8, ptr addrspace(4) %{{.*}}, i64 12
 // CHECK: load i32, ptr addrspace(4) %{{.*}}, align 4, !invariant.load
 // CHECK: getelementptr i8, ptr addrspace(4) %{{.*}}, i64 16
Index: clang/test/CodeGenCUDA/builtins-amdgcn.cu
===================================================================
--- clang/test/CodeGenCUDA/builtins-amdgcn.cu
+++ clang/test/CodeGenCUDA/builtins-amdgcn.cu
@@ -21,7 +21,7 @@
 // CHECK-NEXT:    store ptr [[TMP0]], ptr [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// CHECK-NEXT:    [[TMP1:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
 // CHECK-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr
 // CHECK-NEXT:    store ptr [[TMP2]], ptr [[DISPATCH_PTR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DISPATCH_PTR_ASCAST]], align 8
@@ -154,7 +154,7 @@
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[X:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
-// CHECK-NEXT:    [[TMP0:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// CHECK-NEXT:    [[TMP0:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
 // CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
 // CHECK-NEXT:    store ptr [[TMP1]], ptr [[X_ASCAST]], align 8
 // CHECK-NEXT:    ret void
Index: clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
===================================================================
--- clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
+++ clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
@@ -1,16 +1,16 @@
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
 // RUN:     -fcuda-is-device -emit-llvm -o - -x hip %s \
-// RUN:     | FileCheck -check-prefix=PRECOV5 %s
+// RUN:     | FileCheck -check-prefixes=PRECOV5,CHECK %s
 
 
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
 // RUN:     -fcuda-is-device -mcode-object-version=5 -emit-llvm -o - -x hip %s \
-// RUN:     | FileCheck -check-prefix=COV5 %s
+// RUN:     | FileCheck -check-prefixes=COV5,CHECK %s
 
 #include "Inputs/cuda.h"
 
 // PRECOV5-LABEL: test_get_workgroup_size
-// PRECOV5: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// PRECOV5: call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
 // PRECOV5: getelementptr i8, ptr addrspace(4) %{{.*}}, i32 4
 // PRECOV5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
 // PRECOV5: getelementptr i8, ptr addrspace(4) %{{.*}}, i32 6
@@ -36,4 +36,7 @@
   }
 }
 
+// PRECOV5: declare align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// COV5: declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+
 // CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025}
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -16756,16 +16756,9 @@
                              const CallExpr *E = nullptr) {
   auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);
   auto *Call = CGF.Builder.CreateCall(F);
-  Call->addRetAttr(
-      Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
-  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
   if (!E)
     return Call;
-  QualType BuiltinRetType = E->getType();
-  auto *RetTy = cast<llvm::PointerType>(CGF.ConvertType(BuiltinRetType));
-  if (RetTy == Call->getType())
-    return Call;
-  return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
+  return CGF.Builder.CreateAddrSpaceCast(Call, CGF.ConvertType(E->getType()));
 }
 
 Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {