[PATCH] D36327: [OpenCL] Allow targets emit optimized pipe functions for power of 2 type sizes

Yaxun Liu via Phabricator via cfe-commits Fri, 04 Aug 2017 09:52:35 -0700

yaxunl created this revision.
Herald added a subscriber: tpr.

Currently Clang emits a call to __read_pipe_2 or __read_pipe_4 for the OpenCL
read_pipe builtin, with the type size and alignment appended as extra
arguments; the 2 or 4 in the name indicates the original number of arguments.


For certain targets (e.g. amdgpu), there are optimized versions of
__read_pipe_2/__read_pipe_4 for the case where the type size and alignment
are the same power-of-2 value. It is desirable for Clang to emit a different
function for these cases.

This patch lets Clang emit __read_pipe_2_N for such cases, where N is the size
in bytes of the type (N = 1, 2, 4, 8, ..., 128), so that the target runtime
can use an optimized version of read_pipe.

The same applies to __read_pipe_4, __write_pipe_2 and __write_pipe_4.

This optimization is controlled by
TargetCodeGenInfo::hasOptimizedOpenCLPipeBuiltin, which returns false by
default. Each target can override this function to turn on this optimization.


https://reviews.llvm.org/D36327

Files:
  lib/CodeGen/CGBuiltin.cpp
  lib/CodeGen/TargetInfo.cpp
  lib/CodeGen/TargetInfo.h
  test/CodeGenOpenCL/pipe_builtin.cl

Index: test/CodeGenOpenCL/pipe_builtin.cl
===================================================================
--- test/CodeGenOpenCL/pipe_builtin.cl
+++ test/CodeGenOpenCL/pipe_builtin.cl
@@ -1,73 +1,90 @@
-// RUN: %clang_cc1 -emit-llvm -cl-ext=+cl_khr_subgroups -O0 -cl-std=CL2.0 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -cl-ext=+cl_khr_subgroups -O0 -cl-std=CL2.0 -o - %s | FileCheck -check-prefixes=CHECK,NAMD %s
+// RUN: %clang_cc1 -triple amdgcn---amdgizcl -emit-llvm -cl-ext=+cl_khr_subgroups -O0 -cl-std=CL2.0 -o - %s | FileCheck -check-prefixes=CHECK,AMD %s
 
 // CHECK: %opencl.pipe_t = type opaque
 // CHECK: %opencl.reserve_id_t = type opaque
 
 #pragma OPENCL EXTENSION cl_khr_subgroups : enable
 
+typedef struct {
+  int x[100];
+} S;
+
+typedef long long2 __attribute__((ext_vector_type(2)));
+typedef long long3 __attribute__((ext_vector_type(3)));
+typedef long long4 __attribute__((ext_vector_type(4)));
+typedef long long8 __attribute__((ext_vector_type(8)));
+typedef long long16 __attribute__((ext_vector_type(16)));
+
 void test1(read_only pipe int p, global int *ptr) {
-  // CHECK: call i32 @__read_pipe_2(%opencl.pipe_t* %{{.*}}, i8* %{{.*}}, i32 4, i32 4)
+  // NAMD: call i32 @__read_pipe_2(%opencl.pipe_t* %{{.*}}, i8* %{{.*}}, i32 4, i32 4)
+  // AMD: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32* %{{.*}})
   read_pipe(p, ptr);
-  // CHECK: call %opencl.reserve_id_t* @__reserve_read_pipe(%opencl.pipe_t* %{{.*}}, i32 {{.*}}, i32 4, i32 4)
+  // CHECK: call %opencl.reserve_id_t* @__reserve_read_pipe(%opencl.pipe_t{{.*}}* %{{.*}}, i32 {{.*}}, i32 4, i32 4)
   reserve_id_t rid = reserve_read_pipe(p, 2);
-  // CHECK: call i32 @__read_pipe_4(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 {{.*}}, i8* %{{.*}}, i32 4, i32 4)
+  // NAMD: call i32 @__read_pipe_4(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 {{.*}}, i8* %{{.*}}, i32 4, i32 4)
+  // AMD: call i32 @__read_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 {{.*}}, i32* %{{.*}})
   read_pipe(p, rid, 2, ptr);
-  // CHECK: call void @__commit_read_pipe(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 4, i32 4)
+  // CHECK: call void @__commit_read_pipe(%opencl.pipe_t{{.*}}* %{{.*}}, %opencl.reserve_id_t{{.*}}* %{{.*}}, i32 4, i32 4)
   commit_read_pipe(p, rid);
 }
 
 void test2(write_only pipe int p, global int *ptr) {
-  // CHECK: call i32 @__write_pipe_2(%opencl.pipe_t* %{{.*}}, i8* %{{.*}}, i32 4, i32 4)
+  // NAMD: call i32 @__write_pipe_2(%opencl.pipe_t* %{{.*}}, i8* %{{.*}}, i32 4, i32 4)
+  // AMD: call i32 @__write_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32* %{{.*}})
   write_pipe(p, ptr);
-  // CHECK: call %opencl.reserve_id_t* @__reserve_write_pipe(%opencl.pipe_t* %{{.*}}, i32 {{.*}}, i32 4, i32 4)
+  // CHECK: call %opencl.reserve_id_t* @__reserve_write_pipe(%opencl.pipe_t{{.*}}* %{{.*}}, i32 {{.*}}, i32 4, i32 4)
   reserve_id_t rid = reserve_write_pipe(p, 2);
-  // CHECK: call i32 @__write_pipe_4(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 {{.*}}, i8* %{{.*}}, i32 4, i32 4)
+  // NAMD: call i32 @__write_pipe_4(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 {{.*}}, i8* %{{.*}}, i32 4, i32 4)
+  // AMD: call i32 @__write_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 {{.*}}, i32* %{{.*}})
   write_pipe(p, rid, 2, ptr);
-  // CHECK: call void @__commit_write_pipe(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 4, i32 4)
+  // CHECK: call void @__commit_write_pipe(%opencl.pipe_t{{.*}}* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 4, i32 4)
   commit_write_pipe(p, rid);
 }
 
 void test3(read_only pipe int p, global int *ptr) {
-  // CHECK: call %opencl.reserve_id_t* @__work_group_reserve_read_pipe(%opencl.pipe_t* %{{.*}}, i32 {{.*}}, i32 4, i32 4)
+  // CHECK: call %opencl.reserve_id_t* @__work_group_reserve_read_pipe(%opencl.pipe_t{{.*}}* %{{.*}}, i32 {{.*}}, i32 4, i32 4)
   reserve_id_t rid = work_group_reserve_read_pipe(p, 2);
-  // CHECK: call void @__work_group_commit_read_pipe(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 4, i32 4)
+  // CHECK: call void @__work_group_commit_read_pipe(%opencl.pipe_t{{.*}}* %{{.*}}, %opencl.reserve_id_t{{.*}}* %{{.*}}, i32 4, i32 4)
   work_group_commit_read_pipe(p, rid);
 }
 
 void test4(write_only pipe int p, global int *ptr) {
-  // CHECK: call %opencl.reserve_id_t* @__work_group_reserve_write_pipe(%opencl.pipe_t* %{{.*}}, i32 {{.*}}, i32 4, i32 4)
+  // CHECK: call %opencl.reserve_id_t* @__work_group_reserve_write_pipe(%opencl.pipe_t{{.*}}* %{{.*}}, i32 {{.*}}, i32 4, i32 4)
   reserve_id_t rid = work_group_reserve_write_pipe(p, 2);
-  // CHECK: call void @__work_group_commit_write_pipe(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 4, i32 4)
+  // CHECK: call void @__work_group_commit_write_pipe(%opencl.pipe_t{{.*}}* %{{.*}}, %opencl.reserve_id_t{{.*}}* %{{.*}}, i32 4, i32 4)
   work_group_commit_write_pipe(p, rid);
 }
 
 void test5(read_only pipe int p, global int *ptr) {
-  // CHECK: call %opencl.reserve_id_t* @__sub_group_reserve_read_pipe(%opencl.pipe_t* %{{.*}}, i32 {{.*}}, i32 4, i32 4)
+  // CHECK: call %opencl.reserve_id_t* @__sub_group_reserve_read_pipe(%opencl.pipe_t{{.*}}* %{{.*}}, i32 {{.*}}, i32 4, i32 4)
   reserve_id_t rid = sub_group_reserve_read_pipe(p, 2);
-  // CHECK: call void @__sub_group_commit_read_pipe(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 4, i32 4)
+  // CHECK: call void @__sub_group_commit_read_pipe(%opencl.pipe_t{{.*}}* %{{.*}}, %opencl.reserve_id_t{{.*}}* %{{.*}}, i32 4, i32 4)
   sub_group_commit_read_pipe(p, rid);
 }
 
 void test6(write_only pipe int p, global int *ptr) {
-  // CHECK: call %opencl.reserve_id_t* @__sub_group_reserve_write_pipe(%opencl.pipe_t* %{{.*}}, i32 {{.*}}, i32 4, i32 4)
+  // CHECK: call %opencl.reserve_id_t* @__sub_group_reserve_write_pipe(%opencl.pipe_t{{.*}}* %{{.*}}, i32 {{.*}}, i32 4, i32 4)
   reserve_id_t rid = sub_group_reserve_write_pipe(p, 2);
-  // CHECK: call void @__sub_group_commit_write_pipe(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 4, i32 4)
+  // CHECK: call void @__sub_group_commit_write_pipe(%opencl.pipe_t{{.*}}* %{{.*}}, %opencl.reserve_id_t{{.*}}* %{{.*}}, i32 4, i32 4)
   sub_group_commit_write_pipe(p, rid);
 }
 
 void test7(write_only pipe int p, global int *ptr) {
-  // CHECK: call i32 @__get_pipe_num_packets(%opencl.pipe_t* %{{.*}}, i32 4, i32 4)
+  // CHECK: call i32 @__get_pipe_num_packets(%opencl.pipe_t{{.*}}* %{{.*}}, i32 4, i32 4)
   *ptr = get_pipe_num_packets(p);
-  // CHECK: call i32 @__get_pipe_max_packets(%opencl.pipe_t* %{{.*}}, i32 4, i32 4)
+  // CHECK: call i32 @__get_pipe_max_packets(%opencl.pipe_t{{.*}}* %{{.*}}, i32 4, i32 4)
   *ptr = get_pipe_max_packets(p);
 }
 
 void test8(read_only pipe int r, write_only pipe int w, global int *ptr) {
   // verify that return type is correctly casted to i1 value
-  // CHECK: %[[R:[0-9]+]] = call i32 @__read_pipe_2
+  // NAMD: %[[R:[0-9]+]] = call i32 @__read_pipe_2
+  // AMD: %[[R:[0-9]+]] = call i32 @__read_pipe_2_4
   // CHECK: icmp ne i32 %[[R]], 0
   if (read_pipe(r, ptr)) *ptr = -1;
-  // CHECK: %[[W:[0-9]+]] = call i32 @__write_pipe_2
+  // NAMD: %[[W:[0-9]+]] = call i32 @__write_pipe_2
+  // AMD: %[[W:[0-9]+]] = call i32 @__write_pipe_2_4
   // CHECK: icmp ne i32 %[[W]], 0
   if (write_pipe(w, ptr)) *ptr = -1;
   // CHECK: %[[N:[0-9]+]] = call i32 @__get_pipe_num_packets
@@ -77,3 +94,33 @@
   // CHECK: icmp ne i32 %[[M]], 0
   if (get_pipe_max_packets(w)) *ptr = -1;
 }
+
+// CHECK-LABEL: @test9
+void test9(read_only pipe char p1, global char *ptr1,
+           read_only pipe short p2, global short *ptr2,
+           read_only pipe int p4, global int *ptr4,
+           read_only pipe long p8, global long *ptr8,
+           read_only pipe long2 p16, global long2 *ptr16,
+           read_only pipe long4 p32, global long4 *ptr32,
+           read_only pipe long8 p64, global long8 *ptr64,
+           read_only pipe long16 p128, global long16 *ptr128,
+           read_only pipe S pu, global S *ptru) {
+  // AMD: call i32 @__read_pipe_2_1(%opencl.pipe_t addrspace(1)* {{.*}}, i8* %{{.*}})
+  read_pipe(p1, ptr1);
+  // AMD: call i32 @__read_pipe_2_2(%opencl.pipe_t addrspace(1)* {{.*}}, i16* %{{.*}})
+  read_pipe(p2, ptr2);
+  // AMD: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* {{.*}}, i32* %{{.*}})
+  read_pipe(p4, ptr4);
+  // AMD: call i32 @__read_pipe_2_8(%opencl.pipe_t addrspace(1)* {{.*}}, i64* %{{.*}})
+  read_pipe(p8, ptr8);
+  // AMD: call i32 @__read_pipe_2_16(%opencl.pipe_t addrspace(1)* %{{.*}}, <2 x i64>* %{{.*}})
+  read_pipe(p16, ptr16);
+  // AMD: call i32 @__read_pipe_2_32(%opencl.pipe_t addrspace(1)* %{{.*}}, <4 x i64>* %{{.*}})
+  read_pipe(p32, ptr32);
+  // AMD: call i32 @__read_pipe_2_64(%opencl.pipe_t addrspace(1)* %{{.*}}, <8 x i64>* %{{.*}})
+  read_pipe(p64, ptr64);
+  // AMD: call i32 @__read_pipe_2_128(%opencl.pipe_t addrspace(1)* %{{.*}}, <16 x i64>* %{{.*}})
+  read_pipe(p128, ptr128);
+  // AMD: call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %{{.*}}, i8* %{{.*}}, i32 400, i32 4)
+  read_pipe(pu, ptru);
+}
Index: lib/CodeGen/TargetInfo.h
===================================================================
--- lib/CodeGen/TargetInfo.h
+++ lib/CodeGen/TargetInfo.h
@@ -260,6 +260,10 @@
   virtual llvm::Constant *
   performAddrSpaceCast(CodeGenModule &CGM, llvm::Constant *V, unsigned SrcAddr,
                        unsigned DestAddr, llvm::Type *DestTy) const;
+
+  /// Whether the target support optimized read_pipe and write_pipe builtin
+  /// functions when type size and alignment is power of 2.
+  virtual bool hasOptimizedOpenCLPipeBuiltin() const { return false; }
 };
 
 } // namespace CodeGen
Index: lib/CodeGen/TargetInfo.cpp
===================================================================
--- lib/CodeGen/TargetInfo.cpp
+++ lib/CodeGen/TargetInfo.cpp
@@ -7430,6 +7430,7 @@
   }
   unsigned getGlobalVarAddressSpace(CodeGenModule &CGM,
                                     const VarDecl *D) const override;
+  bool hasOptimizedOpenCLPipeBuiltin() const override { return true; }
 };
 }
 
Index: lib/CodeGen/CGBuiltin.cpp
===================================================================
--- lib/CodeGen/CGBuiltin.cpp
+++ lib/CodeGen/CGBuiltin.cpp
@@ -2392,46 +2392,95 @@
     CGOpenCLRuntime OpenCLRT(CGM);
     Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
     Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
+    unsigned Size = cast<llvm::ConstantInt>(PacketSize)->getZExtValue();
+    unsigned Align = cast<llvm::ConstantInt>(PacketAlign)->getZExtValue();
+    bool Opt = Size == Align && isPowerOf2_32(Size) &&
+               getTargetHooks().hasOptimizedOpenCLPipeBuiltin();
 
     // Type of the generic packet parameter.
     unsigned GenericAS =
         getContext().getTargetAddressSpace(LangAS::opencl_generic);
-    llvm::Type *I8PTy = llvm::PointerType::get(
-        llvm::Type::getInt8Ty(getLLVMContext()), GenericAS);
+    llvm::Type *PtrElemTy;
+    if (!Opt)
+      PtrElemTy = llvm::Type::getInt8Ty(getLLVMContext());
+    else if (Size <= 8)
+      PtrElemTy = llvm::Type::getIntNTy(getLLVMContext(), Size * 8);
+    else
+      PtrElemTy = llvm::VectorType::get(
+          llvm::Type::getInt64Ty(getLLVMContext()), Size / 8);
+    llvm::Type *PtrTy = llvm::PointerType::get(PtrElemTy, GenericAS);
 
     // Testing which overloaded version we should generate the call for.
     if (2U == E->getNumArgs()) {
-      const char *Name = (BuiltinID == Builtin::BIread_pipe) ? "__read_pipe_2"
+      std::string Name = (BuiltinID == Builtin::BIread_pipe) ? "__read_pipe_2"
                                                              : "__write_pipe_2";
+      llvm::SmallVector<llvm::Type *, 4> ArgTys;
+      ArgTys.push_back(Arg0->getType());
+      ArgTys.push_back(PtrTy);
+
+      if (Opt) {
+        Name = Name + "_" + std::to_string(Size);
+      } else {
+        ArgTys.push_back(Int32Ty);
+        ArgTys.push_back(Int32Ty);
+      }
+
       // Creating a generic function type to be able to call with any builtin or
       // user defined type.
-      llvm::Type *ArgTys[] = {Arg0->getType(), I8PTy, Int32Ty, Int32Ty};
       llvm::FunctionType *FTy = llvm::FunctionType::get(
           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
-      Value *BCast = Builder.CreatePointerCast(Arg1, I8PTy);
+      Value *BCast = Builder.CreatePointerCast(Arg1, PtrTy);
+
+      llvm::SmallVector<llvm::Value *, 4> Args;
+      Args.push_back(Arg0);
+      Args.push_back(BCast);
+      if (!Opt) {
+        Args.push_back(PacketSize);
+        Args.push_back(PacketAlign);
+      }
+
       return RValue::get(
-          Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name),
-                             {Arg0, BCast, PacketSize, PacketAlign}));
+          Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name), Args));
     } else {
       assert(4 == E->getNumArgs() &&
              "Illegal number of parameters to pipe function");
-      const char *Name = (BuiltinID == Builtin::BIread_pipe) ? "__read_pipe_4"
+      std::string Name = (BuiltinID == Builtin::BIread_pipe) ? "__read_pipe_4"
                                                              : "__write_pipe_4";
+      llvm::SmallVector<llvm::Type *, 6> ArgTys;
+      ArgTys.push_back(Arg0->getType());
+      ArgTys.push_back(Arg1->getType());
+      ArgTys.push_back(Int32Ty);
+      ArgTys.push_back(PtrTy);
+
+      if (Opt) {
+        Name = Name + "_" + std::to_string(Size);
+      } else {
+        ArgTys.push_back(Int32Ty);
+        ArgTys.push_back(Int32Ty);
+      }
 
-      llvm::Type *ArgTys[] = {Arg0->getType(), Arg1->getType(), Int32Ty, I8PTy,
-                              Int32Ty, Int32Ty};
       Value *Arg2 = EmitScalarExpr(E->getArg(2)),
             *Arg3 = EmitScalarExpr(E->getArg(3));
       llvm::FunctionType *FTy = llvm::FunctionType::get(
           Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
-      Value *BCast = Builder.CreatePointerCast(Arg3, I8PTy);
+      Value *BCast = Builder.CreatePointerCast(Arg3, PtrTy);
       // We know the third argument is an integer type, but we may need to cast
       // it to i32.
       if (Arg2->getType() != Int32Ty)
         Arg2 = Builder.CreateZExtOrTrunc(Arg2, Int32Ty);
-      return RValue::get(Builder.CreateCall(
-          CGM.CreateRuntimeFunction(FTy, Name),
-          {Arg0, Arg1, Arg2, BCast, PacketSize, PacketAlign}));
+
+      llvm::SmallVector<llvm::Value *, 6> Args;
+      Args.push_back(Arg0);
+      Args.push_back(Arg1);
+      Args.push_back(Arg2);
+      Args.push_back(BCast);
+      if (!Opt) {
+        Args.push_back(PacketSize);
+        Args.push_back(PacketAlign);
+      }
+
+      return RValue::get(
+          Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name), Args));
     }
   }
   // OpenCL v2.0 s6.13.16 ,s9.17.3.5 - Built-in pipe reserve read and write

_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D36327: [OpenCL] Allow targets emit optimized pipe functions for power of 2 type sizes

Reply via email to