JonChesterfield created this revision.
JonChesterfield added reviewers: jdoerfert, tianshilei1992, jhuber6, ye-luo, 
ronlieb, pdhaliwal.
Herald added subscribers: asavonic, kerbowa, guansong, tpr, yaxunl, mgorny, 
nhaehnle, jvesely.
JonChesterfield requested review of this revision.
Herald added subscribers: openmp-commits, cfe-commits, sstefan1.
Herald added projects: clang, OpenMP.

Extension of D112504 <https://reviews.llvm.org/D112504>. Lower amdgpu printf to 
`__llvm_omp_vprintf`
which takes the same const char*, void* arguments as cuda vprintf and also
passes the size of the void* alloca which will be needed by a non-stub
implementation of `__llvm_omp_vprintf` for amdgpu.

This removes the amdgpu link error on any printf in a target region in favour
of silently compiling code that doesn't print anything to stdout.

The exact set of changes to check-openmp probably needs revision before commit


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D112680

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CGGPUBuiltin.cpp
  clang/lib/CodeGen/CodeGenFunction.h
  openmp/libomptarget/DeviceRTL/include/Debug.h
  openmp/libomptarget/DeviceRTL/include/Interface.h
  openmp/libomptarget/DeviceRTL/src/Debug.cpp
  openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
  openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
  openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
  openmp/libomptarget/test/mapping/data_member_ref.cpp
  openmp/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp
  openmp/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp
  openmp/libomptarget/test/mapping/lambda_by_value.cpp
  openmp/libomptarget/test/mapping/ompx_hold/struct.c
  openmp/libomptarget/test/mapping/ptr_and_obj_motion.c
  openmp/libomptarget/test/mapping/reduction_implicit_map.cpp
  openmp/libomptarget/test/offloading/bug49021.cpp
  openmp/libomptarget/test/offloading/bug50022.cpp
  openmp/libomptarget/test/offloading/host_as_target.c
  openmp/libomptarget/test/unified_shared_memory/api.c
  openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c
  openmp/libomptarget/test/unified_shared_memory/close_modifier.c
  openmp/libomptarget/test/unified_shared_memory/shared_update.c

Index: openmp/libomptarget/test/unified_shared_memory/shared_update.c
===================================================================
--- openmp/libomptarget/test/unified_shared_memory/shared_update.c
+++ openmp/libomptarget/test/unified_shared_memory/shared_update.c
@@ -2,8 +2,9 @@
 
 // REQUIRES: unified_shared_memory
 
-// amdgcn does not have printf definition
-// XFAIL: amdgcn-amd-amdhsa
+// amdgpu runtime crash
+// UNSUPPORTED: amdgcn-amd-amdhsa
+// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
 
 #include <stdio.h>
 #include <omp.h>
Index: openmp/libomptarget/test/unified_shared_memory/close_modifier.c
===================================================================
--- openmp/libomptarget/test/unified_shared_memory/close_modifier.c
+++ openmp/libomptarget/test/unified_shared_memory/close_modifier.c
@@ -3,8 +3,9 @@
 // REQUIRES: unified_shared_memory
 // UNSUPPORTED: clang-6, clang-7, clang-8, clang-9
 
-// amdgcn does not have printf definition
-// XFAIL: amdgcn-amd-amdhsa
+// amdgpu runtime crash
+// UNSUPPORTED: amdgcn-amd-amdhsa
+// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
 
 #include <omp.h>
 #include <stdio.h>
Index: openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c
===================================================================
--- openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c
+++ openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c
@@ -3,7 +3,7 @@
 // REQUIRES: unified_shared_memory
 // UNSUPPORTED: clang-6, clang-7, clang-8, clang-9
 
-// Fails on amdgcn with error: GPU Memory Error
+// Fails on amdgpu with error: GPU Memory Error
 // XFAIL: amdgcn-amd-amdhsa
 
 #include <omp.h>
Index: openmp/libomptarget/test/unified_shared_memory/api.c
===================================================================
--- openmp/libomptarget/test/unified_shared_memory/api.c
+++ openmp/libomptarget/test/unified_shared_memory/api.c
@@ -2,7 +2,7 @@
 // XFAIL: nvptx64-nvidia-cuda
 // XFAIL: nvptx64-nvidia-cuda-newRTL
 
-// Fails on amdgcn with error: GPU Memory Error
+// Fails on amdgpu with error: GPU Memory Error
 // XFAIL: amdgcn-amd-amdhsa
 
 #include <stdio.h>
Index: openmp/libomptarget/test/offloading/host_as_target.c
===================================================================
--- openmp/libomptarget/test/offloading/host_as_target.c
+++ openmp/libomptarget/test/offloading/host_as_target.c
@@ -7,7 +7,7 @@
 
 // RUN: %libomptarget-compile-run-and-check-generic
 
-// amdgcn does not have printf definition
+// amdgpu does not have a working printf definition
 // XFAIL: amdgcn-amd-amdhsa
 
 #include <stdio.h>
Index: openmp/libomptarget/test/offloading/bug50022.cpp
===================================================================
--- openmp/libomptarget/test/offloading/bug50022.cpp
+++ openmp/libomptarget/test/offloading/bug50022.cpp
@@ -1,7 +1,5 @@
 // RUN: %libomptarget-compilexx-and-run-generic
 
-// UNSUPPORTED: amdgcn-amd-amdhsa
-
 #include <cassert>
 #include <iostream>
 #include <stdexcept>
Index: openmp/libomptarget/test/offloading/bug49021.cpp
===================================================================
--- openmp/libomptarget/test/offloading/bug49021.cpp
+++ openmp/libomptarget/test/offloading/bug49021.cpp
@@ -1,7 +1,7 @@
 // RUN: %libomptarget-compilexx-generic -O3 && %libomptarget-run-generic
 
-// Wrong results on amdgcn
-// UNSUPPORTED: amdgcn-amd-amdhsa
+// Wrong results on amdgpu
+// XFAIL: amdgcn-amd-amdhsa
 
 #include <iostream>
 
Index: openmp/libomptarget/test/mapping/reduction_implicit_map.cpp
===================================================================
--- openmp/libomptarget/test/mapping/reduction_implicit_map.cpp
+++ openmp/libomptarget/test/mapping/reduction_implicit_map.cpp
@@ -1,7 +1,7 @@
 // RUN: %libomptarget-compilexx-run-and-check-generic
 
-// amdgcn does not have printf definition
-// UNSUPPORTED: amdgcn-amd-amdhsa
+// Wrong results on amdgpu
+// XFAIL: amdgcn-amd-amdhsa-newRTL
 
 #include <stdio.h>
 
Index: openmp/libomptarget/test/mapping/ptr_and_obj_motion.c
===================================================================
--- openmp/libomptarget/test/mapping/ptr_and_obj_motion.c
+++ openmp/libomptarget/test/mapping/ptr_and_obj_motion.c
@@ -1,7 +1,7 @@
 // RUN: %libomptarget-compile-run-and-check-generic
 
-// amdgcn does not have printf definition
-// XFAIL: amdgcn-amd-amdhsa
+// Racy, at least on the new runtime on amdgcn. Sometime returns 77
+// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
 
 #include <stdio.h>
 
Index: openmp/libomptarget/test/mapping/ompx_hold/struct.c
===================================================================
--- openmp/libomptarget/test/mapping/ompx_hold/struct.c
+++ openmp/libomptarget/test/mapping/ompx_hold/struct.c
@@ -1,7 +1,7 @@
 // RUN: %libomptarget-compile-generic -fopenmp-extensions
 // RUN: %libomptarget-run-generic | %fcheck-generic -strict-whitespace
 
-// amdgcn does not have printf definition
+// Wrong results on amdgpu
 // XFAIL: amdgcn-amd-amdhsa
 
 #include <omp.h>
Index: openmp/libomptarget/test/mapping/lambda_by_value.cpp
===================================================================
--- openmp/libomptarget/test/mapping/lambda_by_value.cpp
+++ openmp/libomptarget/test/mapping/lambda_by_value.cpp
@@ -1,6 +1,6 @@
 // RUN: %libomptarget-compilexx-run-and-check-generic
 
-// amdgcn does not have printf definition
+// Wrong results on amdgpu
 // XFAIL: amdgcn-amd-amdhsa
 
 #include <stdio.h>
Index: openmp/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp
===================================================================
--- openmp/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp
+++ openmp/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp
@@ -1,6 +1,6 @@
 // RUN: %libomptarget-compilexx-run-and-check-generic
 
-// amdgcn does not have printf definition
+// Wrong results on amdgpu
 // XFAIL: amdgcn-amd-amdhsa
 
 #include <cstdio>
Index: openmp/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp
===================================================================
--- openmp/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp
+++ openmp/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp
@@ -1,6 +1,6 @@
 // RUN: %libomptarget-compilexx-run-and-check-generic
 
-// amdgcn does not have printf definition
+// Wrong results on amdgpu
 // XFAIL: amdgcn-amd-amdhsa
 
 #include <cstdio>
Index: openmp/libomptarget/test/mapping/data_member_ref.cpp
===================================================================
--- openmp/libomptarget/test/mapping/data_member_ref.cpp
+++ openmp/libomptarget/test/mapping/data_member_ref.cpp
@@ -1,6 +1,6 @@
 // RUN: %libomptarget-compilexx-run-and-check-generic
 
-// amdgcn does not have printf definition
+// Wrong results on amdgpu
 // XFAIL: amdgcn-amd-amdhsa
 
 #include <stdio.h>
Index: openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
===================================================================
--- openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
+++ openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
@@ -121,4 +121,4 @@
 endif()
 
 # Report to the parent scope that we are building a plugin for amdgpu
-set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa" PARENT_SCOPE)
+set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa amdgcn-amd-amdhsa-newRTL" PARENT_SCOPE)
Index: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -184,9 +184,15 @@
 extern "C" {
 void *malloc(size_t);
 void free(void *);
+int32_t vprintf(const char *, void *);
 }
 
 EXTERN void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
 EXTERN void __kmpc_impl_free(void *x) { free(x); }
 
+EXTERN int32_t __llvm_omp_vprintf(const char *Format, void *Arguments,
+                                  uint32_t) {
+  return vprintf(Format, Arguments);
+}
+
 #pragma omp end declare target
Index: openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
===================================================================
--- openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -184,6 +184,11 @@
 }
 __attribute__((weak)) EXTERN void __kmpc_impl_free(void *) {}
 
+EXTERN
+int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t) {
+  return 1;
+}
+
 EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
   lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF));
   hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
Index: openmp/libomptarget/DeviceRTL/src/Debug.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Debug.cpp
+++ openmp/libomptarget/DeviceRTL/src/Debug.cpp
@@ -35,6 +35,27 @@
          assertion);
   __builtin_trap();
 }
+
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+int32_t vprintf(const char *, void *);
+namespace impl {
+int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
+  return vprintf(Format, Arguments);
+}
+} // namespace impl
+#pragma omp end declare variant
+
+// We do not have a vprintf implementation for AMD GPU yet so we use a stub.
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+namespace impl {
+int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) { return 1; }
+} // namespace impl
+#pragma omp end declare variant
+
+int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
+  return impl::omp_vprintf(Format, Arguments, Size);
+}
 }
 
 /// Current indentation level for the function trace. Only accessed by thread 0.
@@ -48,7 +69,7 @@
     for (int I = 0; I < Level; ++I)
       PRINTF("%s", "  ");
 
-    PRINTF("Line %u: Thread %u Entering %s:%u\n", Line,
+    PRINTF("Line %u: Thread %u Entering %s\n", Line,
            mapping::getThreadIdInBlock(), Function);
     Level++;
   }
Index: openmp/libomptarget/DeviceRTL/include/Interface.h
===================================================================
--- openmp/libomptarget/DeviceRTL/include/Interface.h
+++ openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -346,6 +346,9 @@
 int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
 int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
 ///}
+
+/// Printf
+int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size);
 }
 
 #endif
Index: openmp/libomptarget/DeviceRTL/include/Debug.h
===================================================================
--- openmp/libomptarget/DeviceRTL/include/Debug.h
+++ openmp/libomptarget/DeviceRTL/include/Debug.h
@@ -26,23 +26,15 @@
 ///}
 
 /// Print
-/// TODO: For now we have to use macros to guard the code because Clang lowers
-/// `printf` to different function calls on NVPTX and AMDGCN platforms, and it
-/// doesn't work for AMDGCN. After it can work on AMDGCN, we will remove the
-/// macro.
+/// printf() calls are rewritten by CGGPUBuiltin to __llvm_omp_vprintf
 /// {
 
-#ifndef __AMDGCN__
 extern "C" {
 int printf(const char *format, ...);
 }
 
-#define PRINTF(fmt, ...) (void)printf(fmt, __VA_ARGS__);
+#define PRINTF(fmt, ...) (void)printf(fmt, ##__VA_ARGS__);
 #define PRINT(str) PRINTF("%s", str)
-#else
-#define PRINTF(fmt, ...)
-#define PRINT(str)
-#endif
 
 ///}
 
Index: clang/lib/CodeGen/CodeGenFunction.h
===================================================================
--- clang/lib/CodeGen/CodeGenFunction.h
+++ clang/lib/CodeGen/CodeGenFunction.h
@@ -4073,6 +4073,8 @@
                                        ReturnValueSlot ReturnValue);
   RValue EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E,
                                         ReturnValueSlot ReturnValue);
+  RValue EmitOpenMPDevicePrintfCallExpr(const CallExpr *E,
+                                        ReturnValueSlot ReturnValue);
 
   RValue EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
                          const CallExpr *E, ReturnValueSlot ReturnValue);
Index: clang/lib/CodeGen/CGGPUBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGGPUBuiltin.cpp
+++ clang/lib/CodeGen/CGGPUBuiltin.cpp
@@ -21,13 +21,14 @@
 using namespace clang;
 using namespace CodeGen;
 
-static llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
+namespace {
+llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
   llvm::Type *ArgTypes[] = {llvm::Type::getInt8PtrTy(M.getContext()),
                             llvm::Type::getInt8PtrTy(M.getContext())};
   llvm::FunctionType *VprintfFuncType = llvm::FunctionType::get(
       llvm::Type::getInt32Ty(M.getContext()), ArgTypes, false);
 
-  if (auto* F = M.getFunction("vprintf")) {
+  if (auto *F = M.getFunction("vprintf")) {
     // Our CUDA system header declares vprintf with the right signature, so
     // nobody else should have been able to declare vprintf with a bogus
     // signature.
@@ -41,6 +42,28 @@
       VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, "vprintf", &M);
 }
 
+llvm::Function *GetOpenMPVprintfDeclaration(CodeGenModule &CGM) {
+  const char *Name = "__llvm_omp_vprintf";
+  llvm::Module &M = CGM.getModule();
+  llvm::Type *ArgTypes[] = {llvm::Type::getInt8PtrTy(M.getContext()),
+                            llvm::Type::getInt8PtrTy(M.getContext()),
+                            llvm::Type::getInt32Ty(M.getContext())};
+  llvm::FunctionType *VprintfFuncType = llvm::FunctionType::get(
+      llvm::Type::getInt32Ty(M.getContext()), ArgTypes, false);
+
+  if (auto *F = M.getFunction(Name)) {
+    if (F->getFunctionType() != VprintfFuncType) {
+      CGM.Error(SourceLocation(),
+                "Invalid type declaration for __llvm_omp_vprintf");
+      return nullptr;
+    }
+    return F;
+  }
+
+  return llvm::Function::Create(
+      VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, Name, &M);
+}
+
 // Transforms a call to printf into a call to the NVPTX vprintf syscall (which
 // isn't particularly special; it's invoked just like a regular function).
 // vprintf takes two args: A format string, and a pointer to a buffer containing
@@ -66,39 +89,24 @@
 //
 // Note that by the time this function runs, E's args have already undergone the
 // standard C vararg promotion (short -> int, float -> double, etc.).
-RValue
-CodeGenFunction::EmitNVPTXDevicePrintfCallExpr(const CallExpr *E,
-                                               ReturnValueSlot ReturnValue) {
-  assert(getTarget().getTriple().isNVPTX());
-  assert(E->getBuiltinCallee() == Builtin::BIprintf);
-  assert(E->getNumArgs() >= 1); // printf always has at least one arg.
 
-  const llvm::DataLayout &DL = CGM.getDataLayout();
-  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
+std::pair<llvm::Value *, llvm::TypeSize>
+packArgsIntoNVPTXFormatBuffer(CodeGenFunction *CGF, const CallArgList &Args) {
 
-  CallArgList Args;
-  EmitCallArgs(Args,
-               E->getDirectCallee()->getType()->getAs<FunctionProtoType>(),
-               E->arguments(), E->getDirectCallee(),
-               /* ParamsToSkip = */ 0);
-
-  // We don't know how to emit non-scalar varargs.
-  if (llvm::any_of(llvm::drop_begin(Args), [&](const CallArg &A) {
-        return !A.getRValue(*this).isScalar();
-      })) {
-    CGM.ErrorUnsupported(E, "non-scalar arg to printf");
-    return RValue::get(llvm::ConstantInt::get(IntTy, 0));
-  }
+  const llvm::DataLayout &DL = CGF->CGM.getDataLayout();
+  llvm::LLVMContext &Ctx = CGF->CGM.getLLVMContext();
+  CGBuilderTy &Builder = CGF->Builder;
 
   // Construct and fill the args buffer that we'll pass to vprintf.
   llvm::Value *BufferPtr;
   if (Args.size() <= 1) {
-    // If there are no args, pass a null pointer to vprintf.
+    // If there are no args, pass a null pointer and size 0
     BufferPtr = llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx));
+    return {BufferPtr, llvm::TypeSize::Fixed(0)};
   } else {
     llvm::SmallVector<llvm::Type *, 8> ArgTypes;
     for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I)
-      ArgTypes.push_back(Args[I].getRValue(*this).getScalarVal()->getType());
+      ArgTypes.push_back(Args[I].getRValue(*CGF).getScalarVal()->getType());
 
     // Using llvm::StructType is correct only because printf doesn't accept
     // aggregates.  If we had to handle aggregates here, we'd have to manually
@@ -106,18 +114,43 @@
     // that the alignment of the llvm type was the same as the alignment of the
     // clang type.
     llvm::Type *AllocaTy = llvm::StructType::create(ArgTypes, "printf_args");
-    llvm::Value *Alloca = CreateTempAlloca(AllocaTy);
+    llvm::Value *Alloca = CGF->CreateTempAlloca(AllocaTy);
 
     for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) {
       llvm::Value *P = Builder.CreateStructGEP(AllocaTy, Alloca, I - 1);
-      llvm::Value *Arg = Args[I].getRValue(*this).getScalarVal();
+      llvm::Value *Arg = Args[I].getRValue(*CGF).getScalarVal();
       Builder.CreateAlignedStore(Arg, P, DL.getPrefTypeAlign(Arg->getType()));
     }
     BufferPtr = Builder.CreatePointerCast(Alloca, llvm::Type::getInt8PtrTy(Ctx));
+    return {BufferPtr, DL.getTypeAllocSize(AllocaTy)};
+  }
+}
+} // namespace
+
+RValue
+CodeGenFunction::EmitNVPTXDevicePrintfCallExpr(const CallExpr *E,
+                                               ReturnValueSlot ReturnValue) {
+  assert(getTarget().getTriple().isNVPTX());
+  assert(E->getBuiltinCallee() == Builtin::BIprintf);
+  assert(E->getNumArgs() >= 1); // printf always has at least one arg.
+
+  CallArgList Args;
+  EmitCallArgs(Args,
+               E->getDirectCallee()->getType()->getAs<FunctionProtoType>(),
+               E->arguments(), E->getDirectCallee(),
+               /* ParamsToSkip = */ 0);
+
+  // We don't know how to emit non-scalar varargs.
+  if (llvm::any_of(llvm::drop_begin(Args), [&](const CallArg &A) {
+        return !A.getRValue(*this).isScalar();
+      })) {
+    CGM.ErrorUnsupported(E, "non-scalar arg to printf");
+    return RValue::get(llvm::ConstantInt::get(IntTy, 0));
   }
 
+  llvm::Value *BufferPtr = packArgsIntoNVPTXFormatBuffer(this, Args).first;
   // Invoke vprintf and return.
-  llvm::Function* VprintfFunc = GetVprintfDeclaration(CGM.getModule());
+  llvm::Function *VprintfFunc = GetVprintfDeclaration(CGM.getModule());
   return RValue::get(Builder.CreateCall(
       VprintfFunc, {Args[0].getRValue(*this).getScalarVal(), BufferPtr}));
 }
@@ -154,3 +187,45 @@
   Builder.SetInsertPoint(IRB.GetInsertBlock(), IRB.GetInsertPoint());
   return RValue::get(Printf);
 }
+
+RValue
+CodeGenFunction::EmitOpenMPDevicePrintfCallExpr(const CallExpr *E,
+                                                ReturnValueSlot ReturnValue) {
+
+  assert(getTarget().getTriple().isNVPTX() ||
+         getTarget().getTriple().isAMDGCN());
+  assert(E->getBuiltinCallee() == Builtin::BIprintf);
+  assert(E->getNumArgs() >= 1); // printf always has at least one arg.
+
+  // Uses the same format as nvptx for the argument packing, but also passes
+  // an i32 for the total size of the passed pointer
+
+  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
+
+  CallArgList Args;
+  EmitCallArgs(Args,
+               E->getDirectCallee()->getType()->getAs<FunctionProtoType>(),
+               E->arguments(), E->getDirectCallee(),
+               /* ParamsToSkip = */ 0);
+
+  // We don't know how to emit non-scalar varargs.
+  if (llvm::any_of(llvm::drop_begin(Args), [&](const CallArg &A) {
+        return !A.getRValue(*this).isScalar();
+      })) {
+    CGM.ErrorUnsupported(E, "non-scalar arg to printf");
+    return RValue::get(llvm::ConstantInt::get(IntTy, 0));
+  }
+
+  auto r = packArgsIntoNVPTXFormatBuffer(this, Args);
+  llvm::Value *BufferPtr = r.first;
+
+  // Passing > 32bit of data as a local alloca doesn't work for nvptx or amdgpu
+  llvm::Constant *Size =
+      llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx),
+                             static_cast<uint32_t>(r.second.getFixedSize()));
+
+  // Invoke vprintf and return.
+  llvm::Function *VprintfFunc = GetOpenMPVprintfDeclaration(CGM);
+  return RValue::get(Builder.CreateCall(
+      VprintfFunc, {Args[0].getRValue(*this).getScalarVal(), BufferPtr, Size}));
+}
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -5068,11 +5068,16 @@
     return RValue::get(Builder.CreateFPExt(HalfVal, Builder.getFloatTy()));
   }
   case Builtin::BIprintf:
-    if (getTarget().getTriple().isNVPTX())
-      return EmitNVPTXDevicePrintfCallExpr(E, ReturnValue);
-    if (getTarget().getTriple().getArch() == Triple::amdgcn &&
-        getLangOpts().HIP)
-      return EmitAMDGPUDevicePrintfCallExpr(E, ReturnValue);
+    if (getTarget().getTriple().isNVPTX() ||
+        getTarget().getTriple().isAMDGCN()) {
+      if (getLangOpts().OpenMPIsDevice)
+        return EmitOpenMPDevicePrintfCallExpr(E, ReturnValue);
+      if (getTarget().getTriple().isNVPTX())
+        return EmitNVPTXDevicePrintfCallExpr(E, ReturnValue);
+      if (getTarget().getTriple().isAMDGCN() && getLangOpts().HIP)
+        return EmitAMDGPUDevicePrintfCallExpr(E, ReturnValue);
+    }
+
     break;
   case Builtin::BI__builtin_canonicalize:
   case Builtin::BI__builtin_canonicalizef:
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to