https://github.com/vikramRH updated https://github.com/llvm/llvm-project/pull/72556
>From 6ace9d0a51064be189093ca3bb42416aafadb7f6 Mon Sep 17 00:00:00 2001 From: Vikram <vikram.he...@amd.com> Date: Fri, 10 Nov 2023 09:39:41 +0000 Subject: [PATCH 1/3] [AMDGPU] Treat printf as builtin for OpenCL --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 8 ++++++++ clang/lib/AST/Decl.cpp | 7 +++++++ clang/lib/Basic/Targets/AMDGPU.cpp | 2 ++ clang/lib/CodeGen/CGBuiltin.cpp | 5 +++++ 4 files changed, 22 insertions(+) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index a19c8bd5f219ec6..1799c72806bfdd4 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -21,6 +21,10 @@ #if defined(BUILTIN) && !defined(TARGET_BUILTIN) # define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS) #endif + +#if defined(BUILTIN) && !defined(LANGBUILTIN) +#define LANGBUILTIN(ID, TYPE, ATTRS, BUILTIN_LANG) BUILTIN(ID, TYPE, ATTRS) +#endif //===----------------------------------------------------------------------===// // SI+ only builtins. //===----------------------------------------------------------------------===// @@ -406,5 +410,9 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-insts") +// OpenCL +LANGBUILTIN(printf, "icC*4.", "fp:0:", ALL_OCL_LANGUAGES) + #undef BUILTIN #undef TARGET_BUILTIN +#undef LANGBUILTIN diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index c5c2edf1bfe3aba..2597422bdd521a0 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -49,6 +49,7 @@ #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" #include "clang/Basic/Specifiers.h" +#include "clang/Basic/TargetBuiltins.h" #include "clang/Basic/TargetCXXABI.h" #include "clang/Basic/TargetInfo.h" #include "clang/Basic/Visibility.h" @@ -3598,6 +3599,12 @@ unsigned FunctionDecl::getBuiltinID(bool ConsiderWrapperFunctions) const { if (!ConsiderWrapperFunctions && getStorageClass() == SC_Static) return 0; + // AMDGCN implementation supports printf as a builtin + // for OpenCL + if (Context.getTargetInfo().getTriple().isAMDGCN() && + Context.getLangOpts().OpenCL && BuiltinID == AMDGPU::BIprintf) + return BuiltinID; + // OpenCL v1.2 s6.9.f - The library functions defined in // the C99 standard headers are not available. if (Context.getLangOpts().OpenCL && diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp index 409ae32ab424215..307cfa49f54e926 100644 --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -91,6 +91,8 @@ static constexpr Builtin::Info BuiltinInfo[] = { {#ID, TYPE, ATTRS, nullptr, HeaderDesc::NO_HEADER, ALL_LANGUAGES}, #define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \ {#ID, TYPE, ATTRS, FEATURE, HeaderDesc::NO_HEADER, ALL_LANGUAGES}, +#define LANGBUILTIN(ID, TYPE, ATTRS, LANG) \ + {#ID, TYPE, ATTRS, nullptr, HeaderDesc::NO_HEADER, LANG}, #include "clang/Basic/BuiltinsAMDGPU.def" }; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 09309a3937fb613..987909b5a62e11b 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -2458,6 +2458,11 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, &getTarget().getLongDoubleFormat() == &llvm::APFloat::IEEEquad()) BuiltinID = mutateLongDoubleBuiltin(BuiltinID); + // Mutate the printf builtin ID so that we use the same CodeGen path for + // HIP and OpenCL with AMDGPU targets. + if (getTarget().getTriple().isAMDGCN() && BuiltinID == AMDGPU::BIprintf) + BuiltinID = Builtin::BIprintf; + // If the builtin has been declared explicitly with an assembler label, // disable the specialized emitting below. Ideally we should communicate the // rename in IR, or at least avoid generating the intrinsic calls that are >From 040a28deef5fe7a5d9e357a898b50335992e708d Mon Sep 17 00:00:00 2001 From: Vikram <vikram.he...@amd.com> Date: Mon, 20 Nov 2023 05:26:27 +0000 Subject: [PATCH 2/3] [AMDGPU] Enable OpenCL printf expansion at clang CodeGen --- clang/lib/CodeGen/CGBuiltin.cpp | 3 ++- clang/lib/CodeGen/CGGPUBuiltin.cpp | 25 +++++++++++++++++++------ clang/lib/Driver/ToolChains/Clang.cpp | 10 ++++++++++ 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 987909b5a62e11b..8d51df24c7872b7 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5622,7 +5622,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, return EmitOpenMPDevicePrintfCallExpr(E); if (getTarget().getTriple().isNVPTX()) return EmitNVPTXDevicePrintfCallExpr(E); - if (getTarget().getTriple().isAMDGCN() && getLangOpts().HIP) + if (getTarget().getTriple().isAMDGCN() && + (getLangOpts().HIP || getLangOpts().OpenCL)) return EmitAMDGPUDevicePrintfCallExpr(E); } diff --git a/clang/lib/CodeGen/CGGPUBuiltin.cpp b/clang/lib/CodeGen/CGGPUBuiltin.cpp index de4ee68c0da1e79..81e23bc325339bb 100644 --- a/clang/lib/CodeGen/CGGPUBuiltin.cpp +++ b/clang/lib/CodeGen/CGGPUBuiltin.cpp @@ -13,6 +13,7 @@ #include "CodeGenFunction.h" #include "clang/Basic/Builtins.h" +#include "clang/Basic/TargetBuiltins.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Instruction.h" #include "llvm/Support/MathExtras.h" @@ -177,10 +178,20 @@ RValue CodeGenFunction::EmitNVPTXDevicePrintfCallExpr(const CallExpr *E) { E, this, GetVprintfDeclaration(CGM.getModule()), false); } +// Deterimines if an argument is a string +static bool isString(const clang::Type *argXTy) { + if ((argXTy->isPointerType() || argXTy->isConstantArrayType()) && + argXTy->getPointeeOrArrayElementType()->isCharType()) + return true; + else + return false; +} + RValue CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E) { assert(getTarget().getTriple().getArch() == llvm::Triple::amdgcn); assert(E->getBuiltinCallee() == Builtin::BIprintf || - E->getBuiltinCallee() == Builtin::BI__builtin_printf); + E->getBuiltinCallee() == Builtin::BI__builtin_printf || + E->getBuiltinCallee() == AMDGPU::BIprintf); assert(E->getNumArgs() >= 1); // printf always has at least one arg. CallArgList CallArgs; @@ -188,6 +199,8 @@ RValue CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E) { E->getDirectCallee()->getType()->getAs<FunctionProtoType>(), E->arguments(), E->getDirectCallee(), /* ParamsToSkip = */ 0); + llvm::IRBuilder<> IRB(Builder.GetInsertBlock(), Builder.GetInsertPoint()); + IRB.SetCurrentDebugLocation(Builder.getCurrentDebugLocation()); SmallVector<llvm::Value *, 8> Args; for (const auto &A : CallArgs) { @@ -198,14 +211,14 @@ RValue CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E) { } llvm::Value *Arg = A.getRValue(*this).getScalarVal(); + if (isString(A.getType().getTypePtr()) && CGM.getLangOpts().OpenCL) + Arg = Builder.CreateAddrSpaceCast(Arg, CGM.Int8PtrTy); Args.push_back(Arg); } - llvm::IRBuilder<> IRB(Builder.GetInsertBlock(), Builder.GetInsertPoint()); - IRB.SetCurrentDebugLocation(Builder.getCurrentDebugLocation()); - - bool isBuffered = (CGM.getTarget().getTargetOpts().AMDGPUPrintfKindVal == - clang::TargetOptions::AMDGPUPrintfKind::Buffered); + auto PFK = CGM.getTarget().getTargetOpts().AMDGPUPrintfKindVal; + bool isBuffered = + (PFK == clang::TargetOptions::AMDGPUPrintfKind::Buffered); auto Printf = llvm::emitAMDGPUPrintfCall(IRB, Args, isBuffered); Builder.SetInsertPoint(IRB.GetInsertBlock(), IRB.GetInsertPoint()); return RValue::get(Printf); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index b462f5a44057d94..b63c777fd1f158c 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4742,6 +4742,16 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.ClaimAllArgs(options::OPT_gen_cdb_fragment_path); } + if (TC.getTriple().isAMDGPU() && types::isOpenCL(Input.getType())) { + if (Args.getLastArg(options::OPT_mprintf_kind_EQ)) { + CmdArgs.push_back(Args.MakeArgString( + "-mprintf-kind=" + + Args.getLastArgValue(options::OPT_mprintf_kind_EQ))); + // Force compiler error on invalid conversion specifiers + CmdArgs.push_back(Args.MakeArgString("-Werror=format-invalid-specifier")); + } + } + if (IsCuda || IsHIP) { // We have to pass the triple of the host if compiling for a CUDA/HIP device // and vice-versa. >From b443e11ee074c5ec89cb1072583bad6c8ffaa897 Mon Sep 17 00:00:00 2001 From: Vikram <vikram.he...@amd.com> Date: Mon, 20 Nov 2023 05:28:10 +0000 Subject: [PATCH 3/3] [AMDGPU] Add vector processing support to AMDGPU printf --- clang/test/CodeGenOpenCL/amdgpu-printf.cl | 205 +++++++++++++++++- .../lib/Transforms/Utils/AMDGPUEmitPrintf.cpp | 135 +++++++----- 2 files changed, 288 insertions(+), 52 deletions(-) diff --git a/clang/test/CodeGenOpenCL/amdgpu-printf.cl b/clang/test/CodeGenOpenCL/amdgpu-printf.cl index edf6dbf8657cbe5..c6cab062a45618e 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-printf.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-printf.cl @@ -1,5 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -mprintf-kind=buffered -disable-llvm-passes -emit-llvm -o - %s | FileCheck --check-prefix=CHECK_BUFFERED %s +// RUN: %clang_cc1 -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -mprintf-kind=hostcall -disable-llvm-passes -emit-llvm -o - %s | FileCheck --check-prefix=CHECK_HOSTCALL %s int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2))); @@ -7,6 +8,61 @@ int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2))) // CHECK-NEXT: entry: // CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str) #[[ATTR4:[0-9]+]] // CHECK-NEXT: ret void +// CHECK_BUFFERED-LABEL: @test_printf_noargs( +// CHECK_BUFFERED-NEXT: entry: +// CHECK_BUFFERED-NEXT: br i1 icmp eq (ptr addrspacecast (ptr addrspace(4) @.str to ptr), ptr null), label [[STRLEN_JOIN:%.*]], label [[STRLEN_WHILE:%.*]] +// CHECK_BUFFERED: strlen.while: +// CHECK_BUFFERED-NEXT: [[TMP0:%.*]] = phi ptr [ addrspacecast (ptr addrspace(4) @.str to ptr), [[ENTRY:%.*]] ], [ [[TMP1:%.*]], [[STRLEN_WHILE]] ] +// CHECK_BUFFERED-NEXT: [[TMP1]] = getelementptr i8, ptr [[TMP0]], i64 1 +// CHECK_BUFFERED-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP0]], align 1 +// CHECK_BUFFERED-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP2]], 0 +// CHECK_BUFFERED-NEXT: br i1 [[TMP3]], label [[STRLEN_WHILE_DONE:%.*]], label [[STRLEN_WHILE]] +// CHECK_BUFFERED: strlen.while.done: +// CHECK_BUFFERED-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK_BUFFERED-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], ptrtoint (ptr addrspacecast (ptr addrspace(4) @.str to ptr) to i64) +// CHECK_BUFFERED-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 1 +// CHECK_BUFFERED-NEXT: br label [[STRLEN_JOIN]] +// CHECK_BUFFERED: strlen.join: +// CHECK_BUFFERED-NEXT: [[TMP7:%.*]] = phi i64 [ [[TMP6]], [[STRLEN_WHILE_DONE]] ], [ 0, [[ENTRY]] ] +// CHECK_BUFFERED-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 7 +// CHECK_BUFFERED-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 4294967288 +// CHECK_BUFFERED-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 4 +// CHECK_BUFFERED-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32 +// CHECK_BUFFERED-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 [[TMP11]]) +// CHECK_BUFFERED-NEXT: [[TMP12:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null +// CHECK_BUFFERED-NEXT: br i1 [[TMP12]], label [[ARGPUSH_BLOCK:%.*]], label [[END_BLOCK:%.*]] +// CHECK_BUFFERED: end.block: +// CHECK_BUFFERED-NEXT: [[TMP13:%.*]] = xor i1 [[TMP12]], true +// CHECK_BUFFERED-NEXT: [[PRINTF_RESULT:%.*]] = sext i1 [[TMP13]] to i32 +// CHECK_BUFFERED-NEXT: ret void +// CHECK_BUFFERED: argpush.block: +// CHECK_BUFFERED-NEXT: [[TMP14:%.*]] = shl i32 [[TMP11]], 2 +// CHECK_BUFFERED-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[PRINTF_ALLOC_FN]], align 4 +// CHECK_BUFFERED-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 +// CHECK_BUFFERED-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 1 [[TMP15]], ptr align 1 addrspacecast (ptr addrspace(4) @.str to ptr), i64 [[TMP7]], i1 false) +// CHECK_BUFFERED-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP15]], i64 [[TMP9]] +// CHECK_BUFFERED-NEXT: br label [[END_BLOCK]] +// +// CHECK_HOSTCALL-LABEL: @test_printf_noargs( +// CHECK_HOSTCALL-NEXT: entry: +// CHECK_HOSTCALL-NEXT: [[TMP0:%.*]] = call i64 @__ockl_printf_begin(i64 0) +// CHECK_HOSTCALL-NEXT: br i1 icmp eq (ptr addrspacecast (ptr addrspace(4) @.str to ptr), ptr null), label [[STRLEN_JOIN:%.*]], label [[STRLEN_WHILE:%.*]] +// CHECK_HOSTCALL: strlen.while: +// CHECK_HOSTCALL-NEXT: [[TMP1:%.*]] = phi ptr [ addrspacecast (ptr addrspace(4) @.str to ptr), [[ENTRY:%.*]] ], [ [[TMP2:%.*]], [[STRLEN_WHILE]] ] +// CHECK_HOSTCALL-NEXT: [[TMP2]] = getelementptr i8, ptr [[TMP1]], i64 1 +// CHECK_HOSTCALL-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP1]], align 1 +// CHECK_HOSTCALL-NEXT: [[TMP4:%.*]] = icmp eq i8 [[TMP3]], 0 +// CHECK_HOSTCALL-NEXT: br i1 [[TMP4]], label [[STRLEN_WHILE_DONE:%.*]], label [[STRLEN_WHILE]] +// CHECK_HOSTCALL: strlen.while.done: +// CHECK_HOSTCALL-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP1]] to i64 +// CHECK_HOSTCALL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], ptrtoint (ptr addrspacecast (ptr addrspace(4) @.str to ptr) to i64) +// CHECK_HOSTCALL-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 1 +// CHECK_HOSTCALL-NEXT: br label [[STRLEN_JOIN]] +// CHECK_HOSTCALL: strlen.join: +// CHECK_HOSTCALL-NEXT: [[TMP8:%.*]] = phi i64 [ [[TMP7]], [[STRLEN_WHILE_DONE]] ], [ 0, [[ENTRY]] ] +// CHECK_HOSTCALL-NEXT: [[TMP9:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[TMP0]], ptr addrspacecast (ptr addrspace(4) @.str to ptr), i64 [[TMP8]], i32 1) +// CHECK_HOSTCALL-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +// CHECK_HOSTCALL-NEXT: ret void // __kernel void test_printf_noargs() { printf(""); @@ -19,6 +75,53 @@ __kernel void test_printf_noargs() { // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA8]] // CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str.1, i32 noundef [[TMP0]]) #[[ATTR4]] // CHECK-NEXT: ret void +// CHECK_BUFFERED-LABEL: @test_printf_int( +// CHECK_BUFFERED-NEXT: entry: +// CHECK_BUFFERED-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK_BUFFERED-NEXT: store i32 [[I:%.*]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA12:![0-9]+]] +// CHECK_BUFFERED-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA12]] +// CHECK_BUFFERED-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 20) +// CHECK_BUFFERED-NEXT: [[TMP1:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null +// CHECK_BUFFERED-NEXT: br i1 [[TMP1]], label [[ARGPUSH_BLOCK:%.*]], label [[END_BLOCK:%.*]] +// CHECK_BUFFERED: end.block: +// CHECK_BUFFERED-NEXT: [[TMP2:%.*]] = xor i1 [[TMP1]], true +// CHECK_BUFFERED-NEXT: [[PRINTF_RESULT:%.*]] = sext i1 [[TMP2]] to i32 +// CHECK_BUFFERED-NEXT: ret void +// CHECK_BUFFERED: argpush.block: +// CHECK_BUFFERED-NEXT: store i32 82, ptr addrspace(1) [[PRINTF_ALLOC_FN]], align 4 +// CHECK_BUFFERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 +// CHECK_BUFFERED-NEXT: store i64 -2582314622382785113, ptr addrspace(1) [[TMP3]], align 8 +// CHECK_BUFFERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP3]], i32 8 +// CHECK_BUFFERED-NEXT: [[TMP5:%.*]] = zext i32 [[TMP0]] to i64 +// CHECK_BUFFERED-NEXT: store i64 [[TMP5]], ptr addrspace(1) [[TMP4]], align 8 +// CHECK_BUFFERED-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 8 +// CHECK_BUFFERED-NEXT: br label [[END_BLOCK]] +// +// CHECK_HOSTCALL-LABEL: @test_printf_int( +// CHECK_HOSTCALL-NEXT: entry: +// CHECK_HOSTCALL-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK_HOSTCALL-NEXT: store i32 [[I:%.*]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9:![0-9]+]] +// CHECK_HOSTCALL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] +// CHECK_HOSTCALL-NEXT: [[TMP1:%.*]] = call i64 @__ockl_printf_begin(i64 0) +// CHECK_HOSTCALL-NEXT: br i1 icmp eq (ptr addrspacecast (ptr addrspace(4) @.str.1 to ptr), ptr null), label [[STRLEN_JOIN:%.*]], label [[STRLEN_WHILE:%.*]] +// CHECK_HOSTCALL: strlen.while: +// CHECK_HOSTCALL-NEXT: [[TMP2:%.*]] = phi ptr [ addrspacecast (ptr addrspace(4) @.str.1 to ptr), [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[STRLEN_WHILE]] ] +// CHECK_HOSTCALL-NEXT: [[TMP3]] = getelementptr i8, ptr [[TMP2]], i64 1 +// CHECK_HOSTCALL-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP2]], align 1 +// CHECK_HOSTCALL-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0 +// CHECK_HOSTCALL-NEXT: br i1 [[TMP5]], label [[STRLEN_WHILE_DONE:%.*]], label [[STRLEN_WHILE]] +// CHECK_HOSTCALL: strlen.while.done: +// CHECK_HOSTCALL-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP2]] to i64 +// CHECK_HOSTCALL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], ptrtoint (ptr addrspacecast (ptr addrspace(4) @.str.1 to ptr) to i64) +// CHECK_HOSTCALL-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 1 +// CHECK_HOSTCALL-NEXT: br label [[STRLEN_JOIN]] +// CHECK_HOSTCALL: strlen.join: +// CHECK_HOSTCALL-NEXT: [[TMP9:%.*]] = phi i64 [ [[TMP8]], [[STRLEN_WHILE_DONE]] ], [ 0, [[ENTRY]] ] +// CHECK_HOSTCALL-NEXT: [[TMP10:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[TMP1]], ptr addrspacecast (ptr addrspace(4) @.str.1 to ptr), i64 [[TMP9]], i32 0) +// CHECK_HOSTCALL-NEXT: [[TMP11:%.*]] = zext i32 [[TMP0]] to i64 +// CHECK_HOSTCALL-NEXT: [[TMP12:%.*]] = call i64 @__ockl_printf_append_args(i64 [[TMP10]], i32 1, i64 [[TMP11]], i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 1) +// CHECK_HOSTCALL-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i32 +// CHECK_HOSTCALL-NEXT: ret void // __kernel void test_printf_int(int i) { printf("%d", i); @@ -36,6 +139,106 @@ __kernel void test_printf_int(int i) { // CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str.2, ptr addrspace(5) noundef [[ARRAYDECAY]], i32 noundef [[TMP2]]) #[[ATTR4]] // CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR5]] // CHECK-NEXT: ret void +// CHECK_BUFFERED-LABEL: @test_printf_str_int( +// CHECK_BUFFERED-NEXT: entry: +// CHECK_BUFFERED-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK_BUFFERED-NEXT: [[S:%.*]] = alloca [4 x i8], align 1, addrspace(5) +// CHECK_BUFFERED-NEXT: store i32 [[I:%.*]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA12]] +// CHECK_BUFFERED-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR1:[0-9]+]] +// CHECK_BUFFERED-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 1 [[S]], ptr addrspace(4) align 1 @__const.test_printf_str_int.s, i64 4, i1 false) +// CHECK_BUFFERED-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i8], ptr addrspace(5) [[S]], i64 0, i64 0 +// CHECK_BUFFERED-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA12]] +// CHECK_BUFFERED-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[ARRAYDECAY]] to ptr +// CHECK_BUFFERED-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null +// CHECK_BUFFERED-NEXT: br i1 [[TMP2]], label [[STRLEN_JOIN:%.*]], label [[STRLEN_WHILE:%.*]] +// CHECK_BUFFERED: strlen.while: +// CHECK_BUFFERED-NEXT: [[TMP3:%.*]] = phi ptr [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP4:%.*]], [[STRLEN_WHILE]] ] +// CHECK_BUFFERED-NEXT: [[TMP4]] = getelementptr i8, ptr [[TMP3]], i64 1 +// CHECK_BUFFERED-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1 +// CHECK_BUFFERED-NEXT: [[TMP6:%.*]] = icmp eq i8 [[TMP5]], 0 +// CHECK_BUFFERED-NEXT: br i1 [[TMP6]], label [[STRLEN_WHILE_DONE:%.*]], label [[STRLEN_WHILE]] +// CHECK_BUFFERED: strlen.while.done: +// CHECK_BUFFERED-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP1]] to i64 +// CHECK_BUFFERED-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP3]] to i64 +// CHECK_BUFFERED-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], [[TMP7]] +// CHECK_BUFFERED-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 1 +// CHECK_BUFFERED-NEXT: br label [[STRLEN_JOIN]] +// CHECK_BUFFERED: strlen.join: +// CHECK_BUFFERED-NEXT: [[TMP11:%.*]] = phi i64 [ [[TMP10]], [[STRLEN_WHILE_DONE]] ], [ 0, [[ENTRY]] ] +// CHECK_BUFFERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 7 +// CHECK_BUFFERED-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 4294967288 +// CHECK_BUFFERED-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 20 +// CHECK_BUFFERED-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +// CHECK_BUFFERED-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 [[TMP15]]) +// CHECK_BUFFERED-NEXT: [[TMP16:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null +// CHECK_BUFFERED-NEXT: br i1 [[TMP16]], label [[ARGPUSH_BLOCK:%.*]], label [[END_BLOCK:%.*]] +// CHECK_BUFFERED: end.block: +// CHECK_BUFFERED-NEXT: [[TMP17:%.*]] = xor i1 [[TMP16]], true +// CHECK_BUFFERED-NEXT: [[PRINTF_RESULT:%.*]] = sext i1 [[TMP17]] to i32 +// CHECK_BUFFERED-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR1]] +// CHECK_BUFFERED-NEXT: ret void +// CHECK_BUFFERED: argpush.block: +// CHECK_BUFFERED-NEXT: [[TMP18:%.*]] = shl i32 [[TMP15]], 2 +// CHECK_BUFFERED-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], 2 +// CHECK_BUFFERED-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[PRINTF_ALLOC_FN]], align 4 +// CHECK_BUFFERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 +// CHECK_BUFFERED-NEXT: store i64 -2942283388077972797, ptr addrspace(1) [[TMP20]], align 8 +// CHECK_BUFFERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i32 8 +// CHECK_BUFFERED-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 1 [[TMP21]], ptr align 1 [[TMP1]], i64 [[TMP11]], i1 false) +// CHECK_BUFFERED-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP21]], i64 [[TMP13]] +// CHECK_BUFFERED-NEXT: [[TMP22:%.*]] = zext i32 [[TMP0]] to i64 +// CHECK_BUFFERED-NEXT: store i64 [[TMP22]], ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 8 +// CHECK_BUFFERED-NEXT: [[PRINTBUFFNEXTPTR1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 8 +// CHECK_BUFFERED-NEXT: br label [[END_BLOCK]] +// +// CHECK_HOSTCALL-LABEL: @test_printf_str_int( +// CHECK_HOSTCALL-NEXT: entry: +// CHECK_HOSTCALL-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK_HOSTCALL-NEXT: [[S:%.*]] = alloca [4 x i8], align 1, addrspace(5) +// CHECK_HOSTCALL-NEXT: store i32 [[I:%.*]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] +// CHECK_HOSTCALL-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR3:[0-9]+]] +// CHECK_HOSTCALL-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 1 [[S]], ptr addrspace(4) align 1 @__const.test_printf_str_int.s, i64 4, i1 false) +// CHECK_HOSTCALL-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i8], ptr addrspace(5) [[S]], i64 0, i64 0 +// CHECK_HOSTCALL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] +// CHECK_HOSTCALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[ARRAYDECAY]] to ptr +// CHECK_HOSTCALL-NEXT: [[TMP2:%.*]] = call i64 @__ockl_printf_begin(i64 0) +// CHECK_HOSTCALL-NEXT: br i1 icmp eq (ptr addrspacecast (ptr addrspace(4) @.str.2 to ptr), ptr null), label [[STRLEN_JOIN:%.*]], label [[STRLEN_WHILE:%.*]] +// CHECK_HOSTCALL: strlen.while: +// CHECK_HOSTCALL-NEXT: [[TMP3:%.*]] = phi ptr [ addrspacecast (ptr addrspace(4) @.str.2 to ptr), [[ENTRY:%.*]] ], [ [[TMP4:%.*]], [[STRLEN_WHILE]] ] +// CHECK_HOSTCALL-NEXT: [[TMP4]] = getelementptr i8, ptr [[TMP3]], i64 1 +// CHECK_HOSTCALL-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1 +// CHECK_HOSTCALL-NEXT: [[TMP6:%.*]] = icmp eq i8 [[TMP5]], 0 +// CHECK_HOSTCALL-NEXT: br i1 [[TMP6]], label [[STRLEN_WHILE_DONE:%.*]], label [[STRLEN_WHILE]] +// CHECK_HOSTCALL: strlen.while.done: +// CHECK_HOSTCALL-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP3]] to i64 +// CHECK_HOSTCALL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], ptrtoint (ptr addrspacecast (ptr addrspace(4) @.str.2 to ptr) to i64) +// CHECK_HOSTCALL-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 1 +// CHECK_HOSTCALL-NEXT: br label [[STRLEN_JOIN]] +// CHECK_HOSTCALL: strlen.join: +// CHECK_HOSTCALL-NEXT: [[TMP10:%.*]] = phi i64 [ [[TMP9]], [[STRLEN_WHILE_DONE]] ], [ 0, [[ENTRY]] ] +// CHECK_HOSTCALL-NEXT: [[TMP11:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[TMP2]], ptr addrspacecast (ptr addrspace(4) @.str.2 to ptr), i64 [[TMP10]], i32 0) +// CHECK_HOSTCALL-NEXT: [[TMP12:%.*]] = icmp eq ptr [[TMP1]], null +// CHECK_HOSTCALL-NEXT: br i1 [[TMP12]], label [[STRLEN_JOIN1:%.*]], label [[STRLEN_WHILE2:%.*]] +// CHECK_HOSTCALL: strlen.while2: +// CHECK_HOSTCALL-NEXT: [[TMP13:%.*]] = phi ptr [ [[TMP1]], [[STRLEN_JOIN]] ], [ [[TMP14:%.*]], [[STRLEN_WHILE2]] ] +// CHECK_HOSTCALL-NEXT: [[TMP14]] = getelementptr i8, ptr [[TMP13]], i64 1 +// CHECK_HOSTCALL-NEXT: [[TMP15:%.*]] = load i8, ptr [[TMP13]], align 1 +// CHECK_HOSTCALL-NEXT: [[TMP16:%.*]] = icmp eq i8 [[TMP15]], 0 +// CHECK_HOSTCALL-NEXT: br i1 [[TMP16]], label [[STRLEN_WHILE_DONE3:%.*]], label [[STRLEN_WHILE2]] +// CHECK_HOSTCALL: strlen.while.done3: +// CHECK_HOSTCALL-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP1]] to i64 +// CHECK_HOSTCALL-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[TMP13]] to i64 +// CHECK_HOSTCALL-NEXT: [[TMP19:%.*]] = sub i64 [[TMP18]], [[TMP17]] +// CHECK_HOSTCALL-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 1 +// CHECK_HOSTCALL-NEXT: br label [[STRLEN_JOIN1]] +// CHECK_HOSTCALL: strlen.join1: +// CHECK_HOSTCALL-NEXT: [[TMP21:%.*]] = phi i64 [ [[TMP20]], [[STRLEN_WHILE_DONE3]] ], [ 0, [[STRLEN_JOIN]] ] +// CHECK_HOSTCALL-NEXT: [[TMP22:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[TMP11]], ptr [[TMP1]], i64 [[TMP21]], i32 0) +// CHECK_HOSTCALL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP0]] to i64 +// CHECK_HOSTCALL-NEXT: [[TMP24:%.*]] = call i64 @__ockl_printf_append_args(i64 [[TMP22]], i32 1, i64 [[TMP23]], i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 1) +// CHECK_HOSTCALL-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 +// CHECK_HOSTCALL-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR3]] +// CHECK_HOSTCALL-NEXT: ret void // __kernel void test_printf_str_int(int i) { char s[] = "foo"; diff --git a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp index 2195406c144c8ba..99c714d963d2884 100644 --- a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp +++ b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp @@ -26,28 +26,31 @@ using namespace llvm; #define DEBUG_TYPE "amdgpu-emit-printf" -static Value *fitArgInto64Bits(IRBuilder<> &Builder, Value *Arg) { +static Value *fitArgInto64Bits(IRBuilder<> &Builder, Value *Arg, + bool IsBuffered) { + const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout(); auto Int64Ty = Builder.getInt64Ty(); auto Ty = Arg->getType(); if (auto IntTy = dyn_cast<IntegerType>(Ty)) { - switch (IntTy->getBitWidth()) { - case 32: - return Builder.CreateZExt(Arg, Int64Ty); - case 64: - return Arg; + if (IntTy->getBitWidth() < 64) { + return Builder.CreateZExt(Arg, Builder.getInt64Ty()); } } - if (Ty->getTypeID() == Type::DoubleTyID) { + if (Ty->isFloatingPointTy()) { + if (DL.getTypeAllocSize(Ty) < 8) + Arg = Builder.CreateFPExt(Arg, Builder.getDoubleTy()); + if (IsBuffered) + return Arg; return Builder.CreateBitCast(Arg, Int64Ty); } - if (isa<PointerType>(Ty)) { + if (!IsBuffered && isa<PointerType>(Ty)) { return Builder.CreatePtrToInt(Arg, Int64Ty); } - llvm_unreachable("unexpected type"); + return Arg; } static Value *callPrintfBegin(IRBuilder<> &Builder, Value *Version) { @@ -74,8 +77,8 @@ static Value *callAppendArgs(IRBuilder<> &Builder, Value *Desc, int NumArgs, } static Value *appendArg(IRBuilder<> &Builder, Value *Desc, Value *Arg, - bool IsLast) { - auto Arg0 = fitArgInto64Bits(Builder, Arg); + bool IsLast, bool IsBuffered) { + auto Arg0 = fitArgInto64Bits(Builder, Arg, IsBuffered); auto Zero = Builder.getInt64(0); return callAppendArgs(Builder, Desc, 1, Arg0, Zero, Zero, Zero, Zero, Zero, Zero, IsLast); @@ -170,20 +173,49 @@ static Value *appendString(IRBuilder<> &Builder, Value *Desc, Value *Arg, return callAppendStringN(Builder, Desc, Arg, Length, IsLast); } +static Value *appendVectorArg(IRBuilder<> &Builder, Value *Desc, Value *Arg, + bool IsLast, bool IsBuffered) { + assert(Arg->getType()->isVectorTy() && "incorrent append* function"); + auto VectorTy = dyn_cast<FixedVectorType>(Arg->getType()); + auto Zero = Builder.getInt64(0); + if (VectorTy) { + for (unsigned int i = 0; i < VectorTy->getNumElements() - 1; i++) { + auto Val = Builder.CreateExtractElement(Arg, i); + Desc = callAppendArgs(Builder, Desc, 1, + fitArgInto64Bits(Builder, Val, IsBuffered), Zero, + Zero, Zero, Zero, Zero, Zero, false); + } + + Value* Val = + Builder.CreateExtractElement(Arg, VectorTy->getNumElements() - 1); + return callAppendArgs(Builder, Desc, 1, + fitArgInto64Bits(Builder, Val, IsBuffered), Zero, + Zero, Zero, Zero, Zero, Zero, IsLast); + } + return nullptr; +} + static Value *processArg(IRBuilder<> &Builder, Value *Desc, Value *Arg, - bool SpecIsCString, bool IsLast) { + bool SpecIsCString, bool IsVector, bool IsLast, + bool IsBuffered) { if (SpecIsCString && isa<PointerType>(Arg->getType())) { return appendString(Builder, Desc, Arg, IsLast); } - // If the format specifies a string but the argument is not, the frontend will - // have printed a warning. We just rely on undefined behaviour and send the - // argument anyway. - return appendArg(Builder, Desc, Arg, IsLast); + + if (IsVector) { + return appendVectorArg(Builder, Desc, Arg, IsLast, IsBuffered); + } + + // If the format specifies a string but the argument is not, the frontend + // will have printed a warning. We just rely on undefined behaviour and send + // the argument anyway. + return appendArg(Builder, Desc, Arg, IsLast, IsBuffered); } // Scan the format string to locate all specifiers, and mark the ones that // specify a string, i.e, the "%s" specifier with optional '*' characters. -static void locateCStrings(SparseBitVector<8> &BV, StringRef Str) { +static void locateCStringsAndVectors(SparseBitVector<8> &BV, + SparseBitVector<8> &OV, StringRef Str) { static const char ConvSpecifiers[] = "diouxXfFeEgGaAcspn"; size_t SpecPos = 0; // Skip the first argument, the format string. @@ -194,6 +226,8 @@ static void locateCStrings(SparseBitVector<8> &BV, StringRef Str) { SpecPos += 2; continue; } + if (Str.find_first_of("v", SpecPos) != StringRef::npos) + OV.set(ArgIdx); auto SpecEnd = Str.find_first_of(ConvSpecifiers, SpecPos); if (SpecEnd == StringRef::npos) return; @@ -224,7 +258,8 @@ struct StringData { static Value *callBufferedPrintfStart( IRBuilder<> &Builder, ArrayRef<Value *> Args, Value *Fmt, bool isConstFmtStr, SparseBitVector<8> &SpecIsCString, - SmallVectorImpl<StringData> &StringContents, Value *&ArgSize) { + SparseBitVector<8> &OCLVectors, SmallVectorImpl<StringData> &StringContents, + Value *&ArgSize) { Module *M = Builder.GetInsertBlock()->getModule(); Value *NonConstStrLen = nullptr; Value *LenWithNull = nullptr; @@ -278,7 +313,12 @@ static Value *callBufferedPrintfStart( StringData(StringRef(), LenWithNull, LenWithNullAligned, false)); } } else { - int AllocSize = M->getDataLayout().getTypeAllocSize(Args[i]->getType()); + int AllocSize = 0; + if (OCLVectors.test(i)) { + auto VecArg = cast<FixedVectorType>(Args[i]->getType()); + AllocSize = VecArg->getNumElements() * 8; + } else + AllocSize = M->getDataLayout().getTypeAllocSize(Args[i]->getType()); // We end up expanding non string arguments to 8 bytes // (args smaller than 8 bytes) BufSize += std::max(AllocSize, 8); @@ -352,30 +392,10 @@ static void processConstantStringArg(StringData *SD, IRBuilder<> &Builder, WhatToStore.push_back(ConstantInt::get(Builder.getInt32Ty(), 0)); } -static Value *processNonStringArg(Value *Arg, IRBuilder<> &Builder) { - const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout(); - auto Ty = Arg->getType(); - - if (auto IntTy = dyn_cast<IntegerType>(Ty)) { - if (IntTy->getBitWidth() < 64) { - return Builder.CreateZExt(Arg, Builder.getInt64Ty()); - } - } - - if (Ty->isFloatingPointTy()) { - if (DL.getTypeAllocSize(Ty) < 8) { - return Builder.CreateFPExt(Arg, Builder.getDoubleTy()); - } - } - - return Arg; -} - -static void -callBufferedPrintfArgPush(IRBuilder<> &Builder, ArrayRef<Value *> Args, - Value *PtrToStore, SparseBitVector<8> &SpecIsCString, - SmallVectorImpl<StringData> &StringContents, - bool IsConstFmtStr) { +static void callBufferedPrintfArgPush( + IRBuilder<> &Builder, ArrayRef<Value *> Args, Value *PtrToStore, + SparseBitVector<8> &SpecIsCString, SparseBitVector<8> &OCLVectors, + SmallVectorImpl<StringData> &StringContents, bool IsConstFmtStr) { Module *M = Builder.GetInsertBlock()->getModule(); const DataLayout &DL = M->getDataLayout(); auto StrIt = StringContents.begin(); @@ -407,7 +427,17 @@ callBufferedPrintfArgPush(IRBuilder<> &Builder, ArrayRef<Value *> Args, continue; } } else { - WhatToStore.push_back(processNonStringArg(Args[i], Builder)); + if (OCLVectors.test(i)) { + auto VectorTy = dyn_cast<FixedVectorType>(Args[i]->getType()); + auto VecArg = Args[i]; + for (unsigned int Num = 0; Num < VectorTy->getNumElements(); Num++) { + auto Val = Builder.CreateExtractElement(VecArg, Num); + WhatToStore.push_back( + fitArgInto64Bits(Builder, Val, /*IsBuffered*/ true)); + } + } else + WhatToStore.push_back( + fitArgInto64Bits(Builder, Args[i], /*IsBuffered*/ true)); } for (unsigned I = 0, E = WhatToStore.size(); I != E; ++I) { @@ -434,10 +464,11 @@ Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder, ArrayRef<Value *> Args, auto Fmt = Args[0]; SparseBitVector<8> SpecIsCString; + SparseBitVector<8> OCLVectors; StringRef FmtStr; if (getConstantStringInfo(Fmt, FmtStr)) - locateCStrings(SpecIsCString, FmtStr); + locateCStringsAndVectors(SpecIsCString, OCLVectors, FmtStr); if (IsBuffered) { SmallVector<StringData, 8> StringContents; @@ -448,9 +479,9 @@ Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder, ArrayRef<Value *> Args, bool IsConstFmtStr = !FmtStr.empty(); Value *ArgSize = nullptr; - Value *Ptr = - callBufferedPrintfStart(Builder, Args, Fmt, IsConstFmtStr, - SpecIsCString, StringContents, ArgSize); + Value *Ptr = callBufferedPrintfStart(Builder, Args, Fmt, IsConstFmtStr, + SpecIsCString, OCLVectors, + StringContents, ArgSize); // The buffered version still follows OpenCL printf standards for // printf return value, i.e 0 on success, -1 on failure. @@ -513,8 +544,8 @@ Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder, ArrayRef<Value *> Args, } // Push The printf arguments onto buffer - callBufferedPrintfArgPush(Builder, Args, Ptr, SpecIsCString, StringContents, - IsConstFmtStr); + callBufferedPrintfArgPush(Builder, Args, Ptr, SpecIsCString, OCLVectors, + StringContents, IsConstFmtStr); // End block, returns -1 on failure BranchInst::Create(End, ArgPush); @@ -531,7 +562,9 @@ Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder, ArrayRef<Value *> Args, for (unsigned int i = 1; i != NumOps; ++i) { bool IsLast = i == NumOps - 1; bool IsCString = SpecIsCString.test(i); - Desc = processArg(Builder, Desc, Args[i], IsCString, IsLast); + bool IsVector = OCLVectors.test(i); + Desc = processArg(Builder, Desc, Args[i], IsCString, IsVector, IsLast, + IsBuffered); } return Builder.CreateTrunc(Desc, Builder.getInt32Ty()); _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits