llvmbot created https://github.com/llvm/llvm-project/pull/102070
Backport 1762e01cca0186f1862db561cfd9019164b8c654

Requested by: @efriedma-quic

>From c0d8a44634041ad0a03a37a1af49e785c3145497 Mon Sep 17 00:00:00 2001
From: Eli Friedman <efrie...@quicinc.com>
Date: Thu, 1 Aug 2024 16:18:20 -0700
Subject: [PATCH] Fix codegen of consteval functions returning an empty class,
 and related issues (#93115)

Fix codegen of consteval functions returning an empty class, and
related issues

If a class is empty, don't store it to memory: the store might overwrite
useful data. Similarly, if a class has tail padding that might overlap
other fields, don't store the tail padding to memory.

The problem here turned out to be a bit more general than I initially
thought: basically all uses of EmitAggregateStore were broken. Call
lowering had a method that did mostly the right thing, though:
CreateCoercedStore. Adapt CreateCoercedStore so it always does the
conservatively right thing, and use it for both calls and ConstantExpr.

Also, along the way, fix the "overlap" bit in AggValueSlot: the bit was
set incorrectly for empty classes in some cases.

Fixes #93040.

(cherry picked from commit 1762e01cca0186f1862db561cfd9019164b8c654)
---
 clang/lib/CodeGen/CGCall.cpp                  | 146 ++++++++----------
 clang/lib/CodeGen/CGExprAgg.cpp               |  23 +--
 clang/lib/CodeGen/CodeGenFunction.h           |   7 +-
 clang/test/CodeGen/arm-mve-intrinsics/vld24.c |  43 ++++--
 clang/test/CodeGen/arm-vfp16-arguments2.cpp   |  10 +-
 .../amdgpu-kernel-arg-pointer-type.cu         |  11 +-
 clang/test/CodeGenCUDA/builtins-amdgcn.cu     | 125 +++++++--------
 .../test/CodeGenCUDA/builtins-spirv-amdgcn.cu | 123 +++++++--------
 .../CodeGenCXX/address-space-cast-coerce.cpp  |   6 +-
 clang/test/CodeGenCXX/cxx2a-consteval.cpp     |  24 ++-
 clang/test/CodeGenCXX/trivial_abi.cpp         |  20 +++
 clang/test/CodeGenHIP/dpp-const-fold.hip      |   8 +-
 .../spirv-amdgcn-dpp-const-fold.hip           |   8 +-
 .../CodeGenOpenCL/addr-space-struct-arg.cl    |  11 +-
 .../amdgpu-abi-struct-arg-byref.cl            |  42 +++--
 15 files changed, 320 insertions(+), 287 deletions(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 234a9c16e39df..70e2f37eb40e1 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1336,75 +1336,50 @@ static llvm::Value *CreateCoercedLoad(Address Src, llvm::Type *Ty,
   return CGF.Builder.CreateLoad(Tmp);
 }
 
-// Function to store a first-class aggregate into memory. We prefer to
-// store the elements rather than the aggregate to be more friendly to
-// fast-isel.
-// FIXME: Do we need to recurse here?
-void CodeGenFunction::EmitAggregateStore(llvm::Value *Val, Address Dest,
-                                         bool DestIsVolatile) {
-  // Prefer scalar stores to first-class aggregate stores.
-  if (llvm::StructType *STy = dyn_cast<llvm::StructType>(Val->getType())) {
-    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
-      Address EltPtr = Builder.CreateStructGEP(Dest, i);
-      llvm::Value *Elt = Builder.CreateExtractValue(Val, i);
-      Builder.CreateStore(Elt, EltPtr, DestIsVolatile);
-    }
-  } else {
-    Builder.CreateStore(Val, Dest, DestIsVolatile);
-  }
-}
-
-/// CreateCoercedStore - Create a store to \arg DstPtr from \arg Src,
-/// where the source and destination may have different types. The
-/// destination is known to be aligned to \arg DstAlign bytes.
-///
-/// This safely handles the case when the src type is larger than the
-/// destination type; the upper bits of the src will be lost.
-static void CreateCoercedStore(llvm::Value *Src,
-                               Address Dst,
-                               bool DstIsVolatile,
-                               CodeGenFunction &CGF) {
-  llvm::Type *SrcTy = Src->getType();
-  llvm::Type *DstTy = Dst.getElementType();
-  if (SrcTy == DstTy) {
-    CGF.Builder.CreateStore(Src, Dst, DstIsVolatile);
-    return;
-  }
-
-  llvm::TypeSize SrcSize = CGF.CGM.getDataLayout().getTypeAllocSize(SrcTy);
-
-  if (llvm::StructType *DstSTy = dyn_cast<llvm::StructType>(DstTy)) {
-    Dst = EnterStructPointerForCoercedAccess(Dst, DstSTy,
-                                             SrcSize.getFixedValue(), CGF);
-    DstTy = Dst.getElementType();
-  }
-
-  llvm::PointerType *SrcPtrTy = llvm::dyn_cast<llvm::PointerType>(SrcTy);
-  llvm::PointerType *DstPtrTy = llvm::dyn_cast<llvm::PointerType>(DstTy);
-  if (SrcPtrTy && DstPtrTy &&
-      SrcPtrTy->getAddressSpace() != DstPtrTy->getAddressSpace()) {
-    Src = CGF.Builder.CreateAddrSpaceCast(Src, DstTy);
-    CGF.Builder.CreateStore(Src, Dst, DstIsVolatile);
+void CodeGenFunction::CreateCoercedStore(llvm::Value *Src, Address Dst,
+                                         llvm::TypeSize DstSize,
+                                         bool DstIsVolatile) {
+  if (!DstSize)
     return;
-  }
 
-  // If the source and destination are integer or pointer types, just do an
-  // extension or truncation to the desired type.
-  if ((isa<llvm::IntegerType>(SrcTy) || isa<llvm::PointerType>(SrcTy)) &&
-      (isa<llvm::IntegerType>(DstTy) || isa<llvm::PointerType>(DstTy))) {
-    Src = CoerceIntOrPtrToIntOrPtr(Src, DstTy, CGF);
-    CGF.Builder.CreateStore(Src, Dst, DstIsVolatile);
-    return;
+  llvm::Type *SrcTy = Src->getType();
+  llvm::TypeSize SrcSize = CGM.getDataLayout().getTypeAllocSize(SrcTy);
+
+  // GEP into structs to try to make types match.
+  // FIXME: This isn't really that useful with opaque types, but it impacts a
+  // lot of regression tests.
+  if (SrcTy != Dst.getElementType()) {
+    if (llvm::StructType *DstSTy =
+            dyn_cast<llvm::StructType>(Dst.getElementType())) {
+      assert(!SrcSize.isScalable());
+      Dst = EnterStructPointerForCoercedAccess(Dst, DstSTy,
+                                               SrcSize.getFixedValue(), *this);
+    }
   }
 
-  llvm::TypeSize DstSize = CGF.CGM.getDataLayout().getTypeAllocSize(DstTy);
-
-  // If store is legal, just bitcast the src pointer.
-  if (isa<llvm::ScalableVectorType>(SrcTy) ||
-      isa<llvm::ScalableVectorType>(DstTy) ||
-      SrcSize.getFixedValue() <= DstSize.getFixedValue()) {
-    Dst = Dst.withElementType(SrcTy);
-    CGF.EmitAggregateStore(Src, Dst, DstIsVolatile);
+  if (SrcSize.isScalable() || SrcSize <= DstSize) {
+    if (SrcTy->isIntegerTy() && Dst.getElementType()->isPointerTy() &&
+        SrcSize == CGM.getDataLayout().getTypeAllocSize(Dst.getElementType())) {
+      // If the value is supposed to be a pointer, convert it before storing it.
+      Src = CoerceIntOrPtrToIntOrPtr(Src, Dst.getElementType(), *this);
+      Builder.CreateStore(Src, Dst, DstIsVolatile);
+    } else if (llvm::StructType *STy =
+                   dyn_cast<llvm::StructType>(Src->getType())) {
+      // Prefer scalar stores to first-class aggregate stores.
+      Dst = Dst.withElementType(SrcTy);
+      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+        Address EltPtr = Builder.CreateStructGEP(Dst, i);
+        llvm::Value *Elt = Builder.CreateExtractValue(Src, i);
+        Builder.CreateStore(Elt, EltPtr, DstIsVolatile);
+      }
+    } else {
+      Builder.CreateStore(Src, Dst.withElementType(SrcTy), DstIsVolatile);
+    }
+  } else if (SrcTy->isIntegerTy()) {
+    // If the source is a simple integer, coerce it directly.
+    llvm::Type *DstIntTy = Builder.getIntNTy(DstSize.getFixedValue() * 8);
+    Src = CoerceIntOrPtrToIntOrPtr(Src, DstIntTy, *this);
+    Builder.CreateStore(Src, Dst.withElementType(DstIntTy), DstIsVolatile);
   } else {
     // Otherwise do coercion through memory. This is stupid, but
     // simple.
@@ -1416,12 +1391,12 @@ static void CreateCoercedStore(llvm::Value *Src,
     // FIXME: Assert that we aren't truncating non-padding bits when have access
     // to that information.
     RawAddress Tmp =
-        CreateTempAllocaForCoercion(CGF, SrcTy, Dst.getAlignment());
-    CGF.Builder.CreateStore(Src, Tmp);
-    CGF.Builder.CreateMemCpy(
-        Dst.emitRawPointer(CGF), Dst.getAlignment().getAsAlign(),
-        Tmp.getPointer(), Tmp.getAlignment().getAsAlign(),
-        llvm::ConstantInt::get(CGF.IntPtrTy, DstSize.getFixedValue()));
+        CreateTempAllocaForCoercion(*this, SrcTy, Dst.getAlignment());
+    Builder.CreateStore(Src, Tmp);
+    Builder.CreateMemCpy(Dst.emitRawPointer(*this),
+                         Dst.getAlignment().getAsAlign(), Tmp.getPointer(),
+                         Tmp.getAlignment().getAsAlign(),
+                         Builder.CreateTypeSize(IntPtrTy, DstSize));
   }
 }
 
@@ -3309,7 +3284,12 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
       assert(NumIRArgs == 1);
       auto AI = Fn->getArg(FirstIRArg);
       AI->setName(Arg->getName() + ".coerce");
-      CreateCoercedStore(AI, Ptr, /*DstIsVolatile=*/false, *this);
+      CreateCoercedStore(
+          AI, Ptr,
+          llvm::TypeSize::getFixed(
+              getContext().getTypeSizeInChars(Ty).getQuantity() -
+              ArgI.getDirectOffset()),
+          /*DstIsVolatile=*/false);
     }
 
     // Match to what EmitParmDecl is expecting for this type.
@@ -5939,17 +5919,8 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
       llvm::Value *Imag = Builder.CreateExtractValue(CI, 1);
       return RValue::getComplex(std::make_pair(Real, Imag));
     }
-    case TEK_Aggregate: {
-      Address DestPtr = ReturnValue.getAddress();
-      bool DestIsVolatile = ReturnValue.isVolatile();
-
-      if (!DestPtr.isValid()) {
-        DestPtr = CreateMemTemp(RetTy, "agg.tmp");
-        DestIsVolatile = false;
-      }
-      EmitAggregateStore(CI, DestPtr, DestIsVolatile);
-      return RValue::getAggregate(DestPtr);
-    }
+    case TEK_Aggregate:
+      break;
     case TEK_Scalar: {
      // If the argument doesn't match, perform a bitcast to coerce it.
      // This can happen due to trivial type mismatches.
@@ -5959,7 +5930,6 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
       return RValue::get(V);
     }
     }
-    llvm_unreachable("bad evaluation kind");
   }
 
   // If coercing a fixed vector from a scalable vector for ABI
@@ -5981,10 +5951,13 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
 
     Address DestPtr = ReturnValue.getValue();
     bool DestIsVolatile = ReturnValue.isVolatile();
+    uint64_t DestSize =
+        getContext().getTypeInfoDataSizeInChars(RetTy).Width.getQuantity();
 
    if (!DestPtr.isValid()) {
      DestPtr = CreateMemTemp(RetTy, "coerce");
      DestIsVolatile = false;
+      DestSize = getContext().getTypeSizeInChars(RetTy).getQuantity();
    }
 
    // An empty record can overlap other data (if declared with
@@ -5993,7 +5966,10 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
     if (!isEmptyRecord(getContext(), RetTy, true)) {
       // If the value is offset in memory, apply the offset now.
       Address StorePtr = emitAddressAtOffset(*this, DestPtr, RetAI);
-      CreateCoercedStore(CI, StorePtr, DestIsVolatile, *this);
+      CreateCoercedStore(
+          CI, StorePtr,
+          llvm::TypeSize::getFixed(DestSize - RetAI.getDirectOffset()),
+          DestIsVolatile);
     }
 
     return convertTempToRValue(DestPtr, RetTy, SourceLocation());
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index c3c10e73ff05e..d9f44f4be617e 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -131,15 +131,12 @@ class AggExprEmitter : public StmtVisitor<AggExprEmitter> {
     EnsureDest(E->getType());
 
     if (llvm::Value *Result = ConstantEmitter(CGF).tryEmitConstantExpr(E)) {
-      Address StoreDest = Dest.getAddress();
-      // The emitted value is guaranteed to have the same size as the
-      // destination but can have a different type. Just do a bitcast in this
-      // case to avoid incorrect GEPs.
-      if (Result->getType() != StoreDest.getType())
-        StoreDest = StoreDest.withElementType(Result->getType());
-
-      CGF.EmitAggregateStore(Result, StoreDest,
-                             E->getType().isVolatileQualified());
+      CGF.CreateCoercedStore(
+          Result, Dest.getAddress(),
+          llvm::TypeSize::getFixed(
+              Dest.getPreferredSize(CGF.getContext(), E->getType())
+                  .getQuantity()),
+          E->getType().isVolatileQualified());
       return;
     }
     return Visit(E->getSubExpr());
@@ -2050,6 +2047,10 @@ CodeGenFunction::getOverlapForFieldInit(const FieldDecl *FD) {
   if (!FD->hasAttr<NoUniqueAddressAttr>() || !FD->getType()->isRecordType())
     return AggValueSlot::DoesNotOverlap;
 
+  // Empty fields can overlap earlier fields.
+  if (FD->getType()->getAsCXXRecordDecl()->isEmpty())
+    return AggValueSlot::MayOverlap;
+
   // If the field lies entirely within the enclosing class's nvsize, its tail
   // padding cannot overlap any already-initialized object. (The only subobjects
   // with greater addresses that might already be initialized are vbases.)
@@ -2072,6 +2073,10 @@ AggValueSlot::Overlap_t CodeGenFunction::getOverlapForBaseInit(
   if (IsVirtual)
     return AggValueSlot::MayOverlap;
 
+  // Empty bases can overlap earlier bases.
+  if (BaseRD->isEmpty())
+    return AggValueSlot::MayOverlap;
+
   // If the base class is laid out entirely within the nvsize of the derived
   // class, its tail padding cannot yet be initialized, so we can issue
   // stores at the full width of the base class.
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index ba7b565d97559..60e6841e1b3d6 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4838,9 +4838,10 @@ class CodeGenFunction : public CodeGenTypeCache {
   void EmitAggFinalDestCopy(QualType Type, AggValueSlot Dest, const LValue &Src,
                             ExprValueKind SrcKind);
 
-  /// Build all the stores needed to initialize an aggregate at Dest with the
-  /// value Val.
-  void EmitAggregateStore(llvm::Value *Val, Address Dest, bool DestIsVolatile);
+  /// Create a store to \arg DstPtr from \arg Src, truncating the stored value
+  /// to at most \arg DstSize bytes.
+  void CreateCoercedStore(llvm::Value *Src, Address Dst, llvm::TypeSize DstSize,
+                          bool DstIsVolatile);
 
   /// EmitExtendGCLifetime - Given a pointer to an Objective-C object,
   /// make sure it survives garbage collection until this point.
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vld24.c b/clang/test/CodeGen/arm-mve-intrinsics/vld24.c index 03c870e281549..15619bef5373d 100644 --- a/clang/test/CodeGen/arm-mve-intrinsics/vld24.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vld24.c @@ -48,10 +48,13 @@ uint8x16x4_t test_vld4q_u8(const uint8_t *addr) // CHECK-LABEL: @test_vst2q_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VALUE_COERCE_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_UINT32X4X2_T:%.*]] [[VALUE_COERCE:%.*]], 0, 0 -// CHECK-NEXT: [[VALUE_COERCE_FCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_UINT32X4X2_T]] [[VALUE_COERCE]], 0, 1 -// CHECK-NEXT: call void @llvm.arm.mve.vst2q.p0.v4i32(ptr [[ADDR:%.*]], <4 x i32> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <4 x i32> [[VALUE_COERCE_FCA_0_1_EXTRACT]], i32 0) -// CHECK-NEXT: call void @llvm.arm.mve.vst2q.p0.v4i32(ptr [[ADDR]], <4 x i32> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <4 x i32> [[VALUE_COERCE_FCA_0_1_EXTRACT]], i32 1) +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_UINT32X4X2_T:%.*]] [[VALUE_COERCE:%.*]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] poison, <4 x i32> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: call void @llvm.arm.mve.vst2q.p0.v4i32(ptr [[ADDR:%.*]], <4 x i32> [[DOTFCA_0_EXTRACT]], <4 x i32> [[DOTFCA_1_EXTRACT]], i32 0) +// CHECK-NEXT: call void @llvm.arm.mve.vst2q.p0.v4i32(ptr [[ADDR]], <4 x i32> [[DOTFCA_0_EXTRACT]], <4 x i32> [[DOTFCA_1_EXTRACT]], i32 1) // CHECK-NEXT: ret void // void test_vst2q_u32(uint32_t *addr, uint32x4x2_t value) @@ -65,14 +68,19 @@ void test_vst2q_u32(uint32_t *addr, uint32x4x2_t value) // CHECK-LABEL: @test_vst4q_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VALUE_COERCE_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T:%.*]] [[VALUE_COERCE:%.*]], 0, 0 -// CHECK-NEXT: [[VALUE_COERCE_FCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T]] [[VALUE_COERCE]], 0, 1 -// CHECK-NEXT: [[VALUE_COERCE_FCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T]] [[VALUE_COERCE]], 0, 2 -// CHECK-NEXT: [[VALUE_COERCE_FCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T]] [[VALUE_COERCE]], 0, 3 -// CHECK-NEXT: call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR:%.*]], <16 x i8> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_1_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_2_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_3_EXTRACT]], i32 0) -// CHECK-NEXT: call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR]], <16 x i8> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_1_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_2_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_3_EXTRACT]], i32 1) -// CHECK-NEXT: call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR]], <16 x i8> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_1_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_2_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_3_EXTRACT]], i32 2) -// CHECK-NEXT: call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR]], <16 x i8> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_1_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_2_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_3_EXTRACT]], i32 3) +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_INT8X16X4_T:%.*]] [[VALUE_COERCE:%.*]], 0 +// 
CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[TMP0]], 2 +// CHECK-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[TMP0]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] [[DOTFCA_0_1_INSERT]], <16 x i8> [[DOTFCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] [[DOTFCA_0_2_INSERT]], <16 x i8> [[DOTFCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR:%.*]], <16 x i8> [[DOTFCA_0_EXTRACT]], <16 x i8> [[DOTFCA_1_EXTRACT]], <16 x i8> [[DOTFCA_2_EXTRACT]], <16 x i8> [[DOTFCA_3_EXTRACT]], i32 0) +// CHECK-NEXT: call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR]], <16 x i8> [[DOTFCA_0_EXTRACT]], <16 x i8> [[DOTFCA_1_EXTRACT]], <16 x i8> [[DOTFCA_2_EXTRACT]], <16 x i8> [[DOTFCA_3_EXTRACT]], i32 1) +// CHECK-NEXT: call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR]], <16 x i8> [[DOTFCA_0_EXTRACT]], <16 x i8> [[DOTFCA_1_EXTRACT]], <16 x i8> [[DOTFCA_2_EXTRACT]], <16 x i8> [[DOTFCA_3_EXTRACT]], i32 2) +// CHECK-NEXT: call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR]], <16 x i8> [[DOTFCA_0_EXTRACT]], <16 x i8> [[DOTFCA_1_EXTRACT]], <16 x i8> [[DOTFCA_2_EXTRACT]], <16 x i8> [[DOTFCA_3_EXTRACT]], i32 3) // CHECK-NEXT: ret void // void test_vst4q_s8(int8_t *addr, int8x16x4_t value) @@ -86,10 +94,13 @@ void test_vst4q_s8(int8_t *addr, int8x16x4_t value) // CHECK-LABEL: @test_vst2q_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VALUE_COERCE_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T:%.*]] [[VALUE_COERCE:%.*]], 0, 0 -// CHECK-NEXT: [[VALUE_COERCE_FCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[VALUE_COERCE]], 0, 1 -// CHECK-NEXT: call void @llvm.arm.mve.vst2q.p0.v8f16(ptr [[ADDR:%.*]], <8 x half> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <8 x half> [[VALUE_COERCE_FCA_0_1_EXTRACT]], i32 0) -// CHECK-NEXT: call void @llvm.arm.mve.vst2q.p0.v8f16(ptr [[ADDR]], <8 x half> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <8 x half> [[VALUE_COERCE_FCA_0_1_EXTRACT]], i32 1) +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T:%.*]] [[VALUE_COERCE:%.*]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] poison, <8 x half> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x half> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: call void @llvm.arm.mve.vst2q.p0.v8f16(ptr [[ADDR:%.*]], <8 x half> [[DOTFCA_0_EXTRACT]], <8 x half> [[DOTFCA_1_EXTRACT]], i32 0) +// CHECK-NEXT: call void @llvm.arm.mve.vst2q.p0.v8f16(ptr [[ADDR]], <8 x half> [[DOTFCA_0_EXTRACT]], <8 x half> [[DOTFCA_1_EXTRACT]], i32 1) // CHECK-NEXT: ret void // void test_vst2q_f16(float16_t *addr, float16x8x2_t value) diff --git a/clang/test/CodeGen/arm-vfp16-arguments2.cpp b/clang/test/CodeGen/arm-vfp16-arguments2.cpp index 6221e85e856b4..b7c6852c47b7f 100644 --- 
a/clang/test/CodeGen/arm-vfp16-arguments2.cpp +++ b/clang/test/CodeGen/arm-vfp16-arguments2.cpp @@ -44,20 +44,20 @@ struct S1 f1(struct S1 s1) { return s1; } // CHECK-SOFT: define{{.*}} void @_Z2f22S2(ptr dead_on_unwind noalias nocapture writable writeonly sret(%struct.S2) align 8 %agg.result, [4 x i32] %s2.coerce) // CHECK-HARD: define{{.*}} arm_aapcs_vfpcc [2 x <2 x i32>] @_Z2f22S2([2 x <2 x i32>] returned %s2.coerce) -// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S2 @_Z2f22S2(%struct.S2 returned %s2.coerce) +// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S2 @_Z2f22S2(%struct.S2 %s2.coerce) struct S2 f2(struct S2 s2) { return s2; } // CHECK-SOFT: define{{.*}} void @_Z2f32S3(ptr dead_on_unwind noalias nocapture writable writeonly sret(%struct.S3) align 8 %agg.result, [2 x i64] %s3.coerce) // CHECK-HARD: define{{.*}} arm_aapcs_vfpcc [2 x <2 x i32>] @_Z2f32S3([2 x <2 x i32>] returned %s3.coerce) -// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S3 @_Z2f32S3(%struct.S3 returned %s3.coerce) +// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S3 @_Z2f32S3(%struct.S3 %s3.coerce) struct S3 f3(struct S3 s3) { return s3; } // CHECK-SOFT: define{{.*}} void @_Z2f42S4(ptr dead_on_unwind noalias nocapture writable writeonly sret(%struct.S4) align 8 %agg.result, [2 x i64] %s4.coerce) // CHECK-HARD: define{{.*}} arm_aapcs_vfpcc [2 x <2 x i32>] @_Z2f42S4([2 x <2 x i32>] returned %s4.coerce) -// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S4 @_Z2f42S4(%struct.S4 returned %s4.coerce) +// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S4 @_Z2f42S4(%struct.S4 %s4.coerce) struct S4 f4(struct S4 s4) { return s4; } // CHECK-SOFT: define{{.*}} void @_Z2f52S5(ptr dead_on_unwind noalias nocapture writable writeonly sret(%struct.S5) align 8 %agg.result, [2 x i64] %s5.coerce) -// CHECK-HARD: define{{.*}} arm_aapcs_vfpcc %struct.S5 @_Z2f52S5(%struct.S5 returned %s5.coerce) -// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S5 @_Z2f52S5(%struct.S5 returned %s5.coerce) +// CHECK-HARD: define{{.*}} arm_aapcs_vfpcc %struct.S5 @_Z2f52S5(%struct.S5 %s5.coerce) +// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S5 @_Z2f52S5(%struct.S5 %s5.coerce) struct S5 f5(struct S5 s5) { return s5; } diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu index a5135ab01f0f3..70c86cbb8c3d4 100644 --- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu +++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu @@ -16,9 +16,8 @@ // HOST: define{{.*}} void @_Z22__device_stub__kernel1Pi(ptr noundef %x) // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel1Pi(ptr addrspace(1){{.*}} %x.coerce) -// CHECK: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr // CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr -// OPT: [[VAL:%.*]] = load i32, ptr addrspace(1) %x.coerce, align 4, !amdgpu.noclobber ![[MD:[0-9]+]] +// OPT: [[VAL:%.*]] = load i32, ptr addrspace(1) %x.coerce, align 4{{$}} // OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1 // OPT: store i32 [[INC]], ptr addrspace(1) %x.coerce, align 4 // OPT: ret void @@ -28,9 +27,8 @@ __global__ void kernel1(int *x) { // HOST: define{{.*}} void @_Z22__device_stub__kernel2Ri(ptr noundef nonnull align 4 dereferenceable(4) %x) // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel2Ri(ptr addrspace(1){{.*}} nonnull align 4 dereferenceable(4) %x.coerce) -// CHECK: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr // CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) 
%{{.*}} to ptr -// OPT: [[VAL:%.*]] = load i32, ptr addrspace(1) %x.coerce, align 4, !amdgpu.noclobber ![[MD]] +// OPT: [[VAL:%.*]] = load i32, ptr addrspace(1) %x.coerce, align 4{{$}} // OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1 // OPT: store i32 [[INC]], ptr addrspace(1) %x.coerce, align 4 // OPT: ret void @@ -67,7 +65,7 @@ struct S { // OPT: [[R1:%.*]] = getelementptr inbounds i8, ptr addrspace(4) %0, i64 8 // OPT: [[P1:%.*]] = load ptr, ptr addrspace(4) [[R1]], align 8 // OPT: [[G1:%.*]] ={{.*}} addrspacecast ptr [[P1]] to ptr addrspace(1) -// OPT: [[V0:%.*]] = load i32, ptr addrspace(1) [[G0]], align 4, !amdgpu.noclobber ![[MD]] +// OPT: [[V0:%.*]] = load i32, ptr addrspace(1) [[G0]], align 4, !amdgpu.noclobber ![[MD:[0-9]+]] // OPT: [[INC:%.*]] = add nsw i32 [[V0]], 1 // OPT: store i32 [[INC]], ptr addrspace(1) [[G0]], align 4 // OPT: [[V1:%.*]] = load float, ptr addrspace(1) [[G1]], align 4 @@ -126,9 +124,8 @@ struct SS { }; // HOST: define{{.*}} void @_Z22__device_stub__kernel82SS(ptr %a.coerce) // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel82SS(ptr addrspace(1){{.*}} %a.coerce) -// CHECK: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr // CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr -// OPT: [[VAL:%.*]] = load float, ptr addrspace(1) %a.coerce, align 4, !amdgpu.noclobber ![[MD]] +// OPT: [[VAL:%.*]] = load float, ptr addrspace(1) %a.coerce, align 4{{$}} // OPT: [[INC:%.*]] = fadd contract float [[VAL]], 3.000000e+00 // OPT: store float [[INC]], ptr addrspace(1) %a.coerce, align 4 // OPT: ret void diff --git a/clang/test/CodeGenCUDA/builtins-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-amdgcn.cu index 2e88afac813f4..4bf23e529c7a5 100644 --- a/clang/test/CodeGenCUDA/builtins-amdgcn.cu +++ b/clang/test/CodeGenCUDA/builtins-amdgcn.cu @@ -17,17 +17,16 @@ // CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr // CHECK-NEXT: [[DISPATCH_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DISPATCH_PTR]] to ptr -// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr -// CHECK-NEXT: store ptr [[TMP0]], ptr [[OUT_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8 // CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() -// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr -// CHECK-NEXT: store ptr [[TMP2]], ptr [[DISPATCH_PTR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DISPATCH_PTR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DISPATCH_PTR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DISPATCH_PTR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 // 
CHECK-NEXT: ret void // __global__ void use_dispatch_ptr(int* out) { @@ -43,17 +42,16 @@ __global__ void use_dispatch_ptr(int* out) { // CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr // CHECK-NEXT: [[QUEUE_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[QUEUE_PTR]] to ptr -// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr -// CHECK-NEXT: store ptr [[TMP0]], ptr [[OUT_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8 // CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() -// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr -// CHECK-NEXT: store ptr [[TMP2]], ptr [[QUEUE_PTR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[QUEUE_PTR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr +// CHECK-NEXT: store ptr [[TMP1]], ptr [[QUEUE_PTR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[QUEUE_PTR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // __global__ void use_queue_ptr(int* out) { @@ -69,17 +67,16 @@ __global__ void use_queue_ptr(int* out) { // CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr // CHECK-NEXT: [[IMPLICITARG_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IMPLICITARG_PTR]] to ptr -// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr -// CHECK-NEXT: store ptr [[TMP0]], ptr [[OUT_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8 // CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr -// CHECK-NEXT: store ptr [[TMP2]], ptr [[IMPLICITARG_PTR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[IMPLICITARG_PTR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr +// CHECK-NEXT: store ptr [[TMP1]], ptr [[IMPLICITARG_PTR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[IMPLICITARG_PTR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 
[[TMP3]], ptr [[TMP4]], align 4 // CHECK-NEXT: ret void // __global__ void use_implicitarg_ptr(int* out) { @@ -134,16 +131,15 @@ __global__ void test_ds_fadd(float src) { // CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr // CHECK-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SHARED_ADDR]] to ptr // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr -// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[SHARED_COERCE:%.*]] to ptr -// CHECK-NEXT: store ptr [[TMP0]], ptr [[SHARED_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr [[SHARED_ASCAST]], align 8 // CHECK-NEXT: [[SHARED1:%.*]] = load ptr, ptr [[SHARED_ASCAST]], align 8 // CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 // CHECK-NEXT: store ptr [[SHARED1]], ptr [[SHARED_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3) -// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4 -// CHECK-NEXT: store volatile float [[TMP4]], ptr [[X_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3) +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4 +// CHECK-NEXT: store volatile float [[TMP3]], ptr [[X_ASCAST]], align 4 // CHECK-NEXT: ret void // __global__ void test_ds_fmin(float src, float *shared) { @@ -184,17 +180,16 @@ __global__ void endpgm() { // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr // CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr -// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr -// CHECK-NEXT: store ptr [[TMP0]], ptr [[OUT_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8 // CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i64 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i64 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[B_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 [[TMP1]], i64 [[TMP2]], i32 35) -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store i64 [[TMP3]], ptr [[TMP4]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 [[TMP0]], i64 [[TMP1]], i32 35) +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[TMP2]], ptr [[TMP3]], align 8 // CHECK-NEXT: ret void // __global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, unsigned long long b) @@ -210,13 +205,12 @@ 
__global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, un // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr -// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr -// CHECK-NEXT: store ptr [[TMP0]], ptr [[OUT_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8 // CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.s.memtime() -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store i64 [[TMP1]], ptr [[TMP2]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime() +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[TMP0]], ptr [[TMP1]], align 8 // CHECK-NEXT: ret void // __global__ void test_s_memtime(unsigned long long* out) @@ -237,18 +231,17 @@ __device__ void func(float *x); // CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr // CHECK-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SHARED_ADDR]] to ptr // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr -// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[SHARED_COERCE:%.*]] to ptr -// CHECK-NEXT: store ptr [[TMP0]], ptr [[SHARED_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr [[SHARED_ASCAST]], align 8 // CHECK-NEXT: [[SHARED1:%.*]] = load ptr, ptr [[SHARED_ASCAST]], align 8 // CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 // CHECK-NEXT: store ptr [[SHARED1]], ptr [[SHARED_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3) -// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4 -// CHECK-NEXT: store volatile float [[TMP4]], ptr [[X_ASCAST]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8 -// CHECK-NEXT: call void @_Z4funcPf(ptr noundef [[TMP5]]) #[[ATTR7:[0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3) +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4 +// CHECK-NEXT: store volatile float [[TMP3]], ptr [[X_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8 +// CHECK-NEXT: call void @_Z4funcPf(ptr noundef [[TMP4]]) #[[ATTR7:[0-9]+]] // CHECK-NEXT: ret void // __global__ void test_ds_fmin_func(float src, float *__restrict shared) { @@ -264,14 +257,13 @@ __global__ void test_ds_fmin_func(float src, float *__restrict shared) { // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr // CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr // CHECK-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr -// 
CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[X_COERCE:%.*]] to ptr -// CHECK-NEXT: store ptr [[TMP0]], ptr [[X_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr [[X_ASCAST]], align 8 // CHECK-NEXT: [[X1:%.*]] = load ptr, ptr [[X_ASCAST]], align 8 // CHECK-NEXT: store ptr [[X1]], ptr [[X_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[TMP1]]) -// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TMP2]] to i8 -// CHECK-NEXT: store i8 [[FROMBOOL]], ptr [[RET_ASCAST]], align 1 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[TMP0]]) +// CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP1]] to i8 +// CHECK-NEXT: store i8 [[STOREDV]], ptr [[RET_ASCAST]], align 1 // CHECK-NEXT: ret void // __global__ void test_is_shared(float *x){ @@ -286,14 +278,13 @@ __global__ void test_is_shared(float *x){ // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr // CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr // CHECK-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr -// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[X_COERCE:%.*]] to ptr -// CHECK-NEXT: store ptr [[TMP0]], ptr [[X_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr [[X_ASCAST]], align 8 // CHECK-NEXT: [[X1:%.*]] = load ptr, ptr [[X_ASCAST]], align 8 // CHECK-NEXT: store ptr [[X1]], ptr [[X_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[TMP1]]) -// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TMP2]] to i8 -// CHECK-NEXT: store i8 [[FROMBOOL]], ptr [[RET_ASCAST]], align 1 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[TMP0]]) +// CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP1]] to i8 +// CHECK-NEXT: store i8 [[STOREDV]], ptr [[RET_ASCAST]], align 1 // CHECK-NEXT: ret void // __global__ void test_is_private(int *x){ diff --git a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu index 32851805298f1..1cbe358910b85 100644 --- a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu +++ b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu @@ -17,16 +17,15 @@ // CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4) // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4) // CHECK-NEXT: [[DISPATCH_PTR_ASCAST:%.*]] = addrspacecast ptr [[DISPATCH_PTR]] to ptr addrspace(4) -// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4) -// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8 // CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = call align 4 dereferenceable(64) addrspace(4) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() -// CHECK-NEXT: store ptr addrspace(4) [[TMP1]], ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr 
addrspace(4), ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[TMP4]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = call align 4 dereferenceable(64) addrspace(4) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(4) [[TMP3]], align 4 // CHECK-NEXT: ret void // __global__ void use_dispatch_ptr(int* out) { @@ -42,16 +41,15 @@ __global__ void use_dispatch_ptr(int* out) { // CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4) // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4) // CHECK-NEXT: [[QUEUE_PTR_ASCAST:%.*]] = addrspacecast ptr [[QUEUE_PTR]] to ptr addrspace(4) -// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4) -// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8 // CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.queue.ptr() -// CHECK-NEXT: store ptr addrspace(4) [[TMP1]], ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[TMP4]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.queue.ptr() +// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(4) [[TMP3]], align 4 // CHECK-NEXT: ret void // __global__ void use_queue_ptr(int* out) { @@ -67,16 +65,15 @@ __global__ void use_queue_ptr(int* out) { // CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4) // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4) // CHECK-NEXT: [[IMPLICITARG_PTR_ASCAST:%.*]] = addrspacecast ptr [[IMPLICITARG_PTR]] to ptr addrspace(4) -// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4) -// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8 // CHECK-NEXT: 
[[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -// CHECK-NEXT: store ptr addrspace(4) [[TMP1]], ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[TMP4]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(4) [[TMP3]], align 4 // CHECK-NEXT: ret void // __global__ void use_implicitarg_ptr(int* out) { @@ -131,16 +128,15 @@ __global__ void test_ds_fadd(float src) { // CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4) // CHECK-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4) // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) -// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[SHARED_COERCE:%.*]] to ptr addrspace(4) -// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[SHARED_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(4) [[SHARED_ASCAST]], align 8 // CHECK-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8 // CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4 // CHECK-NEXT: store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr addrspace(3) -// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4 -// CHECK-NEXT: store volatile float [[TMP4]], ptr addrspace(4) [[X_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr addrspace(3) +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4 +// CHECK-NEXT: store volatile float [[TMP3]], ptr addrspace(4) [[X_ASCAST]], align 4 // CHECK-NEXT: ret void // __global__ void test_ds_fmin(float src, float *shared) { @@ -175,17 +171,16 @@ __global__ void endpgm() { // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4) // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) // CHECK-NEXT: 
[[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) -// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4) -// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8 // CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i64 [[A:%.*]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i64 [[B:%.*]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(4) [[B_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = call addrspace(4) i64 @llvm.amdgcn.icmp.i64.i64(i64 [[TMP1]], i64 [[TMP2]], i32 35) -// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store i64 [[TMP3]], ptr addrspace(4) [[TMP4]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(4) [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = call addrspace(4) i64 @llvm.amdgcn.icmp.i64.i64(i64 [[TMP0]], i64 [[TMP1]], i32 35) +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[TMP2]], ptr addrspace(4) [[TMP3]], align 8 // CHECK-NEXT: ret void // __global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, unsigned long long b) @@ -201,13 +196,12 @@ __global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, un // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4) // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4) -// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4) -// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8 // CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = call addrspace(4) i64 @llvm.amdgcn.s.memtime() -// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 -// CHECK-NEXT: store i64 [[TMP1]], ptr addrspace(4) [[TMP2]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call addrspace(4) i64 @llvm.amdgcn.s.memtime() +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[TMP0]], ptr addrspace(4) [[TMP1]], align 8 // CHECK-NEXT: ret void // __global__ void test_s_memtime(unsigned long long* out) @@ -228,18 +222,17 @@ __device__ void func(float *x); // CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4) // CHECK-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4) // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) -// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) 
[[SHARED_COERCE:%.*]] to ptr addrspace(4)
-// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[SHARED_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(4) [[SHARED_ASCAST]], align 8
 // CHECK-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8
 // CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
 // CHECK-NEXT: store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr addrspace(3)
-// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
-// CHECK-NEXT: store volatile float [[TMP4]], ptr addrspace(4) [[X_ASCAST]], align 4
-// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT: call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP5]]) #[[ATTR6:[0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4
+// CHECK-NEXT: store volatile float [[TMP3]], ptr addrspace(4) [[X_ASCAST]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// CHECK-NEXT: call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP4]]) #[[ATTR6:[0-9]+]]
 // CHECK-NEXT: ret void
 //
 __global__ void test_ds_fmin_func(float src, float *__restrict shared) {
@@ -255,15 +248,14 @@ __global__ void test_ds_fmin_func(float src, float *__restrict shared) {
 // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
 // CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4)
 // CHECK-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4)
-// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[X_COERCE:%.*]] to ptr addrspace(4)
-// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[X_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(4) [[X_ASCAST]], align 8
 // CHECK-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8
 // CHECK-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr
-// CHECK-NEXT: [[TMP3:%.*]] = call addrspace(4) i1 @llvm.amdgcn.is.shared(ptr [[TMP2]])
-// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TMP3]] to i8
-// CHECK-NEXT: store i8 [[FROMBOOL]], ptr addrspace(4) [[RET_ASCAST]], align 1
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+// CHECK-NEXT: [[TMP2:%.*]] = call addrspace(4) i1 @llvm.amdgcn.is.shared(ptr [[TMP1]])
+// CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP2]] to i8
+// CHECK-NEXT: store i8 [[STOREDV]], ptr addrspace(4) [[RET_ASCAST]], align 1
 // CHECK-NEXT: ret void
 //
 __global__ void test_is_shared(float *x){
@@ -278,15 +270,14 @@ __global__ void test_is_shared(float *x){
 // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
 // CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4)
 // CHECK-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4)
-// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[X_COERCE:%.*]] to ptr addrspace(4)
-// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[X_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(4) [[X_ASCAST]], align 8
 // CHECK-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8
 // CHECK-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr
-// CHECK-NEXT: [[TMP3:%.*]] = call addrspace(4) i1 @llvm.amdgcn.is.private(ptr [[TMP2]])
-// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TMP3]] to i8
-// CHECK-NEXT: store i8 [[FROMBOOL]], ptr addrspace(4) [[RET_ASCAST]], align 1
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+// CHECK-NEXT: [[TMP2:%.*]] = call addrspace(4) i1 @llvm.amdgcn.is.private(ptr [[TMP1]])
+// CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP2]] to i8
+// CHECK-NEXT: store i8 [[STOREDV]], ptr addrspace(4) [[RET_ASCAST]], align 1
 // CHECK-NEXT: ret void
 //
 __global__ void test_is_private(int *x){
diff --git a/clang/test/CodeGenCXX/address-space-cast-coerce.cpp b/clang/test/CodeGenCXX/address-space-cast-coerce.cpp
index 7279b6c7f23a0..1ad46042b6efd 100644
--- a/clang/test/CodeGenCXX/address-space-cast-coerce.cpp
+++ b/clang/test/CodeGenCXX/address-space-cast-coerce.cpp
@@ -46,9 +46,9 @@ int mane() {
     char1 f1{1};
    char1 f2{1};
-// CHECK: [[TMP:%.+]] = alloca i16
-// CHECK: [[COERCE:%.+]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %{{.+}}, ptr align 2 [[COERCE]], i64 1, i1 false)
+// CHECK: [[CALL:%.*]] = call i16
+// CHECK: [[TRUNC:%.*]] = trunc i16 [[CALL]] to i8
+// CHECK: store i8 [[TRUNC]]
    char1 f3 = f1 + f2;
 }
diff --git a/clang/test/CodeGenCXX/cxx2a-consteval.cpp b/clang/test/CodeGenCXX/cxx2a-consteval.cpp
index 075cab58358ab..6c09053a74d24 100644
--- a/clang/test/CodeGenCXX/cxx2a-consteval.cpp
+++ b/clang/test/CodeGenCXX/cxx2a-consteval.cpp
@@ -1,4 +1,3 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // RUN: %clang_cc1 -emit-llvm %s -std=c++2a -triple x86_64-unknown-linux-gnu -o %t.ll
 // RUN: FileCheck -check-prefix=EVAL -input-file=%t.ll %s
 // RUN: FileCheck -check-prefix=EVAL-STATIC -input-file=%t.ll %s
@@ -275,3 +274,26 @@ void f() {
 // EVAL-FN: call void @_ZN7GH821542S3C2Ei
 }
 }
+
+namespace GH93040 {
+struct C { char c = 1; };
+struct Empty { consteval Empty() {} };
+struct Empty2 { consteval Empty2() {} };
+struct Test : C, Empty {
+  [[no_unique_address]] Empty2 e;
+};
+static_assert(sizeof(Test) == 1);
+void f() {
+  Test test;
+
+// Make sure we don't overwrite the initialization of c.
+
+// EVAL-FN-LABEL: define {{.*}} void @_ZN7GH930404TestC2Ev
+// EVAL-FN: entry:
+// EVAL-FN-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// EVAL-FN-NEXT: store ptr {{.*}}, ptr [[THIS_ADDR]], align 8
+// EVAL-FN-NEXT: [[THIS:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// EVAL-FN-NEXT: call void @_ZN7GH930401CC2Ev(ptr noundef nonnull align 1 dereferenceable(1) [[THIS]])
+// EVAL-FN-NEXT: ret void
+}
+}
diff --git a/clang/test/CodeGenCXX/trivial_abi.cpp b/clang/test/CodeGenCXX/trivial_abi.cpp
index 3012b0f2bc33d..54912a617c287 100644
--- a/clang/test/CodeGenCXX/trivial_abi.cpp
+++ b/clang/test/CodeGenCXX/trivial_abi.cpp
@@ -262,6 +262,26 @@ void testExceptionLarge() {
   calleeExceptionLarge(Large(), Large());
 }
 
+// CHECK: define void @_ZN7GH930401gEPNS_1SE
+// CHECK: [[CALL:%.*]] = call i64 @_ZN7GH930401fEv
+// CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[CALL]] to i56
+// CHECK-NEXT: store i56 [[TRUNC]]
+// CHECK-NEXT: ret void
+void* operator new(unsigned long, void*);
+namespace GH93040 {
+struct [[clang::trivial_abi]] S {
+  char a;
+  int x;
+  __attribute((aligned(2))) char y;
+  S();
+} __attribute((packed));
+S f();
+void g(S* s) { new(s) S(f()); }
+struct S2 { [[no_unique_address]] S s; char c;};
+static_assert(sizeof(S) == 8 && sizeof(S2) == 8, "");
+}
+
+
 // PR42961
 // CHECK: define{{.*}} @"_ZN3$_08__invokeEv"()
diff --git a/clang/test/CodeGenHIP/dpp-const-fold.hip b/clang/test/CodeGenHIP/dpp-const-fold.hip
index f5a97c6b0e77f..c5450ec4b8413 100644
--- a/clang/test/CodeGenHIP/dpp-const-fold.hip
+++ b/clang/test/CodeGenHIP/dpp-const-fold.hip
@@ -22,25 +22,25 @@ constexpr static bool BountCtrl()
   return true & false;
 }
 
-// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 16, i32 0, i32 0, i1 false)
+// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 16, i32 0, i32 0, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_2(int* out, int a, int b)
 {
   *out = __builtin_amdgcn_update_dpp(a, b, OpCtrl(), 0, 0, false);
 }
 
-// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 4, i32 0, i1 false)
+// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 4, i32 0, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_3(int* out, int a, int b)
 {
   *out = __builtin_amdgcn_update_dpp(a, b, 0, RowMask(), 0, false);
 }
 
-// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 0, i32 3, i1 false)
+// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 0, i32 3, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_4(int* out, int a, int b)
 {
   *out = __builtin_amdgcn_update_dpp(a, b, 0, 0, BankMask(), false);
 }
 
-// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 0, i32 0, i1 false)
+// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 0, i32 0, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_5(int* out, int a, int b)
 {
   *out = __builtin_amdgcn_update_dpp(a, b, 0, 0, 0, BountCtrl());
diff --git a/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip b/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip
index 2b785200e8eea..71270bc1c68d8 100644
--- a/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip
+++ b/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip
@@ -21,25 +21,25 @@ constexpr static bool BountCtrl()
   return true & false;
 }
 
-// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 16, i32 0, i32 0, i1 false)
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 16, i32 0, i32 0, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_2(int* out, int a, int b)
 {
   *out = __builtin_amdgcn_update_dpp(a, b, OpCtrl(), 0, 0, false);
 }
 
-// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 4, i32 0, i1 false)
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 4, i32 0, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_3(int* out, int a, int b)
 {
   *out = __builtin_amdgcn_update_dpp(a, b, 0, RowMask(), 0, false);
 }
 
-// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 0, i32 3, i1 false)
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 0, i32 3, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_4(int* out, int a, int b)
 {
   *out = __builtin_amdgcn_update_dpp(a, b, 0, 0, BankMask(), false);
 }
 
-// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 0, i32 0, i1 false)
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 0, i32 0, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_5(int* out, int a, int b)
 {
   *out = __builtin_amdgcn_update_dpp(a, b, 0, 0, 0, BountCtrl());
diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
index 385f8a753cd8e..1651cb379a20f 100644
--- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
+++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
@@ -145,7 +145,9 @@ kernel void KernelOneMemberSpir(global struct StructOneMember* u) {
 // AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelLargeOneMember(
 // AMDGCN: %[[U:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN: store %struct.LargeStructOneMember %u.coerce, ptr addrspace(5) %[[U]], align 8
+// AMDGCN: %[[U_ELEM:.*]] = getelementptr inbounds %struct.LargeStructOneMember, ptr addrspace(5) %[[U]], i32 0, i32 0
+// AMDGCN: %[[EXTRACT:.*]] = extractvalue %struct.LargeStructOneMember %u.coerce, 0
+// AMDGCN: store [100 x <2 x i32>] %[[EXTRACT]], ptr addrspace(5) %[[U_ELEM]], align 8
 // AMDGCN: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[U]])
 kernel void KernelLargeOneMember(struct LargeStructOneMember u) {
   FuncOneLargeMember(u);
@@ -177,7 +179,12 @@ kernel void KernelTwoMember(struct StructTwoMember u) {
 // AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelLargeTwoMember
 // AMDGCN-SAME: (%struct.LargeStructTwoMember %[[u_coerce:.*]])
 // AMDGCN: %[[u:.*]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5)
-// AMDGCN: store %struct.LargeStructTwoMember %[[u_coerce]], ptr addrspace(5) %[[u]]
+// AMDGCN: %[[U_PTR0:.*]] = getelementptr inbounds %struct.LargeStructTwoMember, ptr addrspace(5) %[[u]], i32 0, i32 0
+// AMDGCN: %[[EXTRACT0:.*]] = extractvalue %struct.LargeStructTwoMember %u.coerce, 0
+// AMDGCN: store [40 x <2 x i32>] %[[EXTRACT0]], ptr addrspace(5) %[[U_PTR0]]
+// AMDGCN: %[[U_PTR1:.*]] = getelementptr inbounds %struct.LargeStructTwoMember, ptr addrspace(5) %[[u]], i32 0, i32 1
+// AMDGCN: %[[EXTRACT1:.*]] = extractvalue %struct.LargeStructTwoMember %u.coerce, 1
+// AMDGCN: store [20 x <2 x i32>] %[[EXTRACT1]], ptr addrspace(5) %[[U_PTR1]]
 // AMDGCN: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref(%struct.LargeStructTwoMember) align 8 %[[u]])
 kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
   FuncLargeTwoMember(u);
diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
index fa83a38a01b0a..fe0a2f9578db0 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
@@ -61,7 +61,7 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) {
 // the return value.
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker
-// AMDGCN-SAME: (ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
+// AMDGCN-SAME: (ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
 // AMDGCN-NEXT: entry:
 // AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
@@ -74,7 +74,7 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) {
 // AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1
 // AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
 // AMDGCN-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
-// AMDGCN-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @foo([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]]
+// AMDGCN-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]]
 // AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0
 // AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0
 // AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4
@@ -98,7 +98,7 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) {
 }
 
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker_large
-// AMDGCN-SAME: (ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !8 !kernel_arg_base_type !8 !kernel_arg_type_qual !7 {
+// AMDGCN-SAME: (ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
 // AMDGCN-NEXT: entry:
 // AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
@@ -168,7 +168,7 @@ void test_indirect_arg_globl(void) {
 #endif
 
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local
-// AMDGCN-SAME: () #[[ATTR1]] !kernel_arg_addr_space !9 !kernel_arg_access_qual !9 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !9 {
+// AMDGCN-SAME: () #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
 // AMDGCN-NEXT: entry:
 // AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
 // AMDGCN-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
@@ -193,7 +193,7 @@ void test_indirect_arg_private(void) {
 }
 
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMember
-// AMDGCN-SAME: (<2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space !10 !kernel_arg_access_qual !11 !kernel_arg_type !12 !kernel_arg_base_type !12 !kernel_arg_type_qual !13 {
+// AMDGCN-SAME: (<2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
 // AMDGCN-NEXT: entry:
 // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
 // AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
@@ -208,7 +208,7 @@ kernel void KernelOneMember(struct StructOneMember u) {
 }
 
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir
-// AMDGCN-SAME: (ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space !14 !kernel_arg_access_qual !11 !kernel_arg_type !15 !kernel_arg_base_type !15 !kernel_arg_type_qual !13 {
+// AMDGCN-SAME: (ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN-NEXT: entry:
 // AMDGCN-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
@@ -223,10 +223,12 @@ kernel void KernelOneMemberSpir(global struct StructOneMember* u) {
 }
 
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember
-// AMDGCN-SAME: ([[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space !10 !kernel_arg_access_qual !11 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !13 {
+// AMDGCN-SAME: ([[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN-NEXT: entry:
 // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT: store [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], ptr addrspace(5) [[U]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
+// AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
 // AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
 // AMDGCN-NEXT: ret void
 //
@@ -271,15 +273,20 @@ void FuncLargeTwoMember(struct LargeStructTwoMember u) {
 }
 
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember
-// AMDGCN-SAME: ([[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space !10 !kernel_arg_access_qual !11 !kernel_arg_type !17 !kernel_arg_base_type !17 !kernel_arg_type_qual !13 {
+// AMDGCN-SAME: ([[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN-NEXT: entry:
 // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT: store [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], ptr addrspace(5) [[U]], align 8
 // AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
 // AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP2]], align 8
-// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP1]], <2 x i32> [[TMP3]]) #[[ATTR3]]
+// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
+// AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8
+// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]]
 // AMDGCN-NEXT: ret void
 //
 kernel void KernelTwoMember(struct StructTwoMember u) {
@@ -287,10 +294,15 @@ kernel void KernelTwoMember(struct StructTwoMember u) {
 }
 
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember
-// AMDGCN-SAME: ([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space !10 !kernel_arg_access_qual !11 !kernel_arg_type !18 !kernel_arg_base_type !18 !kernel_arg_type_qual !13 {
+// AMDGCN-SAME: ([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN-NEXT: entry:
 // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT: store [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], ptr addrspace(5) [[U]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
 // AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
 // AMDGCN-NEXT: ret void
 //
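For anyone triaging from the issue rather than from the tests: the GH93040 case exercised by the cxx2a-consteval.cpp hunk above boils down to a small standalone program. The sketch below is illustrative only and not part of the patch; the `main`-based runtime check is an assumption about how the clobber manifests, distilled from the test's "Make sure we don't overwrite the initialization of c." comment.

// Minimal reproducer sketch for GH93040 (compile with -std=c++20).
// The main() check is hypothetical; the patched compiler must leave
// C::c untouched when emitting the consteval constructor results.
struct C { char c = 1; };
struct Empty { consteval Empty() {} };
struct Empty2 { consteval Empty2() {} };
struct Test : C, Empty {
  [[no_unique_address]] Empty2 e;   // overlaps the byte holding C::c
};
static_assert(sizeof(Test) == 1);   // all subobjects share one byte

int main() {
  Test test;
  // Before the fix, the empty consteval results could be stored back
  // over the byte C's constructor set to 1, so this returned nonzero.
  return test.c == 1 ? 0 : 1;
}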