llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-clang-codegen

Author: Paul Walker (paulwalker-arm)

<details>
<summary>Changes</summary>

I missed a codepath during PR108008, so SME2/SVE2p1 builtins are converting their struct return type into a large vector, which causes unnecessary casting via memory.

---

Patch is 2.20 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/109423.diff

30 Files Affected:

- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+25-41)
- (modified) clang/lib/CodeGen/CodeGenFunction.h (+2-6)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c (+48-480)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c (+18-176)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c (+2-16)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_faminmax.c (+24-240)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c (+16-160)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c (+18-144)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c (+18-216)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c (+18-144)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c (+14-168)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c (+96-960)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c (+32-320)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c (+96-960)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c (+32-320)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c (+145-1441)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sqdmulh.c (+32-320)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c (+12-96)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c (+12-144)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c (+32-320)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c (+64-640)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c (+24-192)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c (+24-288)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c (+48-384)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c (+48-576)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c (+48-384)
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c (+48-576)
- (modified) clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c (+144-1440)
- (modified) clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c (+40-104)
- (modified) clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_x2.c (+160-416)


``````````diff
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 3d1138b7773853..85c6065b960b8b 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -9845,6 +9845,22 @@ Value *CodeGenFunction::EmitSVEPredicateCast(Value *Pred,
   return C;
 }
 
+Value *CodeGenFunction::EmitSVEPredicateTupleCast(Value *PredTuple,
+                                                  llvm::StructType *Ty) {
+  if (PredTuple->getType() == Ty)
+    return PredTuple;
+
+  Value *Ret = llvm::PoisonValue::get(Ty);
+  for (unsigned I = 0; I < Ty->getNumElements(); ++I) {
+    Value *Pred = Builder.CreateExtractValue(PredTuple, I);
+    Pred = EmitSVEPredicateCast(
+        Pred, cast<llvm::ScalableVectorType>(Ty->getTypeAtIndex(I)));
+    Ret = Builder.CreateInsertValue(Ret, Pred, I);
+  }
+
+  return Ret;
+}
+
 Value *CodeGenFunction::EmitSVEGatherLoad(const SVETypeFlags &TypeFlags,
                                           SmallVectorImpl<Value *> &Ops,
                                           unsigned IntID) {
@@ -10351,41 +10367,6 @@ Value *CodeGenFunction::EmitSVETupleCreate(const SVETypeFlags &TypeFlags,
   return Tuple;
 }
 
-Value *CodeGenFunction::FormSVEBuiltinResult(Value *Call) {
-  // Multi-vector results should be broken up into a single (wide) result
-  // vector.
-  auto *StructTy = dyn_cast<StructType>(Call->getType());
-  if (!StructTy)
-    return Call;
-
-  auto *VTy = dyn_cast<ScalableVectorType>(StructTy->getTypeAtIndex(0U));
-  if (!VTy)
-    return Call;
-  unsigned N = StructTy->getNumElements();
-
-  // We may need to emit a cast to a svbool_t
-  bool IsPredTy = VTy->getElementType()->isIntegerTy(1);
-  unsigned MinElts = IsPredTy ? 16 : VTy->getMinNumElements();
-
-  ScalableVectorType *WideVTy =
-      ScalableVectorType::get(VTy->getElementType(), MinElts * N);
-  Value *Ret = llvm::PoisonValue::get(WideVTy);
-  for (unsigned I = 0; I < N; ++I) {
-    Value *SRet = Builder.CreateExtractValue(Call, I);
-    assert(SRet->getType() == VTy && "Unexpected type for result value");
-    Value *Idx = ConstantInt::get(CGM.Int64Ty, I * MinElts);
-
-    if (IsPredTy)
-      SRet = EmitSVEPredicateCast(
-          SRet, ScalableVectorType::get(Builder.getInt1Ty(), 16));
-
-    Ret = Builder.CreateInsertVector(WideVTy, Ret, SRet, Idx);
-  }
-  Call = Ret;
-
-  return Call;
-}
-
 void CodeGenFunction::GetAArch64SVEProcessedOperands(
     unsigned BuiltinID, const CallExpr *E, SmallVectorImpl<Value *> &Ops,
     SVETypeFlags TypeFlags) {
@@ -10516,12 +10497,16 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
                                    getSVEOverloadTypes(TypeFlags, Ty, Ops));
     Value *Call = Builder.CreateCall(F, Ops);
 
+    if (Call->getType() == Ty)
+      return Call;
+
     // Predicate results must be converted to svbool_t.
-    if (auto PredTy = dyn_cast<llvm::VectorType>(Call->getType()))
-      if (PredTy->getScalarType()->isIntegerTy(1))
-        Call = EmitSVEPredicateCast(Call, cast<llvm::ScalableVectorType>(Ty));
+    if (auto PredTy = dyn_cast<llvm::ScalableVectorType>(Ty))
+      return EmitSVEPredicateCast(Call, PredTy);
+    if (auto PredTupleTy = dyn_cast<llvm::StructType>(Ty))
+      return EmitSVEPredicateTupleCast(Call, PredTupleTy);
 
-    return FormSVEBuiltinResult(Call);
+    llvm_unreachable("unsupported element count!");
   }
 
   switch (BuiltinID) {
@@ -10853,9 +10838,8 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
       TypeFlags.isOverloadNone()
           ? CGM.getIntrinsic(Builtin->LLVMIntrinsic)
           : CGM.getIntrinsic(Builtin->LLVMIntrinsic, {getSVEType(TypeFlags)});
-  Value *Call = Builder.CreateCall(F, Ops);
 
-  return FormSVEBuiltinResult(Call);
+  return Builder.CreateCall(F, Ops);
 }
 
 Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 2df17e83bae2ee..c55ee4a28a6a99 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4645,6 +4645,8 @@ class CodeGenFunction : public CodeGenTypeCache {
                                           unsigned BuiltinID);
   llvm::Value *EmitSVEPredicateCast(llvm::Value *Pred,
                                     llvm::ScalableVectorType *VTy);
+  llvm::Value *EmitSVEPredicateTupleCast(llvm::Value *PredTuple,
+                                         llvm::StructType *Ty);
   llvm::Value *EmitSVEGatherLoad(const SVETypeFlags &TypeFlags,
                                  llvm::SmallVectorImpl<llvm::Value *> &Ops,
                                  unsigned IntID);
@@ -4669,12 +4671,6 @@ class CodeGenFunction : public CodeGenTypeCache {
   llvm::Value *EmitSVEStructStore(const SVETypeFlags &TypeFlags,
                                   SmallVectorImpl<llvm::Value *> &Ops,
                                   unsigned IntID);
-  /// FormSVEBuiltinResult - Returns the struct of scalable vectors as a wider
-  /// vector. It extracts the scalable vector from the struct and inserts into
-  /// the wider vector. This avoids the error when allocating space in llvm
-  /// for struct of scalable vectors if a function returns struct.
-  llvm::Value *FormSVEBuiltinResult(llvm::Value *Call);
-
   llvm::Value *EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
 
   llvm::Value *EmitSMELd1St1(const SVETypeFlags &TypeFlags,
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
index 9c639984305d1b..1297185c4b50e0 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
@@ -24,27 +24,13 @@
 
 // CHECK-LABEL: @test_svclamp_single_s8_x2(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.sclamp.single.x2.nxv16i8(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], <vscale x 16 x i8> [[OP3:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CHECK-NEXT: store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z25test_svclamp_single_s8_x210svint8x2_tu10__SVInt8_tS0_(
 // CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.sclamp.single.x2.nxv16i8(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], <vscale x 16 x i8> [[OP3:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CPP-CHECK-NEXT: store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 svint8x2_t test_svclamp_single_s8_x2(svint8x2_t op1, svint8_t op2, svint8_t op3) __arm_streaming {
   return SVE_ACLE_FUNC(svclamp, _single_s8_x2, , )(op1, op2, op3);
@@ -52,27 +38,13 @@ svint8x2_t test_svclamp_single_s8_x2(svint8x2_t op1, svint8_t op2, svint8_t op3)
 
 // CHECK-LABEL: @test_svclamp_single_s16_x2(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 8 x i16>, <vscale x 8 x i16> }, align 16
 // CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.sclamp.single.x2.nxv8i16(<vscale x 8 x i16> [[OP1_COERCE0:%.*]], <vscale x 8 x i16> [[OP1_COERCE1:%.*]], <vscale x 8 x i16> [[OP2:%.*]], <vscale x 8 x i16> [[OP3:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
-// CHECK-NEXT: store <vscale x 16 x i16> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 8 x i16>, <vscale x 8 x i16> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP5]]
+// CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z26test_svclamp_single_s16_x211svint16x2_tu11__SVInt16_tS0_(
 // CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 8 x i16>, <vscale x 8 x i16> }, align 16
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.sclamp.single.x2.nxv8i16(<vscale x 8 x i16> [[OP1_COERCE0:%.*]], <vscale x 8 x i16> [[OP1_COERCE1:%.*]], <vscale x 8 x i16> [[OP2:%.*]], <vscale x 8 x i16> [[OP3:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
-// CPP-CHECK-NEXT: store <vscale x 16 x i16> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 8 x i16>, <vscale x 8 x i16> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP5]]
+// CPP-CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
 //
 svint16x2_t test_svclamp_single_s16_x2(svint16x2_t op1, svint16_t op2, svint16_t op3) __arm_streaming {
   return SVE_ACLE_FUNC(svclamp, _single_s16_x2, , )(op1, op2, op3);
@@ -80,27 +52,13 @@ svint16x2_t test_svclamp_single_s16_x2(svint16x2_t op1, svint16_t op2, svint16_t
 
 // CHECK-LABEL: @test_svclamp_single_s32_x2(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 4 x i32>, <vscale x 4 x i32> }, align 16
 // CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.sclamp.single.x2.nxv4i32(<vscale x 4 x i32> [[OP1_COERCE0:%.*]], <vscale x 4 x i32> [[OP1_COERCE1:%.*]], <vscale x 4 x i32> [[OP2:%.*]], <vscale x 4 x i32> [[OP3:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
-// CHECK-NEXT: store <vscale x 8 x i32> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 4 x i32>, <vscale x 4 x i32> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP5]]
+// CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z26test_svclamp_single_s32_x211svint32x2_tu11__SVInt32_tS0_(
 // CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 4 x i32>, <vscale x 4 x i32> }, align 16
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.sclamp.single.x2.nxv4i32(<vscale x 4 x i32> [[OP1_COERCE0:%.*]], <vscale x 4 x i32> [[OP1_COERCE1:%.*]], <vscale x 4 x i32> [[OP2:%.*]], <vscale x 4 x i32> [[OP3:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
-// CPP-CHECK-NEXT: store <vscale x 8 x i32> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 4 x i32>, <vscale x 4 x i32> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP5]]
+// CPP-CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]]
 //
 svint32x2_t test_svclamp_single_s32_x2(svint32x2_t op1, svint32_t op2, svint32_t op3) __arm_streaming {
   return SVE_ACLE_FUNC(svclamp, _single_s32_x2, , )(op1, op2, op3);
@@ -108,27 +66,13 @@ svint32x2_t test_svclamp_single_s32_x2(svint32x2_t op1, svint32_t op2, svint32_t
 
 // CHECK-LABEL: @test_svclamp_single_s64_x2(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 2 x i64>, <vscale x 2 x i64> }, align 16
 // CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.sclamp.single.x2.nxv2i64(<vscale x 2 x i64> [[OP1_COERCE0:%.*]], <vscale x 2 x i64> [[OP1_COERCE1:%.*]], <vscale x 2 x i64> [[OP2:%.*]], <vscale x 2 x i64> [[OP3:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
-// CHECK-NEXT: store <vscale x 4 x i64> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 2 x i64>, <vscale x 2 x i64> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP5]]
+// CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z26test_svclamp_single_s64_x211svint64x2_tu11__SVInt64_tS0_(
 // CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 2 x i64>, <vscale x 2 x i64> }, align 16
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.sclamp.single.x2.nxv2i64(<vscale x 2 x i64> [[OP1_COERCE0:%.*]], <vscale x 2 x i64> [[OP1_COERCE1:%.*]], <vscale x 2 x i64> [[OP2:%.*]], <vscale x 2 x i64> [[OP3:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
-// CPP-CHECK-NEXT: store <vscale x 4 x i64> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 2 x i64>, <vscale x 2 x i64> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP5]]
+// CPP-CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
 //
 svint64x2_t test_svclamp_single_s64_x2(svint64x2_t op1, svint64_t op2, svint64_t op3) __arm_streaming {
   return SVE_ACLE_FUNC(svclamp, _single_s64_x2, , )(op1, op2, op3);
@@ -139,35 +83,13 @@ svint64x2_t test_svclamp_single_s64_x2(svint64x2_t op1, svint64_t op2, svint64_t
 
 // CHECK-LABEL: @test_svclamp_single_s8_x4(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.sclamp.single.x4.nxv16i8(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], <vscale x 16 x i8> [[OP3:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
-// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
-// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
-// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
-// CHECK-NEXT: store <vscale x 64 x i8> [[TMP8]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: [[TMP9:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP9]]
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z25test_svclamp_single_s8_x410svint8x4_tu10__SVInt8_tS0_(
 // CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.sclamp.single.x4.nxv16i8(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE... [truncated]
``````````

</details>

https://github.com/llvm/llvm-project/pull/109423
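For context, a minimal sketch of the kind of source that exercises the fixed codepath, distilled from the `acle_sme2_clamp.c` tests above (hypothetical standalone file; assumes a Clang build with SME2 enabled):

```c
// Sketch of a reproducer: any SME2/SVE2p1 builtin returning a vector tuple
// was affected. Before this patch, the builtin's
// { <vscale x 16 x i8>, <vscale x 16 x i8> } result was widened into a
// single <vscale x 32 x i8>, stored to a stack slot ([[RETVAL]] in the old
// CHECK lines), and reloaded as a struct; with the patch, the intrinsic
// call's struct result is returned directly.
#include <arm_sme.h>

svint8x2_t clamp_x2(svint8x2_t op1, svint8_t op2, svint8_t op3)
    __arm_streaming {
  // Lowers to @llvm.aarch64.sve.sclamp.single.x2.nxv16i8, whose return type
  // is the two-element scalable-vector struct shown in the tests.
  return svclamp_single_s8_x2(op1, op2, op3);
}
```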