llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-x86 Author: Ahmed Nour (ahmednoursphinx) <details> <summary>Changes</summary> This PR Resolves https://github.com/llvm/llvm-project/issues/165161 --- Full diff: https://github.com/llvm/llvm-project/pull/165513.diff 4 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsX86.td (+4-1) - (modified) clang/lib/AST/ByteCode/InterpBuiltin.cpp (+36-4) - (modified) clang/lib/AST/ExprConstant.cpp (+35-4) - (modified) clang/test/CodeGen/X86/sse41-builtins.c (+10) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 0c85e280e748b..a431fc36b41c1 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -327,8 +327,11 @@ let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorW } } -let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "sse4.1", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">; +} + +let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">; def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">; def roundsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">; diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 8f23001ea5a39..b1f0832860476 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -3358,7 +3358,8 @@ static bool interp__builtin_x86_byteshift( static bool interp__builtin_ia32_shuffle_generic( InterpState &S, CodePtr OpPC, const CallExpr *Call, llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)> - GetSourceIndex) { + GetSourceIndex, + llvm::function_ref<bool(unsigned, unsigned)> ShouldZero = nullptr) { assert(Call->getNumArgs() == 3); unsigned ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue(); @@ -3373,9 +3374,20 @@ static bool interp__builtin_ia32_shuffle_generic( const Pointer &Dst = S.Stk.peek<Pointer>(); for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) { - auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask); - const Pointer &Src = (SrcVecIdx == 0) ? A : B; - TYPE_SWITCH(ElemT, { Dst.elem<T>(DstIdx) = Src.elem<T>(SrcIdx); }); + if (ShouldZero && ShouldZero(DstIdx, ShuffleMask)) { + // Zero out this element + if (ElemT == PT_Float) { + Dst.elem<Floating>(DstIdx) = Floating(S.getASTContext().getFloatTypeSemantics(VecT->getElementType())); + } else { + INT_TYPE_SWITCH_NO_BOOL(ElemT, { + Dst.elem<T>(DstIdx) = T::from(0); + }); + } + } else { + auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask); + const Pointer &Src = (SrcVecIdx == 0) ? A : B; + TYPE_SWITCH(ElemT, { Dst.elem<T>(DstIdx) = Src.elem<T>(SrcIdx); }); + } } Dst.initializeAllElements(); @@ -4348,6 +4360,26 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, unsigned Index = (ShuffleMask >> BitIndex) & IndexMask; return std::pair<unsigned, unsigned>{SrcIdx, LaneOffset + Index}; }); + case X86::BI__builtin_ia32_insertps128: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, + [](unsigned DstIdx, unsigned Mask) { + // Bits [7:6]: select element from source vector Y (0-3) + // Bits [5:4]: select destination position (0-3) + unsigned SrcElem = (Mask >> 6) & 0x3; + unsigned DstElem = (Mask >> 4) & 0x3; + if (DstIdx == DstElem) { + // Insert element from source vector (B) at this position + return std::pair<unsigned, unsigned>{1, SrcElem}; + } else { + // Copy from destination vector (A) + return std::pair<unsigned, unsigned>{0, DstIdx}; + } + }, + [](unsigned DstIdx, unsigned Mask) { + // Bits [3:0]: zero mask + return (Mask & (1 << DstIdx)) != 0; + }); case X86::BI__builtin_ia32_pshufb128: case X86::BI__builtin_ia32_pshufb256: case X86::BI__builtin_ia32_pshufb512: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 29ee089505125..17c966b8c9f4c 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11622,7 +11622,8 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result, static bool evalShuffleGeneric( EvalInfo &Info, const CallExpr *Call, APValue &Out, llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)> - GetSourceIndex) { + GetSourceIndex, + llvm::function_ref<bool(unsigned, unsigned)> ShouldZero = nullptr) { const auto *VT = Call->getType()->getAs<VectorType>(); if (!VT) @@ -11643,9 +11644,15 @@ static bool evalShuffleGeneric( ResultElements.reserve(NumElts); for (unsigned DstIdx = 0; DstIdx != NumElts; ++DstIdx) { - auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask); - const APValue &Src = (SrcVecIdx == 0) ? A : B; - ResultElements.push_back(Src.getVectorElt(SrcIdx)); + if (ShouldZero && ShouldZero(DstIdx, ShuffleMask)) { + // Zero out this element + QualType ElemTy = VT->getElementType(); + ResultElements.push_back(APValue(APFloat::getZero(Info.Ctx.getFloatTypeSemantics(ElemTy)))); + } else { + auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask); + const APValue &Src = (SrcVecIdx == 0) ? A : B; + ResultElements.push_back(Src.getVectorElt(SrcIdx)); + } } Out = APValue(ResultElements.data(), ResultElements.size()); @@ -12481,6 +12488,30 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return false; return Success(R, E); } + case X86::BI__builtin_ia32_insertps128: { + APValue R; + if (!evalShuffleGeneric( + Info, E, R, + [](unsigned DstIdx, unsigned Mask) -> std::pair<unsigned, unsigned> { + // Bits [7:6]: select element from source vector Y (0-3) + // Bits [5:4]: select destination position (0-3) + unsigned SrcElem = (Mask >> 6) & 0x3; + unsigned DstElem = (Mask >> 4) & 0x3; + if (DstIdx == DstElem) { + // Insert element from source vector (B) at this position + return {1, SrcElem}; + } else { + // Copy from destination vector (A) + return {0, DstIdx}; + } + }, + [](unsigned DstIdx, unsigned Mask) -> bool { + // Bits [3:0]: zero mask + return (Mask & (1 << DstIdx)) != 0; + })) + return false; + return Success(R, E); + } case X86::BI__builtin_ia32_pshufb128: case X86::BI__builtin_ia32_pshufb256: case X86::BI__builtin_ia32_pshufb512: { diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c index 62cd392824bb2..35fa65a99836b 100644 --- a/clang/test/CodeGen/X86/sse41-builtins.c +++ b/clang/test/CodeGen/X86/sse41-builtins.c @@ -307,6 +307,16 @@ __m128 test_mm_insert_ps(__m128 x, __m128 y) { return _mm_insert_ps(x, y, 4); } +TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x10), 1.0f, 10.0f, 3.0f, 4.0f))); // Insert Y[0] into X[1] +TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x00), 10.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[0] into X[0] +TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x20), 1.0f, 2.0f, 10.0f, 4.0f))); // Insert Y[0] into X[2] +TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x30), 1.0f, 2.0f, 3.0f, 10.0f))); // Insert Y[0] into X[3] +TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x80), 30.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[2] into X[0] +TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x01), 0.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[0] into X[0], zero X[0] +TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x0A), 10.0f, 0.0f, 3.0f, 0.0f))); // Insert Y[0] into X[0], zero X[1] and X[3] +TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x0F), 0.0f, 0.0f, 0.0f, 0.0f))); // Insert Y[0] into X[0], zero all +TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0xCF), 0.0f, 0.0f, 0.0f, 0.0f))); // Insert Y[3] into X[0], zero all + __m128i test_mm_max_epi8(__m128i x, __m128i y) { // CHECK-LABEL: test_mm_max_epi8 // CHECK: call <16 x i8> @llvm.smax.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) `````````` </details> https://github.com/llvm/llvm-project/pull/165513 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
