https://github.com/chaitanyav updated https://github.com/llvm/llvm-project/pull/168995
>From 4a5b92c667bc8a2a2346463bd34041c280213aee Mon Sep 17 00:00:00 2001 From: NagaChaitanya Vellanki <[email protected]> Date: Thu, 20 Nov 2025 16:42:22 -0800 Subject: [PATCH] [Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - Allow AVX512 VPMULTISHIFTQB intrinsics to be used in constexpr Resolves:#167477 --- clang/include/clang/Basic/BuiltinsX86.td | 6 +- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 63 +++++++++ clang/lib/AST/ExprConstant.cpp | 45 ++++++ clang/lib/Headers/avx512vbmiintrin.h | 38 +++-- clang/lib/Headers/avx512vbmivlintrin.h | 72 +++++----- clang/test/CodeGen/X86/avx512vbmi-builtins.c | 133 +++++++++++++++++- clang/test/CodeGen/X86/avx512vbmivl-builtin.c | 94 ++++++++++++- 7 files changed, 378 insertions(+), 73 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index b760c3e06b8f7..93fb511a508f3 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -3358,15 +3358,15 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128> def cvtusi2ss32 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, unsigned int, _Constant int)">; } -let Features = "avx512vbmi", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512vbmi", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def vpmultishiftqb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">; } -let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def vpmultishiftqb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">; } -let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def vpmultishiftqb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">; } diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index eba71d66bc4d6..5be31239fd597 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -3468,6 +3468,65 @@ static bool interp__builtin_ia32_shuffle_generic( return true; } +static bool interp__builtin_ia32_multishiftqb(InterpState &S, CodePtr OpPC, + const CallExpr *Call) { + assert(Call->getNumArgs() == 2); + + QualType ATy = Call->getArg(0)->getType(); + QualType BTy = Call->getArg(1)->getType(); + if (!ATy->isVectorType() || !BTy->isVectorType()) { + return false; + } + + const Pointer &BPtr = S.Stk.pop<Pointer>(); + const Pointer &APtr = S.Stk.pop<Pointer>(); + const auto *AVecT = ATy->castAs<VectorType>(); + const auto *BVecT = BTy->castAs<VectorType>(); + assert(AVecT->getNumElements() == BVecT->getNumElements()); + + PrimType ElemT = *S.getContext().classify(AVecT->getElementType()); + + unsigned NumBytesInQWord = 8; + unsigned NumBitsInByte = 8; + unsigned NumBytes = AVecT->getNumElements(); + unsigned NumQWords = NumBytes / NumBytesInQWord; + const Pointer &Dst = S.Stk.peek<Pointer>(); + + for (unsigned QWordId = 0; QWordId != NumQWords; ++QWordId) { + APInt AQWord(64, 0); + APInt BQWord(64, 0); + for (unsigned ByteIdx = 0; ByteIdx != NumBytesInQWord; ++ByteIdx) { + unsigned Idx = QWordId * NumBytesInQWord + ByteIdx; + uint64_t Byte = 0; + INT_TYPE_SWITCH(ElemT, { + Byte = static_cast<uint64_t>(APtr.elem<T>(Idx)); + AQWord.insertBits(APInt(8, Byte & 0xFF), ByteIdx * NumBitsInByte); + + Byte = static_cast<uint64_t>(BPtr.elem<T>(Idx)); + BQWord.insertBits(APInt(8, Byte & 0xFF), ByteIdx * NumBitsInByte); + }); + } + + for (unsigned ByteIdx = 0; ByteIdx != NumBytesInQWord; ++ByteIdx) { + uint64_t Ctrl = + AQWord.extractBits(8, ByteIdx * NumBitsInByte).getZExtValue() & 0x3F; + + APInt Byte(8, 0); + for (unsigned BitIdx = 0; BitIdx != NumBitsInByte; ++BitIdx) { + Byte.insertBits(BQWord.extractBits(1, (Ctrl + BitIdx) & 0x3F), BitIdx); + } + INT_TYPE_SWITCH(ElemT, { + Dst.elem<T>(QWordId * NumBytesInQWord + ByteIdx) = + T::from(Byte.getZExtValue()); + }); + } + } + + Dst.initializeAllElements(); + + return true; +} + bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, uint32_t BuiltinID) { if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID)) @@ -4669,6 +4728,10 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return std::make_pair(0, static_cast<int>(LaneOffset + Index)); }); + case X86::BI__builtin_ia32_vpmultishiftqb128: + case X86::BI__builtin_ia32_vpmultishiftqb256: + case X86::BI__builtin_ia32_vpmultishiftqb512: + return interp__builtin_ia32_multishiftqb(S, OpPC, Call); case X86::BI__builtin_ia32_kandqi: case X86::BI__builtin_ia32_kandhi: case X86::BI__builtin_ia32_kandsi: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index ce5301f17b3e7..21a664fefde49 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -13062,6 +13062,51 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(R, E); } + case X86::BI__builtin_ia32_vpmultishiftqb128: + case X86::BI__builtin_ia32_vpmultishiftqb256: + case X86::BI__builtin_ia32_vpmultishiftqb512: { + assert(E->getNumArgs() == 2); + + APValue A, B; + if (!Evaluate(A, Info, E->getArg(0)) || !Evaluate(B, Info, E->getArg(1))) + return false; + + assert(A.getVectorLength() == B.getVectorLength()); + unsigned NumBytesInQWord = 8; + unsigned NumBitsInByte = 8; + unsigned NumBytes = A.getVectorLength(); + unsigned NumQWords = NumBytes / NumBytesInQWord; + SmallVector<APValue, 64> Result; + Result.reserve(NumBytes); + + for (unsigned QWordId = 0; QWordId != NumQWords; ++QWordId) { + APInt AQWord(64, 0); + APInt BQWord(64, 0); + for (unsigned ByteIdx = 0; ByteIdx != NumBytesInQWord; ++ByteIdx) { + unsigned Idx = QWordId * NumBytesInQWord + ByteIdx; + uint64_t Byte = A.getVectorElt(Idx).getInt().getZExtValue(); + AQWord.insertBits(APInt(8, Byte & 0xFF), ByteIdx * NumBitsInByte); + + Byte = B.getVectorElt(Idx).getInt().getZExtValue(); + BQWord.insertBits(APInt(8, Byte & 0xFF), ByteIdx * NumBitsInByte); + } + + for (unsigned ByteIdx = 0; ByteIdx != NumBytesInQWord; ++ByteIdx) { + uint64_t Ctrl = + AQWord.extractBits(8, ByteIdx * NumBitsInByte).getZExtValue() & + 0x3F; + + APInt Byte(8, 0); + for (unsigned BitIdx = 0; BitIdx != NumBitsInByte; ++BitIdx) { + Byte.insertBits(BQWord.extractBits(1, (Ctrl + BitIdx) & 0x3F), + BitIdx); + } + Result.push_back(APValue(APSInt(Byte, /*isUnsigned*/ true))); + } + } + return Success(APValue(Result.data(), Result.size()), E); + } + case X86::BI__builtin_ia32_phminposuw128: { APValue Source; if (!Evaluate(Source, Info, E->getArg(0))) diff --git a/clang/lib/Headers/avx512vbmiintrin.h b/clang/lib/Headers/avx512vbmiintrin.h index 84fda5c5849e8..5ac78f0849c26 100644 --- a/clang/lib/Headers/avx512vbmiintrin.h +++ b/clang/lib/Headers/avx512vbmiintrin.h @@ -15,61 +15,57 @@ #define __VBMIINTRIN_H /* Define the default attributes for the functions in this file. */ +#if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"), \ - __min_vector_width__(512))) - -#if defined(__cplusplus) && (__cplusplus >= 201103L) -#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr + __min_vector_width__(512))) constexpr #else -#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"), \ + __min_vector_width__(512))) #endif -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I, (__v64qi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR -_mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I, - __m512i __B) { +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_permutex2var_epi8( + __m512i __A, __mmask64 __U, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512(__U, (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), (__v64qi)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR -_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U, - __m512i __B) { +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask2_permutex2var_epi8( + __m512i __A, __m512i __I, __mmask64 __U, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512(__U, (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), (__v64qi)__I); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR -_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I, - __m512i __B) { +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_permutex2var_epi8( + __mmask64 __U, __m512i __A, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512(__U, (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_permutexvar_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_permutexvar_epi8(__mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, (__v64qi)_mm512_permutexvar_epi8(__A, __B), (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR -_mm512_mask_permutexvar_epi8(__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) { +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_permutexvar_epi8( + __m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, (__v64qi)_mm512_permutexvar_epi8(__A, __B), (__v64qi)__W); @@ -97,6 +93,6 @@ _mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y) (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y), (__v64qi)_mm512_setzero_si512()); } -#undef __DEFAULT_FN_ATTRS_CONSTEXPR + #undef __DEFAULT_FN_ATTRS #endif diff --git a/clang/lib/Headers/avx512vbmivlintrin.h b/clang/lib/Headers/avx512vbmivlintrin.h index 58a48dadff863..40a67bd63ca49 100644 --- a/clang/lib/Headers/avx512vbmivlintrin.h +++ b/clang/lib/Headers/avx512vbmivlintrin.h @@ -15,6 +15,16 @@ #define __VBMIVLINTRIN_H /* Define the default attributes for the functions in this file. */ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vbmi,avx512vl"), \ + __min_vector_width__(128))) constexpr +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vbmi,avx512vl"), \ + __min_vector_width__(256))) constexpr +#else #define __DEFAULT_FN_ATTRS128 \ __attribute__((__always_inline__, __nodebug__, \ __target__("avx512vbmi,avx512vl"), \ @@ -23,111 +33,96 @@ __attribute__((__always_inline__, __nodebug__, \ __target__("avx512vbmi,avx512vl"), \ __min_vector_width__(256))) - -#if defined(__cplusplus) && (__cplusplus >= 201103L) -#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr -#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr -#else -#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 -#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 #endif -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A, (__v16qi)__I, (__v16qi)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I, - __m128i __B) { +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_permutex2var_epi8( + __m128i __A, __mmask16 __U, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128(__U, (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), (__v16qi)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U, - __m128i __B) { +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask2_permutex2var_epi8( + __m128i __A, __m128i __I, __mmask16 __U, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128(__U, (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), (__v16qi)__I); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I, - __m128i __B) { +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_permutex2var_epi8( + __mmask16 __U, __m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128(__U, (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I, (__v32qi)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I, - __m256i __B) { +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_permutex2var_epi8( + __m256i __A, __mmask32 __U, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256(__U, (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), (__v32qi)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U, - __m256i __B) { +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask2_permutex2var_epi8( + __m256i __A, __m256i __I, __mmask32 __U, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256(__U, (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), (__v32qi)__I); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I, - __m256i __B) { +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_permutex2var_epi8( + __mmask32 __U, __m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256(__U, (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_permutexvar_epi8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_permutexvar_epi8(__mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, (__v16qi)_mm_permutexvar_epi8(__A, __B), (__v16qi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_mask_permutexvar_epi8(__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) { +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_permutexvar_epi8( + __m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, (__v16qi)_mm_permutexvar_epi8(__A, __B), (__v16qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_permutexvar_epi8(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_permutexvar_epi8(__mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, (__v32qi)_mm256_permutexvar_epi8(__A, __B), (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_permutexvar_epi8(__m256i __W, __mmask32 __M, __m256i __A, - __m256i __B) { +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_permutexvar_epi8( + __m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, (__v32qi)_mm256_permutexvar_epi8(__A, __B), (__v32qi)__W); @@ -179,9 +174,6 @@ _mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y) (__v32qi)_mm256_setzero_si256()); } -#undef __DEFAULT_FN_ATTRS128_CONSTEXPR -#undef __DEFAULT_FN_ATTRS256_CONSTEXPR #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 - #endif diff --git a/clang/test/CodeGen/X86/avx512vbmi-builtins.c b/clang/test/CodeGen/X86/avx512vbmi-builtins.c index 7d506db92faeb..fcce58b63737b 100644 --- a/clang/test/CodeGen/X86/avx512vbmi-builtins.c +++ b/clang/test/CodeGen/X86/avx512vbmi-builtins.c @@ -211,18 +211,145 @@ __m512i test_mm512_mask_multishift_epi64_epi8(__m512i __W, __mmask64 __M, __m512 // CHECK-LABEL: test_mm512_mask_multishift_epi64_epi8 // CHECK: call <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8> %{{.*}}, <64 x i8> %{{.*}}) // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} - return _mm512_mask_multishift_epi64_epi8(__W, __M, __X, __Y); + return _mm512_mask_multishift_epi64_epi8(__W, __M, __X, __Y); } +TEST_CONSTEXPR(match_v64qu( + _mm512_mask_multishift_epi64_epi8( + (__m512i)(__v64qu){ + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}, + 0xAAAAAAAAAAAAAAAAULL, + (__m512i)(__v64qu){ + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56}, + (__m512i)(__v64qu){ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, + 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, + 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, + 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, + 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, + 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78}), + 0xFF, 0x02, 0xFF, 0x04, 0xFF, 0x06, 0xFF, 0x08, + 0xFF, 0x12, 0xFF, 0x14, 0xFF, 0x16, 0xFF, 0x18, + 0xFF, 0x22, 0xFF, 0x24, 0xFF, 0x26, 0xFF, 0x28, + 0xFF, 0x32, 0xFF, 0x34, 0xFF, 0x36, 0xFF, 0x38, + 0xFF, 0x42, 0xFF, 0x44, 0xFF, 0x46, 0xFF, 0x48, + 0xFF, 0x52, 0xFF, 0x54, 0xFF, 0x56, 0xFF, 0x58, + 0xFF, 0x62, 0xFF, 0x64, 0xFF, 0x66, 0xFF, 0x68, + 0xFF, 0x72, 0xFF, 0x74, 0xFF, 0x76, 0xFF, 0x78)); + __m512i test_mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y) { // CHECK-LABEL: test_mm512_maskz_multishift_epi64_epi8 // CHECK: call <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8> %{{.*}}, <64 x i8> %{{.*}}) // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} - return _mm512_maskz_multishift_epi64_epi8(__M, __X, __Y); + return _mm512_maskz_multishift_epi64_epi8(__M, __X, __Y); } +TEST_CONSTEXPR(match_v64qu( + _mm512_maskz_multishift_epi64_epi8( + 0x5555555555555555ULL, + (__m512i)(__v64qu){ + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56}, + (__m512i)(__v64qu){ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, + 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, + 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, + 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, + 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, + 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78}), + 0x01, 0, 0x03, 0, 0x05, 0, 0x07, 0, + 0x11, 0, 0x13, 0, 0x15, 0, 0x17, 0, + 0x21, 0, 0x23, 0, 0x25, 0, 0x27, 0, + 0x31, 0, 0x33, 0, 0x35, 0, 0x37, 0, + 0x41, 0, 0x43, 0, 0x45, 0, 0x47, 0, + 0x51, 0, 0x53, 0, 0x55, 0, 0x57, 0, + 0x61, 0, 0x63, 0, 0x65, 0, 0x67, 0, + 0x71, 0, 0x73, 0, 0x75, 0, 0x77, 0)); + __m512i test_mm512_multishift_epi64_epi8(__m512i __X, __m512i __Y) { // CHECK-LABEL: test_mm512_multishift_epi64_epi8 // CHECK: call <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8> %{{.*}}, <64 x i8> %{{.*}}) - return _mm512_multishift_epi64_epi8(__X, __Y); + return _mm512_multishift_epi64_epi8(__X, __Y); } + +TEST_CONSTEXPR(match_v64qu( + _mm512_multishift_epi64_epi8( + (__m512i)(__v64qu){ + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56}, + (__m512i)(__v64qu){ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, + 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, + 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, + 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, + 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, + 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78}), + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, + 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, + 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, + 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, + 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, + 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78)); + +TEST_CONSTEXPR(match_v64qu( + _mm512_multishift_epi64_epi8( + (__m512i)(__v64qu){ + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4}, + (__m512i)(__v64qu){ + 0x10, 0x32, 0x54, 0x76, 0x98, 0xBA, 0xDC, 0xFE, + 0x10, 0x32, 0x54, 0x76, 0x98, 0xBA, 0xDC, 0xFE, + 0x10, 0x32, 0x54, 0x76, 0x98, 0xBA, 0xDC, 0xFE, + 0x10, 0x32, 0x54, 0x76, 0x98, 0xBA, 0xDC, 0xFE, + 0x10, 0x32, 0x54, 0x76, 0x98, 0xBA, 0xDC, 0xFE, + 0x10, 0x32, 0x54, 0x76, 0x98, 0xBA, 0xDC, 0xFE, + 0x10, 0x32, 0x54, 0x76, 0x98, 0xBA, 0xDC, 0xFE, + 0x10, 0x32, 0x54, 0x76, 0x98, 0xBA, 0xDC, 0xFE}), + 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, + 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, + 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, + 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, + 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, + 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, + 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, + 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21)); diff --git a/clang/test/CodeGen/X86/avx512vbmivl-builtin.c b/clang/test/CodeGen/X86/avx512vbmivl-builtin.c index 49b7a1a721195..f55ab4154c752 100644 --- a/clang/test/CodeGen/X86/avx512vbmivl-builtin.c +++ b/clang/test/CodeGen/X86/avx512vbmivl-builtin.c @@ -166,39 +166,121 @@ __m128i test_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i _ // CHECK-LABEL: test_mm_mask_multishift_epi64_epi8 // CHECK: call <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} - return _mm_mask_multishift_epi64_epi8(__W, __M, __X, __Y); + return _mm_mask_multishift_epi64_epi8(__W, __M, __X, __Y); } +TEST_CONSTEXPR(match_v16qu( + _mm_mask_multishift_epi64_epi8( + (__m128i)(__v16qu){0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}, + 0xAAAA, + (__m128i)(__v16qu){0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56}, + (__m128i)(__v16qu){0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18}), + 0xFF, 0x02, 0xFF, 0x04, 0xFF, 0x06, 0xFF, 0x08, + 0xFF, 0x12, 0xFF, 0x14, 0xFF, 0x16, 0xFF, 0x18)); + __m128i test_mm_maskz_multishift_epi64_epi8(__mmask16 __M, __m128i __X, __m128i __Y) { // CHECK-LABEL: test_mm_maskz_multishift_epi64_epi8 // CHECK: call <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} - return _mm_maskz_multishift_epi64_epi8(__M, __X, __Y); + return _mm_maskz_multishift_epi64_epi8(__M, __X, __Y); } +TEST_CONSTEXPR(match_v16qu( + _mm_maskz_multishift_epi64_epi8( + 0x5555, + (__m128i)(__v16qu){0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56}, + (__m128i)(__v16qu){0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18}), + 0x01, 0, 0x03, 0, 0x05, 0, 0x07, 0, + 0x11, 0, 0x13, 0, 0x15, 0, 0x17, 0)); + __m128i test_mm_multishift_epi64_epi8(__m128i __X, __m128i __Y) { // CHECK-LABEL: test_mm_multishift_epi64_epi8 // CHECK: call <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) - return _mm_multishift_epi64_epi8(__X, __Y); + return _mm_multishift_epi64_epi8(__X, __Y); } +TEST_CONSTEXPR(match_v16qu( + _mm_multishift_epi64_epi8( + (__m128i)(__v16qu){0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56}, + (__m128i)(__v16qu){0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18}), + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18)); + __m256i test_mm256_mask_multishift_epi64_epi8(__m256i __W, __mmask32 __M, __m256i __X, __m256i __Y) { // CHECK-LABEL: test_mm256_mask_multishift_epi64_epi8 // CHECK: call <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} - return _mm256_mask_multishift_epi64_epi8(__W, __M, __X, __Y); + return _mm256_mask_multishift_epi64_epi8(__W, __M, __X, __Y); } +TEST_CONSTEXPR(match_v32qu( + _mm256_mask_multishift_epi64_epi8( + (__m256i)(__v32qu){0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}, + 0xAAAAAAAA, + (__m256i)(__v32qu){0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56}, + (__m256i)(__v32qu){0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, + 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38}), + 0xFF, 0x02, 0xFF, 0x04, 0xFF, 0x06, 0xFF, 0x08, + 0xFF, 0x12, 0xFF, 0x14, 0xFF, 0x16, 0xFF, 0x18, + 0xFF, 0x22, 0xFF, 0x24, 0xFF, 0x26, 0xFF, 0x28, + 0xFF, 0x32, 0xFF, 0x34, 0xFF, 0x36, 0xFF, 0x38)); + __m256i test_mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y) { // CHECK-LABEL: test_mm256_maskz_multishift_epi64_epi8 // CHECK: call <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} - return _mm256_maskz_multishift_epi64_epi8(__M, __X, __Y); + return _mm256_maskz_multishift_epi64_epi8(__M, __X, __Y); } +TEST_CONSTEXPR(match_v32qu( + _mm256_maskz_multishift_epi64_epi8( + 0x55555555, + (__m256i)(__v32qu){0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56}, + (__m256i)(__v32qu){0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, + 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38}), + 0x01, 0, 0x03, 0, 0x05, 0, 0x07, 0, + 0x11, 0, 0x13, 0, 0x15, 0, 0x17, 0, + 0x21, 0, 0x23, 0, 0x25, 0, 0x27, 0, + 0x31, 0, 0x33, 0, 0x35, 0, 0x37, 0)); + __m256i test_mm256_multishift_epi64_epi8(__m256i __X, __m256i __Y) { // CHECK-LABEL: test_mm256_multishift_epi64_epi8 // CHECK: call <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}) - return _mm256_multishift_epi64_epi8(__X, __Y); + return _mm256_multishift_epi64_epi8(__X, __Y); } +TEST_CONSTEXPR(match_v32qu( + _mm256_multishift_epi64_epi8( + (__m256i)(__v32qu){0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56, + 0, 8, 16, 24, 32, 40, 48, 56}, + (__m256i)(__v32qu){0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, + 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38}), + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, + 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38)); + _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
