mike.dvoretsky created this revision. mike.dvoretsky added a reviewer: craig.topper. Herald added a subscriber: cfe-commits.
This patch lowers the _mm[256|512]_cvtepi{64|32|16}_epi{32|16|8} intrinsics to native IR in cases where the result's length is less than 128 bits. The resulting IR is folded into VPMOV instructions by https://reviews.llvm.org/D46957, with the exception of _mm_cvtepi64_epi8, where a PSHUFB instruction is currently produced instead.

Repository:
  rC Clang

https://reviews.llvm.org/D48712

Files:
  clang/lib/Headers/avx512vlbwintrin.h
  clang/lib/Headers/avx512vlintrin.h
  clang/test/CodeGen/avx512vl-builtins.c
  clang/test/CodeGen/avx512vlbw-builtins.c

Index: clang/test/CodeGen/avx512vlbw-builtins.c =================================================================== --- clang/test/CodeGen/avx512vlbw-builtins.c +++ clang/test/CodeGen/avx512vlbw-builtins.c @@ -1792,7 +1792,8 @@ __m128i test_mm_cvtepi16_epi8(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi16_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.wb.128 + // CHECK: trunc <8 x i16> %{{.*}} to <8 x i8> + // CHECK: shufflevector <8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> return _mm_cvtepi16_epi8(__A); } Index: clang/test/CodeGen/avx512vl-builtins.c =================================================================== --- clang/test/CodeGen/avx512vl-builtins.c +++ clang/test/CodeGen/avx512vl-builtins.c @@ -6974,7 +6974,8 @@ __m128i test_mm_cvtepi32_epi8(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi32_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.db.128 + // CHECK: trunc <4 x i32> %{{.*}} to <4 x i8> + // CHECK: shufflevector <4 x i8> %{{.*}}, <4 x i8> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> return _mm_cvtepi32_epi8(__A); } @@ -6998,7 +6999,8 @@ __m128i test_mm256_cvtepi32_epi8(__m256i __A) { // CHECK-LABEL: @test_mm256_cvtepi32_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.db.256 + // CHECK: trunc <8 x i32> %{{.*}} to <8 x i8> + // CHECK: shufflevector <8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> return _mm256_cvtepi32_epi8(__A); } @@ -7022,7 +7024,8 @@ __m128i test_mm_cvtepi32_epi16(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi32_epi16 - // CHECK: @llvm.x86.avx512.mask.pmov.dw.128 + // CHECK: trunc <4 x i32> %{{.*}} to <4 x i16> + // CHECK: shufflevector <4 x i16> %{{.*}}, <4 x i16> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, 
i32 5, i32 6, i32 7> return _mm_cvtepi32_epi16(__A); } @@ -7070,7 +7073,8 @@ __m128i test_mm_cvtepi64_epi8(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi64_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.qb.128 + // CHECK: trunc <2 x i64> %{{.*}} to <2 x i8> + // CHECK: shufflevector <2 x i8> %{{.*}}, <2 x i8> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> return _mm_cvtepi64_epi8(__A); } @@ -7094,7 +7098,8 @@ __m128i test_mm256_cvtepi64_epi8(__m256i __A) { // CHECK-LABEL: @test_mm256_cvtepi64_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.qb.256 + // CHECK: trunc <4 x i64> %{{.*}} to <4 x i8> + // CHECK: shufflevector <4 x i8> %{{.*}}, <4 x i8> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> return _mm256_cvtepi64_epi8(__A); } @@ -7118,7 +7123,8 @@ __m128i test_mm_cvtepi64_epi32(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi64_epi32 - // CHECK: @llvm.x86.avx512.mask.pmov.qd.128 + // CHECK: trunc <2 x i64> %{{.*}} to <2 x i32> + // CHECK: shufflevector <2 x i32> %{{.*}}, <2 x i32> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3> return _mm_cvtepi64_epi32(__A); } @@ -7168,7 +7174,8 @@ __m128i test_mm_cvtepi64_epi16(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi64_epi16 - // CHECK: @llvm.x86.avx512.mask.pmov.qw.128 + // CHECK: trunc <2 x i64> %{{.*}} to <2 x i16> + // CHECK: shufflevector <2 x i16> %{{.*}}, <2 x i16> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3> return _mm_cvtepi64_epi16(__A); } @@ -7192,7 +7199,8 @@ __m128i test_mm256_cvtepi64_epi16(__m256i __A) { // CHECK-LABEL: @test_mm256_cvtepi64_epi16 - // CHECK: @llvm.x86.avx512.mask.pmov.qw.256 + // CHECK: trunc <4 x i64> %{{.*}} to <4 x i16> + // CHECK: shufflevector <4 x i16> %{{.*}}, <4 x i16> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> return _mm256_cvtepi64_epi16(__A); } Index: 
clang/lib/Headers/avx512vlintrin.h =================================================================== --- clang/lib/Headers/avx512vlintrin.h +++ clang/lib/Headers/avx512vlintrin.h @@ -30,6 +30,7 @@ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"))) +typedef short __v2hi __attribute__((__vector_size__(4))); typedef char __v4qi __attribute__((__vector_size__(4))); typedef char __v2qi __attribute__((__vector_size__(2))); @@ -7415,10 +7416,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi8 (__m128i __A) { - return __builtin_shufflevector(__builtin_convertvector((__v4si)__A, __v4qi), - (__v4qi) {0, 0, 0, 0}, - 0, 1, 2, 3, 4, 5, 6, 7, - 7, 7, 7, 7, 7, 7, 7, 7); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v4si)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1, + 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -7446,10 +7446,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm256_cvtepi32_epi8 (__m256i __A) { - return __builtin_shufflevector(__builtin_convertvector((__v8si)__A, __v8qi), - (__v8qi) {0, 0, 0, 0, 0, 0, 0, 0}, - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v8si)__A, __v8qi), + (__v8qi){0, 0, 0, 0, 0, 0, 0, 0}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -7476,9 +7476,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi16 (__m128i __A) { - return __builtin_shufflevector(__builtin_convertvector((__v4si)__A, __v4hi), - (__v4hi) {0, 0, 0, 0}, - 0, 1, 2, 3, 4, 5, 6, 7); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v4si)__A, __v4hi), (__v4hi){0, 0, 0, 0}, 0, 1, + 2, 3, 4, 5, 6, 7); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -7532,10 +7532,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi64_epi8 (__m128i __A) { - return 
__builtin_shufflevector(__builtin_convertvector((__v2di)__A, __v2qi), - (__v2qi) {0, 0}, - 0, 1, 2, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v2di)__A, __v2qi), (__v2qi){0, 0}, 0, 1, 2, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -7562,10 +7561,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm256_cvtepi64_epi8 (__m256i __A) { - return __builtin_shufflevector(__builtin_convertvector((__v4di)__A, __v4qi), - (__v4qi) {0, 0, 0, 0}, - 0, 1, 2, 3, 4, 5, 6, 7, - 7, 7, 7, 7, 7, 7, 7, 7); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v4di)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1, + 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -7592,9 +7590,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi64_epi32 (__m128i __A) { - return __builtin_shufflevector(__builtin_convertvector((__v2di)__A, __v2si), - (__v2si) {0, 0}, - 0, 1, 2, 3); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v2di)__A, __v2si), (__v2si){0, 0}, 0, 1, 2, 3); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -7649,9 +7646,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi64_epi16 (__m128i __A) { - return __builtin_shufflevector(__builtin_convertvector((__v2di)__A, __v2hi), - (__v2hi) {0, 0}, - 0, 1, 2, 3, 3, 3, 3, 3); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v2di)__A, __v2hi), (__v2hi){0, 0}, 0, 1, 2, 3, + 3, 3, 3, 3); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -7679,9 +7676,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm256_cvtepi64_epi16 (__m256i __A) { - return __builtin_shufflevector(__builtin_convertvector((__v4di)__A, __v4hi), - (__v4hi) {0, 0, 0, 0}, - 0, 1, 2, 3, 4, 5, 6, 7); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v4di)__A, __v4hi), (__v4hi){0, 0, 0, 0}, 0, 1, + 2, 3, 4, 5, 6, 7); } static 
__inline__ __m128i __DEFAULT_FN_ATTRS Index: clang/lib/Headers/avx512vlbwintrin.h =================================================================== --- clang/lib/Headers/avx512vlbwintrin.h +++ clang/lib/Headers/avx512vlbwintrin.h @@ -1495,10 +1495,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi8 (__m128i __A) { - return __builtin_shufflevector(__builtin_convertvector((__v8hi)__A, __v8qi), - (__v8qi) {0, 0, 0, 0, 0, 0, 0, 0}, - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v8hi)__A, __v8qi), + (__v8qi){0, 0, 0, 0, 0, 0, 0, 0}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15); } static __inline__ __m128i __DEFAULT_FN_ATTRS
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits