[PATCH] D38231: fixing a bug in mask[z]_set1
This revision was automatically updated to reflect the committed changes. Closed by commit rL314102: fixing a bug in mask[z]_set1 intrinsic (authored by jina.nahias). Changed prior to commit: https://reviews.llvm.org/D38231?vs=116513&id=116545#toc Repository: rL LLVM https://reviews.llvm.org/D38231 Files: cfe/trunk/lib/Headers/avx512vlintrin.h cfe/trunk/test/CodeGen/avx512vl-builtins.c Index: cfe/trunk/lib/Headers/avx512vlintrin.h === --- cfe/trunk/lib/Headers/avx512vlintrin.h +++ cfe/trunk/lib/Headers/avx512vlintrin.h @@ -5761,15 +5761,15 @@ _mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) { return (__m128i) __builtin_ia32_selectq_128(__M, - (__v2di) _mm_set1_epi8(__A), + (__v2di) _mm_set1_epi64x(__A), (__v2di) __O); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_set1_epi64 (__mmask8 __M, long long __A) { return (__m128i) __builtin_ia32_selectq_128(__M, - (__v2di) _mm_set1_epi8(__A), + (__v2di) _mm_set1_epi64x(__A), (__v2di) _mm_setzero_si128()); } Index: cfe/trunk/test/CodeGen/avx512vl-builtins.c === --- cfe/trunk/test/CodeGen/avx512vl-builtins.c +++ cfe/trunk/test/CodeGen/avx512vl-builtins.c @@ -4563,45 +4563,17 @@ #ifdef __x86_64__ __m128i test_mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) { // CHECK-LABEL: @test_mm_mask_set1_epi64 - // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15 + // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0 + // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1 // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask_set1_epi64(__O, __M, __A); } __m128i test_mm_maskz_set1_epi64(__mmask8 __M, long long __A) { // CHECK-LABEL: @test_mm_maskz_set1_epi64 - // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12 - // 
CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15 + // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0 + // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1 // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_maskz_set1_epi64(__M, __A); Index: cfe/trunk/lib/Headers/avx512vlintrin.h === --- cfe/trunk/lib/Headers/avx512vlintrin.h +++ cfe/trunk/lib/Headers/avx512vlintrin.h @@ -5761,15 +5761,15 @@ _mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) { return (__m128i) __builtin_ia32_selectq_128(__M, - (__v2di) _mm_set1_epi8(__A), +
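For readers following the fix above, here is a minimal standalone sketch (not part of the patch; the values, the demo program, and the build assumptions of a 64-bit target with -mavx512f -mavx512vl are my own) of the intended semantics of the corrected _mm_mask_set1_epi64: the lane selected by the mask receives the full 64-bit value, which the old _mm_set1_epi8-based splat only reproduced byte by byte.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128i old = _mm_set_epi64x(111, 222);   /* lane1 = 111, lane0 = 222 */
  /* mask 0x1 selects only lane 0; lane 1 keeps its old value */
  __m128i r = _mm_mask_set1_epi64(old, 0x1, 0x1122334455667788LL);
  long long out[2];
  _mm_storeu_si128((__m128i *)out, r);
  /* expected: out[0] == 0x1122334455667788, out[1] == 111 */
  printf("%llx %lld\n", (unsigned long long)out[0], out[1]);
  return 0;
}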
[PATCH] D38672: lowering shuffle f/i intrinsic - clang part
jina.nahias created this revision. https://reviews.llvm.org/D38672 Files: lib/Headers/avx512fintrin.h lib/Headers/avx512vlintrin.h test/CodeGen/avx512f-builtins.c test/CodeGen/avx512vl-builtins.c Index: test/CodeGen/avx512vl-builtins.c === --- test/CodeGen/avx512vl-builtins.c +++ test/CodeGen/avx512vl-builtins.c @@ -5630,73 +5630,85 @@ } __m256 test_mm256_shuffle_f32x4(__m256 __A, __m256 __B) { // CHECK-LABEL: @test_mm256_shuffle_f32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.f32x4 + // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> return _mm256_shuffle_f32x4(__A, __B, 3); } __m256 test_mm256_mask_shuffle_f32x4(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: @test_mm256_mask_shuffle_f32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.f32x4 + // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_shuffle_f32x4(__W, __U, __A, __B, 3); } __m256 test_mm256_maskz_shuffle_f32x4(__mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: @test_mm256_maskz_shuffle_f32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.f32x4 + // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_shuffle_f32x4(__U, __A, __B, 3); } __m256d test_mm256_shuffle_f64x2(__m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_shuffle_f64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.f64x2 + // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> return _mm256_shuffle_f64x2(__A, __B, 3); } __m256d test_mm256_mask_shuffle_f64x2(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_mask_shuffle_f64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.f64x2 + // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_shuffle_f64x2(__W, __U, __A, __B, 3); } __m256d test_mm256_maskz_shuffle_f64x2(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_maskz_shuffle_f64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.f64x2 + // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> + // CHECK: shufflevector <8 x i1> %{{.*}}1, <8 x i1> %{{.*}}, <4 x i32> + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_shuffle_f64x2(__U, __A, __B, 3); } __m256i test_mm256_shuffle_i32x4(__m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_shuffle_i32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.i32x4 + // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> return _mm256_shuffle_i32x4(__A, __B, 3); } __m256i test_mm256_mask_shuffle_i32x4(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_shuffle_i32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.i32x4 + // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> + // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_shuffle_i32x4(__W, __U, __A, __B, 3); } __m256i test_mm256_maskz_shuffle_i32x4(__mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_shuffle_i32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.i32x4 + // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> + // CHECK: select <8 x i1> 
%{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_shuffle_i32x4(__U, __A, __B, 3); } __m256i test_mm256_shuffle_i64x2(__m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_shuffle_i64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.i64x2 + // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> return _mm256_shuffle_i64x2(__A, __B, 3); } __m256i test_mm256_mask_shuffle_i64x2(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_shuffle_i64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.i64x2 + // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> + // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_shuffle_i64x2(__W, __U, __A, __B, 3); } __m256i test_mm256_maskz_shuffle_i64x2(__mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_shuffle_i64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.i64x2 + // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> + // CHECK: select <4 x i1> %{{.*}},
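As context for the change above (a sketch under my own assumptions, not taken from the patch): the unmasked 256-bit lane shuffles that used to call the @llvm.x86.avx512.mask.shuf.* builtins are plain 128-bit-lane selections, so they can be expressed as a generic shufflevector. The helper name below is hypothetical; it assumes AVX512F and AVX512VL.

#include <immintrin.h>

/* imm8 bit 0 picks the 128-bit lane taken from a, bit 1 the lane taken from b;
   with imm8 == 3 the high lane of each source is selected. */
__m256d pick_high_lanes(__m256d a, __m256d b) {
  return _mm256_shuffle_f64x2(a, b, 3);
  /* element order of the result: { a[2], a[3], b[2], b[3] } */
}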
[PATCH] D38672: [X86][AVX512] lowering shuffle f/i intrinsic - clang part
jina.nahias updated this revision to Diff 118166. https://reviews.llvm.org/D38672 Files: lib/Headers/avx512fintrin.h lib/Headers/avx512vlintrin.h test/CodeGen/avx512f-builtins.c test/CodeGen/avx512vl-builtins.c Index: test/CodeGen/avx512vl-builtins.c === --- test/CodeGen/avx512vl-builtins.c +++ test/CodeGen/avx512vl-builtins.c @@ -5630,73 +5630,85 @@ } __m256 test_mm256_shuffle_f32x4(__m256 __A, __m256 __B) { // CHECK-LABEL: @test_mm256_shuffle_f32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.f32x4 + // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> return _mm256_shuffle_f32x4(__A, __B, 3); } __m256 test_mm256_mask_shuffle_f32x4(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: @test_mm256_mask_shuffle_f32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.f32x4 + // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_shuffle_f32x4(__W, __U, __A, __B, 3); } __m256 test_mm256_maskz_shuffle_f32x4(__mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: @test_mm256_maskz_shuffle_f32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.f32x4 + // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_shuffle_f32x4(__U, __A, __B, 3); } __m256d test_mm256_shuffle_f64x2(__m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_shuffle_f64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.f64x2 + // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> return _mm256_shuffle_f64x2(__A, __B, 3); } __m256d test_mm256_mask_shuffle_f64x2(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_mask_shuffle_f64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.f64x2 + // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_shuffle_f64x2(__W, __U, __A, __B, 3); } __m256d test_mm256_maskz_shuffle_f64x2(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_maskz_shuffle_f64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.f64x2 + // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_shuffle_f64x2(__U, __A, __B, 3); } __m256i test_mm256_shuffle_i32x4(__m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_shuffle_i32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.i32x4 + // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> return _mm256_shuffle_i32x4(__A, __B, 3); } __m256i test_mm256_mask_shuffle_i32x4(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_shuffle_i32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.i32x4 + // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> + // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_shuffle_i32x4(__W, __U, __A, __B, 3); } __m256i test_mm256_maskz_shuffle_i32x4(__mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_shuffle_i32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.i32x4 + // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> + // CHECK: select <8 x i1> %{{.*}}, <8 
x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_shuffle_i32x4(__U, __A, __B, 3); } __m256i test_mm256_shuffle_i64x2(__m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_shuffle_i64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.i64x2 + // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> return _mm256_shuffle_i64x2(__A, __B, 3); } __m256i test_mm256_mask_shuffle_i64x2(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_shuffle_i64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.i64x2 + // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> + // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_shuffle_i64x2(__W, __U, __A, __B, 3); } __m256i test_mm256_maskz_shuffle_i64x2(__mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_shuffle_i64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.i64x2 + // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> + // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*
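One way to read the new masked CHECK patterns (sketch only; the helper and the use of _mm256_mask_mov_epi32 are my own illustration, not how the header is written): the masked shuffle is the unmasked shufflevector followed by a per-element select against the pass-through operand.

#include <immintrin.h>

__m256i masked_shuffle_i32x4(__m256i w, __mmask8 u, __m256i a, __m256i b) {
  __m256i shuf = _mm256_shuffle_i32x4(a, b, 3);   /* the plain shufflevector */
  /* equivalent to _mm256_mask_shuffle_i32x4(w, u, a, b, 3):
     blend the shuffle result into w under the per-dword mask u */
  return _mm256_mask_mov_epi32(w, u, shuf);
}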
[PATCH] D38683: [X86][AVX512] lowering broadcastm intrinsic - clang part
jina.nahias created this revision. https://reviews.llvm.org/D38683 Files: include/clang/Basic/BuiltinsX86.def lib/Headers/avx512cdintrin.h lib/Headers/avx512vlcdintrin.h test/CodeGen/avx512cdintrin.c test/CodeGen/avx512vlcd-builtins.c Index: test/CodeGen/avx512vlcd-builtins.c === --- test/CodeGen/avx512vlcd-builtins.c +++ test/CodeGen/avx512vlcd-builtins.c @@ -3,28 +3,56 @@ #include -__m128i test_mm_broadcastmb_epi64(__mmask8 __A) { +__m128i test_mm_broadcastmb_epi64(__m128i a,__m128i b) { // CHECK-LABEL: @test_mm_broadcastmb_epi64 - // CHECK: @llvm.x86.avx512.broadcastmb.128 - return _mm_broadcastmb_epi64(__A); + // CHECK: icmp eq <4 x i32> %{{.*}}, %{{.*}} + // CHECK: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> + // CHECK: bitcast <8 x i1> %{{.*}} to i8 + // CHECK: zext i8 %{{.*}} to i64 + // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0 + // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1 + return _mm_broadcastmb_epi64(_mm_cmpeq_epi32_mask (a, b)); } -__m256i test_mm256_broadcastmb_epi64(__mmask8 __A) { +__m256i test_mm256_broadcastmb_epi64(__m256i a, __m256i b) { // CHECK-LABEL: @test_mm256_broadcastmb_epi64 - // CHECK: @llvm.x86.avx512.broadcastmb.256 - return _mm256_broadcastmb_epi64(__A); -} - -__m128i test_mm_broadcastmw_epi32(__mmask16 __A) { + // CHECK: icmp eq <4 x i64> %{{.*}}, %{{.*}} + // CHECK: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> + // CHECK: bitcast <8 x i1> %{{.*}} to i8 + // CHECK: zext i8 %{{.*}} to i64 + // CHECK: insertelement <4 x i64> undef, i64 %{{.*}}, i32 0 + // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 1 + // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 2 + // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 3 + return _mm256_broadcastmb_epi64(_mm256_cmpeq_epi64_mask ( a, b)); +} + +__m128i test_mm_broadcastmw_epi32(__m512i a, __m512i b) { // CHECK-LABEL: @test_mm_broadcastmw_epi32 - // CHECK: @llvm.x86.avx512.broadcastmw.128 - return _mm_broadcastmw_epi32(__A); + // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}} + // CHECK: bitcast <16 x i1> %{{.*}} to i16 + // CHECK: zext i16 %{{.*}} to i32 + // CHECK: insertelement <4 x i32> undef, i32 %{{.*}}, i32 0 + // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 1 + // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 2 + // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 3 + return _mm_broadcastmw_epi32(_mm512_cmpeq_epi32_mask ( a, b)); } -__m256i test_mm256_broadcastmw_epi32(__mmask16 __A) { +__m256i test_mm256_broadcastmw_epi32(__m512i a, __m512i b) { // CHECK-LABEL: @test_mm256_broadcastmw_epi32 - // CHECK: @llvm.x86.avx512.broadcastmw.256 - return _mm256_broadcastmw_epi32(__A); + // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}} + // CHECK: bitcast <16 x i1> %{{.*}} to i16 + // CHECK: zext i16 %{{.*}} to i32 + // CHECK: insertelement <8 x i32> undef, i32 %{{.*}}, i32 0 + // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 1 + // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 2 + // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 3 + // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 4 + // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 5 + // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 6 + // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 7 + return _mm256_broadcastmw_epi32(_mm512_cmpeq_epi32_mask ( a, b)); } __m128i test_mm_conflict_epi64(__m128i __A) { Index: test/CodeGen/avx512cdintrin.c === --- 
test/CodeGen/avx512cdintrin.c +++ test/CodeGen/avx512cdintrin.c @@ -68,14 +68,40 @@ return _mm512_maskz_lzcnt_epi64(__U,__A); } -__m512i test_mm512_broadcastmb_epi64(__mmask8 __A) { +__m512i test_mm512_broadcastmb_epi64(__m512i a, __m512i b) { // CHECK-LABEL: @test_mm512_broadcastmb_epi64 - // CHECK: @llvm.x86.avx512.broadcastmb.512 - return _mm512_broadcastmb_epi64(__A); + // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}} + // CHECK: zext i8 %{{.*}} to i64 + // CHECK: insertelement <8 x i64> undef, i64 %{{.*}}, i32 0 + // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 1 + // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 2 + // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 3 + // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 4 + // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 5 + // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 6 + // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 7 + return _mm512_broadcastmb_epi64(_mm512_cmpeq_epu64_mask ( a, b)); } -__m512i test_mm512_broadcastmw_epi32(__mmask16 __A) { +__m512i test_mm512_broadcastmw_epi32(__m512i a, __m512i b) { // CHECK-LABEL: @test_mm512_broadcastmw_epi32 - // CHECK: @
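A minimal usage sketch of the lowered broadcastm intrinsics (the function name and usage are hypothetical; assumes AVX512CD and AVX512VL): the compare produces a mask, and broadcastmb zero-extends that mask and replicates it into every 64-bit element, which is the icmp/zext/insertelement sequence the new CHECK lines describe.

#include <immintrin.h>

__m128i mask_to_qword_lanes(__m128i a, __m128i b) {
  __mmask8 k = _mm_cmpeq_epi32_mask(a, b);   /* one mask bit per dword compare */
  return _mm_broadcastmb_epi64(k);           /* both qword lanes == (u64)k */
}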
[PATCH] D38683: [X86][AVX512] lowering broadcastm intrinsic - clang part
jina.nahias added inline comments.

Comment at: test/CodeGen/avx512cdintrin.c:106
+  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}
+  return _mm512_broadcastmw_epi32(_mm512_cmpeq_epi32_mask ( a, b)); }

RKSimon wrote:
> Any reason why you can't use the actual insertion indices instead of regexps?

Yes: the cmpeq intrinsic returns a mask type. If we don't feed a mask type to the broadcastm intrinsic, it does a regular broadcast rather than a mask broadcast.

https://reviews.llvm.org/D38683
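A small illustration of the point above (my own sketch, not from the review; both calls are valid C, and as I read the explanation only the first one can be matched to the dedicated mask-broadcast instruction, while a plain integer argument lowers to an ordinary broadcast of that value):

#include <immintrin.h>

__m128i from_compare(__m128i a, __m128i b) {
  return _mm_broadcastmb_epi64(_mm_cmpeq_epi32_mask(a, b));  /* mask broadcast */
}

__m128i from_constant(void) {
  return _mm_broadcastmb_epi64(0xF);  /* simply broadcasts the value 0xF */
}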
[PATCH] D37668: [X86][intrinsics] lower _mm[256|512]_mask[z]_set1_epi[8|16|32|64] intrinsic to IR
jina.nahias created this revision. this is clang part , the llvm part is https://reviews.llvm.org/differential/diff/114515/ https://reviews.llvm.org/D37668 Files: lib/Headers/avx512bwintrin.h lib/Headers/avx512fintrin.h lib/Headers/avx512vlbwintrin.h lib/Headers/avx512vlintrin.h test/CodeGen/avx512bw-builtins.c test/CodeGen/avx512f-builtins.c test/CodeGen/avx512vl-builtins.c test/CodeGen/avx512vlbw-builtins.c Index: test/CodeGen/avx512vlbw-builtins.c === --- test/CodeGen/avx512vlbw-builtins.c +++ test/CodeGen/avx512vlbw-builtins.c @@ -2602,28 +2602,196 @@ // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_broadcastw_epi16(__M, __A); } + __m128i test_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A){ +// CHECK-LABEL: @test_mm_mask_set1_epi8 +// CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15 +// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + return _mm_mask_set1_epi8(__O, __M, __A); + } + + __m128i test_mm_maskz_set1_epi8 ( __mmask16 __M, char __A){ +// CHECK-LABEL: @test_mm_maskz_set1_epi8 + // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15 +// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + return _mm_maskz_set1_epi8( __M, __A); + } + +__m256i test_mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) { + // CHECK-LABEL: @test_mm256_mask_set1_epi8 +// CHECK: insertelement <32 x i8> undef, i8 %{{.*}}, i32 0 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 1 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 2 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 3 + // CHECK: insertelement <32 x i8> 
%{{.*}}, i8 %{{.*}}, i32 4 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 5 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 6 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 7 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 8 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 9 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 10 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 11 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 12 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 13 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 14 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 15 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 16 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 17 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 18 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 19 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 20 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 21 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 22 + // CHECK: in
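For context on what the new CHECK sequences mean (sketch only; the helper name and the _mm256_mask_mov_epi8 formulation are my own, not the header's implementation; assumes AVX512BW and AVX512VL): a masked set1 is just an ordinary splat blended into the old vector under the write mask.

#include <immintrin.h>

__m256i equivalent_mask_set1_epi8(__m256i old, __mmask32 m, char a) {
  __m256i splat = _mm256_set1_epi8(a);         /* the 32 insertelement lines */
  return _mm256_mask_mov_epi8(old, m, splat);  /* the final per-byte select */
}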
[PATCH] D37668: [X86][intrinsics] lower _mm[256|512]_mask[z]_set1_epi[8|16|32|64] intrinsic to IR
jina.nahias updated this revision to Diff 114594. jina.nahias added a comment. delete from include/clang/Basic/BuiltinsX86.def and include/clang/Basic/BuiltinsX86_64.def https://reviews.llvm.org/D37668 Files: include/clang/Basic/BuiltinsX86.def include/clang/Basic/BuiltinsX86_64.def Index: include/clang/Basic/BuiltinsX86_64.def === --- include/clang/Basic/BuiltinsX86_64.def +++ include/clang/Basic/BuiltinsX86_64.def @@ -71,9 +71,6 @@ TARGET_BUILTIN(__builtin_ia32_bextri_u64, "ULLiULLiIULLi", "", "tbm") TARGET_BUILTIN(__builtin_ia32_lwpins64, "UcULLiUiUi", "", "lwp") TARGET_BUILTIN(__builtin_ia32_lwpval64, "vULLiUiUi", "", "lwp") -TARGET_BUILTIN(__builtin_ia32_pbroadcastq512_gpr_mask, "V8LLiLLiV8LLiUc", "", "avx512f") -TARGET_BUILTIN(__builtin_ia32_pbroadcastq128_gpr_mask, "V2LLiULLiV2LLiUc","","avx512vl") -TARGET_BUILTIN(__builtin_ia32_pbroadcastq256_gpr_mask, "V4LLiULLiV4LLiUc","","avx512vl") TARGET_BUILTIN(__builtin_ia32_vcvtsd2si64, "LLiV2dIi","","avx512f") TARGET_BUILTIN(__builtin_ia32_vcvtsd2usi64, "ULLiV2dIi","","avx512f") TARGET_BUILTIN(__builtin_ia32_vcvtss2si64, "LLiV4fIi","","avx512f") Index: include/clang/Basic/BuiltinsX86.def === --- include/clang/Basic/BuiltinsX86.def +++ include/clang/Basic/BuiltinsX86.def @@ -977,7 +977,6 @@ TARGET_BUILTIN(__builtin_ia32_pmuludq512, "V8LLiV16iV16i", "", "avx512f") TARGET_BUILTIN(__builtin_ia32_ptestmd512, "UsV16iV16iUs", "", "avx512f") TARGET_BUILTIN(__builtin_ia32_ptestmq512, "UcV8LLiV8LLiUc", "", "avx512f") -TARGET_BUILTIN(__builtin_ia32_pbroadcastd512_gpr_mask, "V16iiV16iUs", "", "avx512f") TARGET_BUILTIN(__builtin_ia32_pbroadcastq512_mem_mask, "V8LLiLLiV8LLiUc", "", "avx512f") TARGET_BUILTIN(__builtin_ia32_loaddqusi512_mask, "V16iiC*V16iUs", "", "avx512f") TARGET_BUILTIN(__builtin_ia32_loaddqudi512_mask, "V8LLiLLiC*V8LLiUc", "", "avx512f") @@ -1381,11 +1380,6 @@ TARGET_BUILTIN(__builtin_ia32_movdqa64load256_mask, "V4LLiV4LLiC*V4LLiUc","","avx512vl") TARGET_BUILTIN(__builtin_ia32_movdqa64store128_mask, "vV2LLi*V2LLiUc","","avx512f") TARGET_BUILTIN(__builtin_ia32_movdqa64store256_mask, "vV4LLi*V4LLiUc","","avx512f") -TARGET_BUILTIN(__builtin_ia32_pbroadcastb512_gpr_mask, "V64ccV64cULLi","","avx512bw") -TARGET_BUILTIN(__builtin_ia32_pbroadcastb128_gpr_mask, "V16ccV16cUs","","avx512bw,avx512vl") -TARGET_BUILTIN(__builtin_ia32_pbroadcastb256_gpr_mask, "V32ccV32cUi","","avx512bw,avx512vl") -TARGET_BUILTIN(__builtin_ia32_pbroadcastd128_gpr_mask, "V4iiV4iUc","","avx512vl") -TARGET_BUILTIN(__builtin_ia32_pbroadcastd256_gpr_mask, "V8iiV8iUc","","avx512vl") TARGET_BUILTIN(__builtin_ia32_vpmadd52huq512_mask, "V8LLiV8LLiV8LLiV8LLiUc","","avx512ifma") TARGET_BUILTIN(__builtin_ia32_vpmadd52huq512_maskz, "V8LLiV8LLiV8LLiV8LLiUc","","avx512ifma") TARGET_BUILTIN(__builtin_ia32_vpmadd52luq512_mask, "V8LLiV8LLiV8LLiV8LLiUc","","avx512ifma") @@ -1596,9 +1590,6 @@ TARGET_BUILTIN(__builtin_ia32_broadcastmb256, "V4LLiUc","","avx512cd,avx512vl") TARGET_BUILTIN(__builtin_ia32_broadcastmw128, "V4iUs","","avx512cd,avx512vl") TARGET_BUILTIN(__builtin_ia32_broadcastmw256, "V8iUs","","avx512cd,avx512vl") -TARGET_BUILTIN(__builtin_ia32_pbroadcastw512_gpr_mask, "V32shV32sUi","","avx512bw") -TARGET_BUILTIN(__builtin_ia32_pbroadcastw256_gpr_mask, "V16shV16sUs","","avx512bw,avx512vl") -TARGET_BUILTIN(__builtin_ia32_pbroadcastw128_gpr_mask, "V8ssV8sUc","","avx512bw,avx512vl") TARGET_BUILTIN(__builtin_ia32_pmovsdb512_mask, "V16cV16iV16cUs","","avx512f") TARGET_BUILTIN(__builtin_ia32_pmovsdb512mem_mask, "vV16c*V16iUs","","avx512f") 
TARGET_BUILTIN(__builtin_ia32_pmovswb512mem_mask, "vV32c*V32sUi","","avx512bw") Index: include/clang/Basic/BuiltinsX86_64.def === --- include/clang/Basic/BuiltinsX86_64.def +++ include/clang/Basic/BuiltinsX86_64.def @@ -71,9 +71,6 @@ TARGET_BUILTIN(__builtin_ia32_bextri_u64, "ULLiULLiIULLi", "", "tbm") TARGET_BUILTIN(__builtin_ia32_lwpins64, "UcULLiUiUi", "", "lwp") TARGET_BUILTIN(__builtin_ia32_lwpval64, "vULLiUiUi", "", "lwp") -TARGET_BUILTIN(__builtin_ia32_pbroadcastq512_gpr_mask, "V8LLiLLiV8LLiUc", "", "avx512f") -TARGET_BUILTIN(__builtin_ia32_pbroadcastq128_gpr_mask, "V2LLiULLiV2LLiUc","","avx512vl") -TARGET_BUILTIN(__builtin_ia32_pbroadcastq256_gpr_mask, "V4LLiULLiV4LLiUc","","avx512vl") TARGET_BUILTIN(__builtin_ia32_vcvtsd2si64, "LLiV2dIi","","avx512f") TARGET_BUILTIN(__builtin_ia32_vcvtsd2usi64, "ULLiV2dIi","","avx512f") TARGET_BUILTIN(__builtin_ia32_vcvtss2si64, "LLiV4fIi","","avx512f") Index: include/clang/Basic/BuiltinsX86.def === --- include/clang/Basic/BuiltinsX86.def +++ include/clang/Basic/BuiltinsX86.def @@ -977,7 +977,6 @@ TARGET_BUILTIN(__builtin_ia32_pmuludq512, "V8LLiV16iV16i", "", "avx512f") TARGET_BUILTIN(__builtin_ia32_ptestmd
[PATCH] D37668: [X86][intrinsics] lower _mm[256|512]_mask[z]_set1_epi[8|16|32|64] intrinsic to IR
jina.nahias updated this revision to Diff 114765. https://reviews.llvm.org/D37668 Files: include/clang/Basic/BuiltinsX86.def include/clang/Basic/BuiltinsX86_64.def lib/Headers/avx512bwintrin.h lib/Headers/avx512fintrin.h lib/Headers/avx512vlbwintrin.h lib/Headers/avx512vlintrin.h test/CodeGen/avx512bw-builtins.c test/CodeGen/avx512f-builtins.c test/CodeGen/avx512vl-builtins.c test/CodeGen/avx512vlbw-builtins.c Index: test/CodeGen/avx512vlbw-builtins.c === --- test/CodeGen/avx512vlbw-builtins.c +++ test/CodeGen/avx512vlbw-builtins.c @@ -2602,28 +2602,196 @@ // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_broadcastw_epi16(__M, __A); } + __m128i test_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A){ +// CHECK-LABEL: @test_mm_mask_set1_epi8 +// CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15 +// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + return _mm_mask_set1_epi8(__O, __M, __A); + } + + __m128i test_mm_maskz_set1_epi8 ( __mmask16 __M, char __A){ +// CHECK-LABEL: @test_mm_maskz_set1_epi8 + // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14 +// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15 +// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + return _mm_maskz_set1_epi8( __M, __A); + } + +__m256i test_mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) { + // CHECK-LABEL: @test_mm256_mask_set1_epi8 +// CHECK: insertelement <32 x i8> undef, i8 %{{.*}}, i32 0 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 1 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 2 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 3 + // CHECK: insertelement <32 x i8> 
%{{.*}}, i8 %{{.*}}, i32 4 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 5 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 6 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 7 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 8 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 9 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 10 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 11 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 12 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 13 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 14 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 15 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 16 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 17 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 18 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 19 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 20 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 21 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 22 + // CHECK:
[PATCH] D37668: [X86][intrinsics] lower _mm[256|512]_mask[z]_set1_epi[8|16|32|64] intrinsic to IR
jina.nahias updated this revision to Diff 114836. https://reviews.llvm.org/D37668 Files: include/clang/Basic/BuiltinsX86.def include/clang/Basic/BuiltinsX86_64.def lib/Headers/avx512bwintrin.h lib/Headers/avx512fintrin.h lib/Headers/avx512vlbwintrin.h lib/Headers/avx512vlintrin.h test/CodeGen/avx512bw-builtins.c test/CodeGen/avx512f-builtins.c test/CodeGen/avx512vl-builtins.c test/CodeGen/avx512vlbw-builtins.c Index: test/CodeGen/avx512vlbw-builtins.c === --- test/CodeGen/avx512vlbw-builtins.c +++ test/CodeGen/avx512vlbw-builtins.c @@ -2602,28 +2602,195 @@ // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_broadcastw_epi16(__M, __A); } +__m128i test_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A){ + // CHECK-LABEL: @test_mm_mask_set1_epi8 + // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15 + // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + return _mm_mask_set1_epi8(__O, __M, __A); +} +__m128i test_mm_maskz_set1_epi8 ( __mmask16 __M, char __A){ + // CHECK-LABEL: @test_mm_maskz_set1_epi8 + // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15 + // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + return _mm_maskz_set1_epi8( __M, __A); +} + +__m256i test_mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) { + // CHECK-LABEL: @test_mm256_mask_set1_epi8 + // CHECK: insertelement <32 x i8> undef, i8 %{{.*}}, i32 0 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 1 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 2 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 3 + // CHECK: 
insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 4 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 5 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 6 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 7 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 8 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 9 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 10 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 11 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 12 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 13 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 14 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 15 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 16 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 17 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 18 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 19 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 20 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 21 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 22 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 23 + // CHECK: insertelemen
[PATCH] D37668: [X86][intrinsics] lower _mm[256|512]_mask[z]_set1_epi[8|16|32|64] intrinsic to IR
jina.nahias updated this revision to Diff 114978. https://reviews.llvm.org/D37668 Files: include/clang/Basic/BuiltinsX86.def include/clang/Basic/BuiltinsX86_64.def lib/Headers/avx512bwintrin.h lib/Headers/avx512fintrin.h lib/Headers/avx512vlbwintrin.h lib/Headers/avx512vlintrin.h test/CodeGen/avx512bw-builtins.c test/CodeGen/avx512f-builtins.c test/CodeGen/avx512vl-builtins.c test/CodeGen/avx512vlbw-builtins.c Index: test/CodeGen/avx512vlbw-builtins.c === --- test/CodeGen/avx512vlbw-builtins.c +++ test/CodeGen/avx512vlbw-builtins.c @@ -2602,28 +2602,195 @@ // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_broadcastw_epi16(__M, __A); } +__m128i test_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A){ + // CHECK-LABEL: @test_mm_mask_set1_epi8 + // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15 + // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + return _mm_mask_set1_epi8(__O, __M, __A); +} +__m128i test_mm_maskz_set1_epi8 ( __mmask16 __M, char __A){ + // CHECK-LABEL: @test_mm_maskz_set1_epi8 + // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15 + // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + return _mm_maskz_set1_epi8( __M, __A); +} + +__m256i test_mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) { + // CHECK-LABEL: @test_mm256_mask_set1_epi8 + // CHECK: insertelement <32 x i8> undef, i8 %{{.*}}, i32 0 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 1 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 2 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 3 + // CHECK: 
insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 4 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 5 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 6 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 7 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 8 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 9 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 10 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 11 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 12 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 13 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 14 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 15 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 16 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 17 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 18 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 19 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 20 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 21 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 22 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 23 + // CHECK: insertelemen
[PATCH] D37668: [X86][intrinsics] lower _mm[256|512]_mask[z]_set1_epi[8|16|32|64] intrinsic to IR
jina.nahias added inline comments.

Comment at: include/clang/Basic/BuiltinsX86.def:981
-TARGET_BUILTIN(__builtin_ia32_pbroadcastd512_gpr_mask, "V16iiV16iUs", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_pbroadcastq512_mem_mask, "V8LLiLLiV8LLiUc", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_loaddqusi512_mask, "V16iiC*V16iUs", "", "avx512f")

craig.topper wrote:
> I think your patch removed the only use of
> __builtin_ia32_pbroadcastq512_mem_mask, right? Does your change work properly
> in 32-bit mode?

Yes to both questions: I have deleted __builtin_ia32_pbroadcastq512_mem_mask (you can see it in the new update), and the change works properly in 32-bit mode.

https://reviews.llvm.org/D37668
[PATCH] D37668: [X86][intrinsics] lower _mm[256|512]_mask[z]_set1_epi[8|16|32|64] intrinsic to IR
jina.nahias updated this revision to Diff 115622. jina.nahias added a comment. rebase on @craig.topper commit. https://reviews.llvm.org/D37668 Files: include/clang/Basic/BuiltinsX86.def include/clang/Basic/BuiltinsX86_64.def lib/Headers/avx512bwintrin.h lib/Headers/avx512fintrin.h lib/Headers/avx512vlbwintrin.h lib/Headers/avx512vlintrin.h test/CodeGen/avx512bw-builtins.c test/CodeGen/avx512f-builtins.c test/CodeGen/avx512vl-builtins.c test/CodeGen/avx512vlbw-builtins.c Index: test/CodeGen/avx512vlbw-builtins.c === --- test/CodeGen/avx512vlbw-builtins.c +++ test/CodeGen/avx512vlbw-builtins.c @@ -2670,28 +2670,195 @@ // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_broadcastw_epi16(__M, __A); } +__m128i test_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A){ + // CHECK-LABEL: @test_mm_mask_set1_epi8 + // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15 + // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + return _mm_mask_set1_epi8(__O, __M, __A); +} +__m128i test_mm_maskz_set1_epi8 ( __mmask16 __M, char __A){ + // CHECK-LABEL: @test_mm_maskz_set1_epi8 + // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14 + // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15 + // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + return _mm_maskz_set1_epi8( __M, __A); +} + +__m256i test_mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) { + // CHECK-LABEL: @test_mm256_mask_set1_epi8 + // CHECK: insertelement <32 x i8> undef, i8 %{{.*}}, i32 0 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 1 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 2 + // CHECK: 
insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 3 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 4 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 5 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 6 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 7 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 8 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 9 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 10 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 11 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 12 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 13 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 14 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 15 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 16 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 17 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 18 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 19 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 20 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 21 + // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 22 + // CHECK: insertelement
[PATCH] D37668: [X86][intrinsics] lower _mm[256|512]_mask[z]_set1_epi[8|16|32|64] intrinsic to IR
jina.nahias added inline comments.

Comment at: lib/Headers/avx512fintrin.h:9742
 #ifdef __x86_64__
 static __inline__ __m512i __DEFAULT_FN_ATTRS

craig.topper wrote:
> Please remove the #ifdef __x86_64__ from this. It should work in 32-bit mode as well.

The currently generated code for 32-bit is not optimal; 32-bit needs some more work, which will come in a follow-up patch.

https://reviews.llvm.org/D37668
[PATCH] D37668: [X86][intrinsics] lower _mm[256|512]_mask[z]_set1_epi[8|16|32|64] intrinsic to IR
This revision was automatically updated to reflect the committed changes. Closed by commit rL313624: Lowering Mask Set1 intrinsics to LLVM IR (authored by jina.nahias). Changed prior to commit: https://reviews.llvm.org/D37668?vs=115622&id=115823#toc Repository: rL LLVM https://reviews.llvm.org/D37668 Files: cfe/trunk/include/clang/Basic/BuiltinsX86.def cfe/trunk/include/clang/Basic/BuiltinsX86_64.def cfe/trunk/lib/Headers/avx512bwintrin.h cfe/trunk/lib/Headers/avx512fintrin.h cfe/trunk/lib/Headers/avx512vlbwintrin.h cfe/trunk/lib/Headers/avx512vlintrin.h cfe/trunk/test/CodeGen/avx512bw-builtins.c cfe/trunk/test/CodeGen/avx512f-builtins.c cfe/trunk/test/CodeGen/avx512vl-builtins.c cfe/trunk/test/CodeGen/avx512vlbw-builtins.c Index: cfe/trunk/lib/Headers/avx512vlintrin.h === --- cfe/trunk/lib/Headers/avx512vlintrin.h +++ cfe/trunk/lib/Headers/avx512vlintrin.h @@ -5723,59 +5723,72 @@ (__v4df)_mm256_setzero_pd()); } +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A) +{ + return (__m128i)__builtin_ia32_selectd_128(__M, + (__v4si) _mm_set1_epi32(__A), + (__v4si)__O); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maskz_set1_epi32( __mmask8 __M, int __A) +{ + return (__m128i)__builtin_ia32_selectd_128(__M, + (__v4si) _mm_set1_epi32(__A), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A) +{ + return (__m256i)__builtin_ia32_selectd_256(__M, + (__v8si) _mm256_set1_epi32(__A), + (__v8si)__O); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_set1_epi32( __mmask8 __M, int __A) +{ + return (__m256i)__builtin_ia32_selectd_256(__M, + (__v8si) _mm256_set1_epi32(__A), + (__v8si)_mm256_setzero_si256()); +} -#define _mm_mask_set1_epi32(O, M, A) __extension__ ({ \ - (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask((int)(A), \ - (__v4si)(__m128i)(O), \ - (__mmask8)(M)); }) - -#define _mm_maskz_set1_epi32(M, A) __extension__ ({ \ - (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask((int)(A), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)(M)); }) - -#define _mm256_mask_set1_epi32(O, M, A) __extension__ ({ \ - (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask((int)(A), \ - (__v8si)(__m256i)(O), \ - (__mmask8)(M)); }) - -#define _mm256_maskz_set1_epi32(M, A) __extension__ ({ \ - (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask((int)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(M)); }) #ifdef __x86_64__ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) { - return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A, (__v2di) __O, - __M); + return (__m128i) __builtin_ia32_selectq_128(__M, + (__v2di) _mm_set1_epi8(__A), + (__v2di) __O); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_set1_epi64 (__mmask8 __M, long long __A) { - return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A, - (__v2di) - _mm_setzero_si128 (), - __M); + return (__m128i) __builtin_ia32_selectq_128(__M, + (__v2di) _mm_set1_epi8(__A), + (__v2di) _mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A) { - return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, (__v4di) __O, - __M); + return (__m256i) __builtin_ia32_selectq_256(__M, + (__v4di) _mm256_set1_epi64x(__A), + (__v4di) __O) ; } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_set1_epi64 (__mmask8 __M, long long __A) { - return (__m256i) 
__builtin_ia32_pbroadcastq256_gpr_mask (__A, - (__v4di) - _mm256_setzero_si256 (), - __M); + return (__m256i) __builtin_i
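The committed headers express the zeroing variants the same way; below is a sketch of an equivalent formulation (hypothetical wrapper, not the header code itself; assumes AVX512F and AVX512VL). Lanes not selected by the mask become zero instead of being taken from a pass-through operand.

#include <immintrin.h>

__m256i equivalent_maskz_set1_epi32(__mmask8 m, int a) {
  __m256i splat = _mm256_set1_epi32(a);
  return _mm256_maskz_mov_epi32(m, splat);  /* zero the lanes m leaves unset */
}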
[PATCH] D38231: fixing a bug in mask[z]_set1
jina.nahias created this revision. https://reviews.llvm.org/D38231 Files: lib/Headers/avx512vlintrin.h Index: lib/Headers/avx512vlintrin.h === --- lib/Headers/avx512vlintrin.h +++ lib/Headers/avx512vlintrin.h @@ -5761,15 +5761,15 @@ _mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) { return (__m128i) __builtin_ia32_selectq_128(__M, - (__v2di) _mm_set1_epi8(__A), + (__v2di) _mm_set1_epi64(__A), (__v2di) __O); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_set1_epi64 (__mmask8 __M, long long __A) { return (__m128i) __builtin_ia32_selectq_128(__M, - (__v2di) _mm_set1_epi8(__A), + (__v2di) _mm_set1_epi64(__A), (__v2di) _mm_setzero_si128()); } Index: lib/Headers/avx512vlintrin.h === --- lib/Headers/avx512vlintrin.h +++ lib/Headers/avx512vlintrin.h @@ -5761,15 +5761,15 @@ _mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) { return (__m128i) __builtin_ia32_selectq_128(__M, - (__v2di) _mm_set1_epi8(__A), + (__v2di) _mm_set1_epi64(__A), (__v2di) __O); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_set1_epi64 (__mmask8 __M, long long __A) { return (__m128i) __builtin_ia32_selectq_128(__M, - (__v2di) _mm_set1_epi8(__A), + (__v2di) _mm_set1_epi64(__A), (__v2di) _mm_setzero_si128()); } ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D38231: fixing a bug in mask[z]_set1
jina.nahias updated this revision to Diff 116513. https://reviews.llvm.org/D38231 Files: lib/Headers/avx512vlintrin.h test/CodeGen/avx512vl-builtins.c Index: test/CodeGen/avx512vl-builtins.c === --- test/CodeGen/avx512vl-builtins.c +++ test/CodeGen/avx512vl-builtins.c @@ -4563,45 +4563,17 @@ #ifdef __x86_64__ __m128i test_mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) { // CHECK-LABEL: @test_mm_mask_set1_epi64 - // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15 + // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0 + // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1 // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask_set1_epi64(__O, __M, __A); } __m128i test_mm_maskz_set1_epi64(__mmask8 __M, long long __A) { // CHECK-LABEL: @test_mm_maskz_set1_epi64 - // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14 - // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15 + // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0 + // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1 // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_maskz_set1_epi64(__M, __A); Index: lib/Headers/avx512vlintrin.h === --- lib/Headers/avx512vlintrin.h +++ lib/Headers/avx512vlintrin.h @@ -5761,15 +5761,15 @@ _mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) { return (__m128i) __builtin_ia32_selectq_128(__M, - (__v2di) _mm_set1_epi8(__A), + (__v2di) _mm_set1_epi64x(__A), (__v2di) __O); } static __inline__ __m128i __DEFAULT_FN_ATTRS 
_mm_maskz_set1_epi64 (__mmask8 __M, long long __A) { return (__m128i) __builtin_ia32_selectq_128(__M, - (__v2di) _mm_set1_epi8(__A), + (__v2di) _mm_set1_epi64x(__A), (__v2di) _mm_setzero_si128()); }
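For illustration, here is a minimal, self-contained sketch (not part of the patch; the reference helper and test values are assumptions) of the behavior the fix restores: _mm_mask_set1_epi64 must broadcast the whole 64-bit value into both qword lanes, whereas the old _mm_set1_epi8 call only broadcast its low byte.

```
/* Minimal sketch, not from the patch: a scalar reference for the intended behavior of
   _mm_mask_set1_epi64 after the fix. Build with something like:
   clang -O2 -mavx512f -mavx512vl (x86_64 only for the 64-bit variant). */
#include <immintrin.h>
#include <stdio.h>
#include <string.h>

static __m128i ref_mask_set1_epi64(__m128i src, __mmask8 m, long long a) {
  long long out[2], s[2];
  _mm_storeu_si128((__m128i *)s, src);
  for (int i = 0; i < 2; ++i)
    out[i] = ((m >> i) & 1) ? a : s[i];   /* broadcast the full 64-bit value, not its low byte */
  return _mm_loadu_si128((const __m128i *)out);
}

int main(void) {
  __m128i src = _mm_set_epi64x(111, 222);
  long long a = 0x0123456789ABCDEFLL;
  __m128i got  = _mm_mask_set1_epi64(src, 0x1, a);
  __m128i want = ref_mask_set1_epi64(src, 0x1, a);
  printf("match: %d\n", memcmp(&got, &want, sizeof(got)) == 0);  /* expect 1 with the fixed header */
  return 0;
}
```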
[PATCH] D39719: [X86][AVX512] lowering kunpack intrinsic - clang part
jina.nahias added a comment. ping https://reviews.llvm.org/D39719
[PATCH] D39719: [X86][AVX512] lowering kunpack intrinsic - clang part
This revision was automatically updated to reflect the committed changes. Closed by commit rL319777: [x86][AVX512] Lowering kunpack intrinsics to LLVM IR (authored by jina.nahias). Changed prior to commit: https://reviews.llvm.org/D39719?vs=122617&id=125525#toc Repository: rL LLVM https://reviews.llvm.org/D39719 Files: cfe/trunk/lib/Headers/avx512bwintrin.h cfe/trunk/lib/Headers/avx512fintrin.h cfe/trunk/test/CodeGen/avx512bw-builtins.c cfe/trunk/test/CodeGen/avx512f-builtins.c Index: cfe/trunk/lib/Headers/avx512fintrin.h === --- cfe/trunk/lib/Headers/avx512fintrin.h +++ cfe/trunk/lib/Headers/avx512fintrin.h @@ -8787,7 +8787,7 @@ static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm512_kunpackb (__mmask16 __A, __mmask16 __B) { - return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); + return (__mmask16) (( __A & 0xFF) | ( __B << 8)); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS Index: cfe/trunk/lib/Headers/avx512bwintrin.h === --- cfe/trunk/lib/Headers/avx512bwintrin.h +++ cfe/trunk/lib/Headers/avx512bwintrin.h @@ -1854,15 +1854,13 @@ static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_kunpackd (__mmask64 __A, __mmask64 __B) { - return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, -(__mmask64) __B); + return (__mmask64) (( __A & 0x) | ( __B << 32)); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_kunpackw (__mmask32 __A, __mmask32 __B) { - return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, -(__mmask32) __B); +return (__mmask32) (( __A & 0x) | ( __B << 16)); } static __inline__ __m512i __DEFAULT_FN_ATTRS Index: cfe/trunk/test/CodeGen/avx512bw-builtins.c === --- cfe/trunk/test/CodeGen/avx512bw-builtins.c +++ cfe/trunk/test/CodeGen/avx512bw-builtins.c @@ -1626,16 +1626,26 @@ return _mm512_maskz_set1_epi8(__M, __A); } -__mmask64 test_mm512_kunpackd(__mmask64 __A, __mmask64 __B) { +__mmask64 test_mm512_kunpackd(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { // CHECK-LABEL: @test_mm512_kunpackd - // CHECK: @llvm.x86.avx512.kunpck.dq - return _mm512_kunpackd(__A, __B); + // CHECK: bitcast <64 x i1> %{{.*}} to i64 + // CHECK: bitcast <64 x i1> %{{.*}} to i64 + // CHECK: and i64 %{{.*}}, 4294967295 + // CHECK: shl i64 %{{.*}}, 32 + // CHECK: or i64 %{{.*}}, %{{.*}} + // CHECK: bitcast i64 %{{.*}} to <64 x i1> + return _mm512_mask_cmpneq_epu8_mask(_mm512_kunpackd(_mm512_cmpneq_epu8_mask(__B, __A),_mm512_cmpneq_epu8_mask(__C, __D)), __E, __F); } -__mmask32 test_mm512_kunpackw(__mmask32 __A, __mmask32 __B) { +__mmask32 test_mm512_kunpackw(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { // CHECK-LABEL: @test_mm512_kunpackw - // CHECK: @llvm.x86.avx512.kunpck.wd - return _mm512_kunpackw(__A, __B); + // CHECK: bitcast <32 x i1> %{{.*}} to i32 + // CHECK: bitcast <32 x i1> %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 65535 + // CHECK: shl i32 %{{.*}}, 16 + // CHECK: or i32 %{{.*}}, %{{.*}} + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + return _mm512_mask_cmpneq_epu16_mask(_mm512_kunpackw(_mm512_cmpneq_epu16_mask(__B, __A),_mm512_cmpneq_epu16_mask(__C, __D)), __E, __F); } __m512i test_mm512_mask_loadu_epi16(__m512i __W, __mmask32 __U, void const *__P) { Index: cfe/trunk/test/CodeGen/avx512f-builtins.c === --- cfe/trunk/test/CodeGen/avx512f-builtins.c +++ cfe/trunk/test/CodeGen/avx512f-builtins.c @@ -6241,10 +6241,17 @@ return _mm512_kortestz(__A, __B); } -__mmask16 test_mm512_kunpackb(__mmask16 __A, __mmask16 __B) { +__mmask16 test_mm512_kunpackb(__m512i __A, __m512i __B, __m512i __C, 
__m512i __D, __m512i __E, __m512i __F) { // CHECK-LABEL: @test_mm512_kunpackb - // CHECK: @llvm.x86.avx512.kunpck.bw - return _mm512_kunpackb(__A, __B); + // CHECK: bitcast <16 x i1> %{{.*}} to i16 + // CHECK: bitcast <16 x i1> %{{.*}} to i16 + // CHECK: and i32 %{{.*}}, 255 + // CHECK: shl i32 %{{.*}}, 8 + // CHECK: or i32 %{{.*}}, %{{.*}} + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + return _mm512_mask_cmpneq_epu32_mask(_mm512_kunpackb(_mm512_cmpneq_epu32_mask(__A, __B), + _mm512_cmpneq_epu32_mask(__C, __D)), + __E, __F); } __mmask16 test_mm512_kxnor(__mmask16 __A, __mmask16 __B) {
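As an aside, a tiny standalone sketch (illustration only; the helper name and sample values are assumptions, not patch code) of the scalar identity the new _mm512_kunpackb body relies on: the low 8 mask bits come from the first operand and the high 8 bits from the second.

```
/* Illustrative sketch of (( __A & 0xFF) | ( __B << 8)), the scalar form of kunpackb. */
#include <stdio.h>
#include <stdint.h>

static uint16_t kunpackb_ref(uint16_t a, uint16_t b) {
  return (uint16_t)((a & 0xFF) | (b << 8));   /* low byte from a, low byte of b shifted to bits 15:8 */
}

int main(void) {
  uint16_t a = 0xABCD, b = 0x1234;
  printf("0x%04X\n", (unsigned)kunpackb_ref(a, b));   /* prints 0x34CD */
  return 0;
}
```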
[PATCH] D39719: [X86][AVX512] lowering kunpack intrinsic - clang part
This revision was automatically updated to reflect the committed changes. Closed by commit rC319777: [x86][AVX512] Lowering kunpack intrinsics to LLVM IR (authored by jina.nahias). Repository: rC Clang https://reviews.llvm.org/D39719 Files: lib/Headers/avx512bwintrin.h lib/Headers/avx512fintrin.h test/CodeGen/avx512bw-builtins.c test/CodeGen/avx512f-builtins.c Index: lib/Headers/avx512bwintrin.h === --- lib/Headers/avx512bwintrin.h +++ lib/Headers/avx512bwintrin.h @@ -1854,15 +1854,13 @@ static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_kunpackd (__mmask64 __A, __mmask64 __B) { - return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, -(__mmask64) __B); + return (__mmask64) (( __A & 0x) | ( __B << 32)); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_kunpackw (__mmask32 __A, __mmask32 __B) { - return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, -(__mmask32) __B); +return (__mmask32) (( __A & 0x) | ( __B << 16)); } static __inline__ __m512i __DEFAULT_FN_ATTRS Index: lib/Headers/avx512fintrin.h === --- lib/Headers/avx512fintrin.h +++ lib/Headers/avx512fintrin.h @@ -8787,7 +8787,7 @@ static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm512_kunpackb (__mmask16 __A, __mmask16 __B) { - return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); + return (__mmask16) (( __A & 0xFF) | ( __B << 8)); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS Index: test/CodeGen/avx512f-builtins.c === --- test/CodeGen/avx512f-builtins.c +++ test/CodeGen/avx512f-builtins.c @@ -6241,10 +6241,17 @@ return _mm512_kortestz(__A, __B); } -__mmask16 test_mm512_kunpackb(__mmask16 __A, __mmask16 __B) { +__mmask16 test_mm512_kunpackb(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { // CHECK-LABEL: @test_mm512_kunpackb - // CHECK: @llvm.x86.avx512.kunpck.bw - return _mm512_kunpackb(__A, __B); + // CHECK: bitcast <16 x i1> %{{.*}} to i16 + // CHECK: bitcast <16 x i1> %{{.*}} to i16 + // CHECK: and i32 %{{.*}}, 255 + // CHECK: shl i32 %{{.*}}, 8 + // CHECK: or i32 %{{.*}}, %{{.*}} + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + return _mm512_mask_cmpneq_epu32_mask(_mm512_kunpackb(_mm512_cmpneq_epu32_mask(__A, __B), + _mm512_cmpneq_epu32_mask(__C, __D)), + __E, __F); } __mmask16 test_mm512_kxnor(__mmask16 __A, __mmask16 __B) { Index: test/CodeGen/avx512bw-builtins.c === --- test/CodeGen/avx512bw-builtins.c +++ test/CodeGen/avx512bw-builtins.c @@ -1626,16 +1626,26 @@ return _mm512_maskz_set1_epi8(__M, __A); } -__mmask64 test_mm512_kunpackd(__mmask64 __A, __mmask64 __B) { +__mmask64 test_mm512_kunpackd(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { // CHECK-LABEL: @test_mm512_kunpackd - // CHECK: @llvm.x86.avx512.kunpck.dq - return _mm512_kunpackd(__A, __B); + // CHECK: bitcast <64 x i1> %{{.*}} to i64 + // CHECK: bitcast <64 x i1> %{{.*}} to i64 + // CHECK: and i64 %{{.*}}, 4294967295 + // CHECK: shl i64 %{{.*}}, 32 + // CHECK: or i64 %{{.*}}, %{{.*}} + // CHECK: bitcast i64 %{{.*}} to <64 x i1> + return _mm512_mask_cmpneq_epu8_mask(_mm512_kunpackd(_mm512_cmpneq_epu8_mask(__B, __A),_mm512_cmpneq_epu8_mask(__C, __D)), __E, __F); } -__mmask32 test_mm512_kunpackw(__mmask32 __A, __mmask32 __B) { +__mmask32 test_mm512_kunpackw(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { // CHECK-LABEL: @test_mm512_kunpackw - // CHECK: @llvm.x86.avx512.kunpck.wd - return _mm512_kunpackw(__A, __B); + // CHECK: bitcast <32 x i1> %{{.*}} to i32 + // CHECK: bitcast <32 x i1> %{{.*}} to i32 + // CHECK: and i32 
%{{.*}}, 65535 + // CHECK: shl i32 %{{.*}}, 16 + // CHECK: or i32 %{{.*}}, %{{.*}} + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + return _mm512_mask_cmpneq_epu16_mask(_mm512_kunpackw(_mm512_cmpneq_epu16_mask(__B, __A),_mm512_cmpneq_epu16_mask(__C, __D)), __E, __F); } __m512i test_mm512_mask_loadu_epi16(__m512i __W, __mmask32 __U, void const *__P) {
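For reference, a minimal sketch (illustration only, not code from the patch) of the scalar form used for the word unpack, with the low-half mask written out in full as 0xFFFF, matching the 65535 constant in the CHECK lines above.

```
/* Illustrative sketch of (( __A & 0xFFFF) | ( __B << 16)), the scalar form of kunpackw. */
#include <stdio.h>
#include <stdint.h>

static uint32_t kunpackw_ref(uint32_t a, uint32_t b) {
  return (a & 0xFFFFu) | (b << 16);   /* low 16 mask bits from a, high 16 from b */
}

int main(void) {
  printf("%08x\n", kunpackw_ref(0x11112222u, 0x33334444u));  /* prints 44442222 */
  return 0;
}
```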
[PATCH] D38683: [X86][AVX512] lowering broadcastm intrinsic - clang part
jina.nahias added a comment. commit in https://reviews.llvm.org/rL317456 https://reviews.llvm.org/D38683
[PATCH] D39719: [X86][AVX512] lowering kunpack intrinsic - clang part
jina.nahias created this revision. https://reviews.llvm.org/D39719 Files: lib/Headers/avx512bwintrin.h lib/Headers/avx512fintrin.h test/CodeGen/avx512bw-builtins.c test/CodeGen/avx512f-builtins.c Index: test/CodeGen/avx512f-builtins.c === --- test/CodeGen/avx512f-builtins.c +++ test/CodeGen/avx512f-builtins.c @@ -6224,10 +6224,15 @@ return _mm512_kortestz(__A, __B); } -__mmask16 test_mm512_kunpackb(__mmask16 __A, __mmask16 __B) { +__mmask16 test_mm512_kunpackb(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { // CHECK-LABEL: @test_mm512_kunpackb - // CHECK: @llvm.x86.avx512.kunpck.bw - return _mm512_kunpackb(__A, __B); + // CHECK: bitcast <16 x i1> %{{.*}} to i16 + // CHECK: bitcast <16 x i1> %{{.*}} to i16 + // CHECK: and i32 %{{.*}}, 255 + // CHECK: shl i32 %{{.*}}, 8 + // CHECK: or i32 %{{.*}}, %{{.*}} + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + return _mm512_mask_cmpneq_epu32_mask(_mm512_kunpackb(_mm512_cmpneq_epu32_mask(__A,__B),_mm512_cmpneq_epu32_mask(__C,__D)),__E, __F); } __mmask16 test_mm512_kxnor(__mmask16 __A, __mmask16 __B) { Index: test/CodeGen/avx512bw-builtins.c === --- test/CodeGen/avx512bw-builtins.c +++ test/CodeGen/avx512bw-builtins.c @@ -1626,16 +1626,26 @@ return _mm512_maskz_set1_epi8(__M, __A); } -__mmask64 test_mm512_kunpackd(__mmask64 __A, __mmask64 __B) { +__mmask64 test_mm512_kunpackd(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { // CHECK-LABEL: @test_mm512_kunpackd - // CHECK: @llvm.x86.avx512.kunpck.dq - return _mm512_kunpackd(__A, __B); + // CHECK: bitcast <64 x i1> %{{.*}} to i64 + // CHECK: bitcast <64 x i1> %{{.*}} to i64 + // CHECK: and i64 %{{.*}}, 4294967295 + // CHECK: shl i64 %{{.*}}, 32 + // CHECK: or i64 %{{.*}}, %{{.*}} + // CHECK: bitcast i64 %{{.*}} to <64 x i1> + return _mm512_mask_cmpneq_epu8_mask(_mm512_kunpackd(_mm512_cmpneq_epu8_mask(__B, __A),_mm512_cmpneq_epu8_mask(__C, __D)), __E, __F); } -__mmask32 test_mm512_kunpackw(__mmask32 __A, __mmask32 __B) { +__mmask32 test_mm512_kunpackw(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { // CHECK-LABEL: @test_mm512_kunpackw - // CHECK: @llvm.x86.avx512.kunpck.wd - return _mm512_kunpackw(__A, __B); + // CHECK: bitcast <32 x i1> %{{.*}} to i32 + // CHECK: bitcast <32 x i1> %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 65535 + // CHECK: shl i32 %{{.*}}, 16 + // CHECK: or i32 %{{.*}}, %{{.*}} + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + return _mm512_mask_cmpneq_epu16_mask(_mm512_kunpackw(_mm512_cmpneq_epu16_mask(__B, __A),_mm512_cmpneq_epu16_mask(__C, __D)), __E, __F); } __m512i test_mm512_mask_loadu_epi16(__m512i __W, __mmask32 __U, void const *__P) { Index: lib/Headers/avx512fintrin.h === --- lib/Headers/avx512fintrin.h +++ lib/Headers/avx512fintrin.h @@ -9011,7 +9011,7 @@ static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm512_kunpackb (__mmask16 __A, __mmask16 __B) { - return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); + return (__mmask16) (( __B & 0xFF) | ((__mmask16) __A << 8)); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS Index: lib/Headers/avx512bwintrin.h === --- lib/Headers/avx512bwintrin.h +++ lib/Headers/avx512bwintrin.h @@ -2042,15 +2042,13 @@ static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_kunpackd (__mmask64 __A, __mmask64 __B) { - return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, -(__mmask64) __B); + return (__mmask64) (( __B & 0x) | ((__mmask64) __A << 32)); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_kunpackw (__mmask32 __A, 
__mmask32 __B) { - return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, -(__mmask32) __B); +return (__mmask32) (( __B & 0xFFFF) | ((__mmask32) __A << 16)); } static __inline__ __m512i __DEFAULT_FN_ATTRS
[PATCH] D39719: [X86][AVX512] lowering kunpack intrinsic - clang part
jina.nahias updated this revision to Diff 121899. https://reviews.llvm.org/D39719 Files: lib/Headers/avx512bwintrin.h lib/Headers/avx512fintrin.h test/CodeGen/avx512bw-builtins.c test/CodeGen/avx512f-builtins.c Index: test/CodeGen/avx512f-builtins.c === --- test/CodeGen/avx512f-builtins.c +++ test/CodeGen/avx512f-builtins.c @@ -6224,10 +6224,15 @@ return _mm512_kortestz(__A, __B); } -__mmask16 test_mm512_kunpackb(__mmask16 __A, __mmask16 __B) { +__mmask16 test_mm512_kunpackb(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { // CHECK-LABEL: @test_mm512_kunpackb - // CHECK: @llvm.x86.avx512.kunpck.bw - return _mm512_kunpackb(__A, __B); + // CHECK: bitcast <16 x i1> %{{.*}} to i16 + // CHECK: bitcast <16 x i1> %{{.*}} to i16 + // CHECK: and i32 %{{.*}}, 255 + // CHECK: shl i32 %{{.*}}, 8 + // CHECK: or i32 %{{.*}}, %{{.*}} + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + return _mm512_mask_cmpneq_epu32_mask(_mm512_kunpackb(_mm512_cmpneq_epu32_mask(__A,__B),_mm512_cmpneq_epu32_mask(__C,__D)),__E, __F); } __mmask16 test_mm512_kxnor(__mmask16 __A, __mmask16 __B) { Index: test/CodeGen/avx512bw-builtins.c === --- test/CodeGen/avx512bw-builtins.c +++ test/CodeGen/avx512bw-builtins.c @@ -1626,16 +1626,26 @@ return _mm512_maskz_set1_epi8(__M, __A); } -__mmask64 test_mm512_kunpackd(__mmask64 __A, __mmask64 __B) { +__mmask64 test_mm512_kunpackd(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { // CHECK-LABEL: @test_mm512_kunpackd - // CHECK: @llvm.x86.avx512.kunpck.dq - return _mm512_kunpackd(__A, __B); + // CHECK: bitcast <64 x i1> %{{.*}} to i64 + // CHECK: bitcast <64 x i1> %{{.*}} to i64 + // CHECK: and i64 %{{.*}}, 4294967295 + // CHECK: shl i64 %{{.*}}, 32 + // CHECK: or i64 %{{.*}}, %{{.*}} + // CHECK: bitcast i64 %{{.*}} to <64 x i1> + return _mm512_mask_cmpneq_epu8_mask(_mm512_kunpackd(_mm512_cmpneq_epu8_mask(__B, __A),_mm512_cmpneq_epu8_mask(__C, __D)), __E, __F); } -__mmask32 test_mm512_kunpackw(__mmask32 __A, __mmask32 __B) { +__mmask32 test_mm512_kunpackw(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { // CHECK-LABEL: @test_mm512_kunpackw - // CHECK: @llvm.x86.avx512.kunpck.wd - return _mm512_kunpackw(__A, __B); + // CHECK: bitcast <32 x i1> %{{.*}} to i32 + // CHECK: bitcast <32 x i1> %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 65535 + // CHECK: shl i32 %{{.*}}, 16 + // CHECK: or i32 %{{.*}}, %{{.*}} + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + return _mm512_mask_cmpneq_epu16_mask(_mm512_kunpackw(_mm512_cmpneq_epu16_mask(__B, __A),_mm512_cmpneq_epu16_mask(__C, __D)), __E, __F); } __m512i test_mm512_mask_loadu_epi16(__m512i __W, __mmask32 __U, void const *__P) { Index: lib/Headers/avx512fintrin.h === --- lib/Headers/avx512fintrin.h +++ lib/Headers/avx512fintrin.h @@ -9011,7 +9011,7 @@ static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm512_kunpackb (__mmask16 __A, __mmask16 __B) { - return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); + return (__mmask16) (( __A & 0xFF) | ( __B << 8)); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS Index: lib/Headers/avx512bwintrin.h === --- lib/Headers/avx512bwintrin.h +++ lib/Headers/avx512bwintrin.h @@ -2042,15 +2042,13 @@ static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_kunpackd (__mmask64 __A, __mmask64 __B) { - return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, -(__mmask64) __B); + return (__mmask64) (( __A & 0x) | ( __B << 32)); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_kunpackw (__mmask32 __A, __mmask32 
__B) { - return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, -(__mmask32) __B); +return (__mmask32) (( __A & 0xFFFF) | ( __B << 16)); } static __inline__ __m512i __DEFAULT_FN_ATTRS
[PATCH] D39719: [X86][AVX512] lowering kunpack intrinsic - clang part
jina.nahias added inline comments. Comment at: lib/Headers/avx512bwintrin.h:2045 { - return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, -(__mmask64) __B); + return (__mmask64) (( __B & 0xFFFFFFFF) | ((__mmask64) __A << 32)); } RKSimon wrote: > Is this right? The Intel docs says it should be: ``` > k[31:0] := a[31:0] > k[63:32] := b[31:0] > k[MAX:64] := 0 > ``` > > Also, is the cast on __A necessary? > > Same for the others. You are right, I fixed it. https://reviews.llvm.org/D39719
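To make the point of this exchange concrete, here is a tiny sketch (illustration only; the sample values are assumptions) contrasting the original operand order with the corrected one for the 64-bit unpack, taking the quoted Intel pseudocode as the reference.

```
/* Sketch contrasting the first diff's operand order with the corrected one for kunpackd. */
#include <stdio.h>
#include <stdint.h>

int main(void) {
  uint64_t a = 0xAAAAAAAAu, b = 0xBBBBBBBBu;              /* stand-ins for two __mmask64 inputs */
  uint64_t original  = (b & 0xFFFFFFFFULL) | (a << 32);   /* first diff: low 32 bits taken from b */
  uint64_t corrected = (a & 0xFFFFFFFFULL) | (b << 32);   /* matches k[31:0]:=a[31:0], k[63:32]:=b[31:0] */
  printf("original:  %016llx\ncorrected: %016llx\n",
         (unsigned long long)original, (unsigned long long)corrected);
  return 0;
}
```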
[PATCH] D38672: [X86][AVX512] lowering shuffle f/i intrinsic - clang part
jina.nahias updated this revision to Diff 121908. https://reviews.llvm.org/D38672 Files: lib/Headers/avx512fintrin.h lib/Headers/avx512vlintrin.h test/CodeGen/avx512f-builtins.c test/CodeGen/avx512vl-builtins.c Index: test/CodeGen/avx512vl-builtins.c === --- test/CodeGen/avx512vl-builtins.c +++ test/CodeGen/avx512vl-builtins.c @@ -5602,73 +5602,85 @@ } __m256 test_mm256_shuffle_f32x4(__m256 __A, __m256 __B) { // CHECK-LABEL: @test_mm256_shuffle_f32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.f32x4 + // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> return _mm256_shuffle_f32x4(__A, __B, 3); } __m256 test_mm256_mask_shuffle_f32x4(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: @test_mm256_mask_shuffle_f32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.f32x4 + // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_shuffle_f32x4(__W, __U, __A, __B, 3); } __m256 test_mm256_maskz_shuffle_f32x4(__mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: @test_mm256_maskz_shuffle_f32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.f32x4 + // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_shuffle_f32x4(__U, __A, __B, 3); } __m256d test_mm256_shuffle_f64x2(__m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_shuffle_f64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.f64x2 + // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> return _mm256_shuffle_f64x2(__A, __B, 3); } __m256d test_mm256_mask_shuffle_f64x2(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_mask_shuffle_f64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.f64x2 + // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_shuffle_f64x2(__W, __U, __A, __B, 3); } __m256d test_mm256_maskz_shuffle_f64x2(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: @test_mm256_maskz_shuffle_f64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.f64x2 + // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> + // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_shuffle_f64x2(__U, __A, __B, 3); } __m256i test_mm256_shuffle_i32x4(__m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_shuffle_i32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.i32x4 + // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> return _mm256_shuffle_i32x4(__A, __B, 3); } __m256i test_mm256_mask_shuffle_i32x4(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_shuffle_i32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.i32x4 + // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> + // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_shuffle_i32x4(__W, __U, __A, __B, 3); } __m256i test_mm256_maskz_shuffle_i32x4(__mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_shuffle_i32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.i32x4 + // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> + // CHECK: select <8 x i1> %{{.*}}, <8 
x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_shuffle_i32x4(__U, __A, __B, 3); } __m256i test_mm256_shuffle_i64x2(__m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_shuffle_i64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.i64x2 + // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> return _mm256_shuffle_i64x2(__A, __B, 3); } __m256i test_mm256_mask_shuffle_i64x2(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_shuffle_i64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.i64x2 + // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> + // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_shuffle_i64x2(__W, __U, __A, __B, 3); } __m256i test_mm256_maskz_shuffle_i64x2(__mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_shuffle_i64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.i64x2 + // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> + // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*
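As a rough aid to reading the new CHECK lines, a small sketch (an assumption about the immediate encoding, not code from the patch) of how the 2-bit immediate of the 256-bit shuffle_f32x4 maps to shufflevector indices: bit 0 picks the 128-bit lane of the first source for the low half of the result, bit 1 picks the lane of the second source for the high half.

```
/* Sketch of the assumed lane selection for _mm256_shuffle_f32x4(A, B, imm). */
#include <stdio.h>

int main(void) {
  int imm = 3;
  int lo_lane_from_A = imm & 1;          /* 1 -> upper 128 bits of A */
  int hi_lane_from_B = (imm >> 1) & 1;   /* 1 -> upper 128 bits of B */
  /* Equivalent <8 x i32> shuffle indices (B's elements are numbered 8..15): */
  for (int i = 0; i < 4; ++i) printf("%d ", lo_lane_from_A * 4 + i);
  for (int i = 0; i < 4; ++i) printf("%d ", 8 + hi_lane_from_B * 4 + i);
  printf("\n");                          /* prints: 4 5 6 7 12 13 14 15 */
  return 0;
}
```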
[PATCH] D39719: [X86][AVX512] lowering kunpack intrinsic - clang part
jina.nahias added inline comments. Comment at: test/CodeGen/avx512f-builtins.c:6231 + // CHECK: bitcast <16 x i1> %{{.*}} to i16 + // CHECK: and i32 %{{.*}}, 255 + // CHECK: shl i32 %{{.*}}, 8 craig.topper wrote: > Does this really produce kunpackb in the backend? The type promotion here > makes me skeptic Yes, this is the code we get: vpcmpneqd %zmm1, %zmm0, %k0 vpcmpneqd %zmm3, %zmm2, %k1 kunpckbw %k1, %k0, %k1 vpcmpneqd %zmm5, %zmm4, %k0 {%k1} kmovd %k0, %eax # kill: %AX %AX %EAX vzeroupper https://reviews.llvm.org/D39719
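For anyone who wants to reproduce that check locally, a minimal translation unit (a sketch under the assumption of a recent clang with AVX-512F enabled, not part of the patch); compiling it with something like `clang -O2 -mavx512f -S` lets one inspect whether the and/shl/or mask pattern is matched back into kunpckbw as shown above.

```
/* Standalone reproducer sketch: feed kunpackb two compare results so the mask
   pattern is live and can be observed in the generated assembly. */
#include <immintrin.h>

__mmask16 unpack_from_compares(__m512i a, __m512i b, __m512i c, __m512i d) {
  __mmask16 lo = _mm512_cmpneq_epu32_mask(a, b);   /* low 8 bits of the result */
  __mmask16 hi = _mm512_cmpneq_epu32_mask(c, d);   /* high 8 bits of the result */
  return _mm512_kunpackb(lo, hi);                  /* expected to lower to kunpckbw */
}
```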
[PATCH] D39719: [X86][AVX512] lowering kunpack intrinsic - clang part
jina.nahias updated this revision to Diff 122617. https://reviews.llvm.org/D39719 Files: lib/Headers/avx512bwintrin.h lib/Headers/avx512fintrin.h test/CodeGen/avx512bw-builtins.c test/CodeGen/avx512f-builtins.c Index: test/CodeGen/avx512f-builtins.c === --- test/CodeGen/avx512f-builtins.c +++ test/CodeGen/avx512f-builtins.c @@ -6224,10 +6224,17 @@ return _mm512_kortestz(__A, __B); } -__mmask16 test_mm512_kunpackb(__mmask16 __A, __mmask16 __B) { +__mmask16 test_mm512_kunpackb(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { // CHECK-LABEL: @test_mm512_kunpackb - // CHECK: @llvm.x86.avx512.kunpck.bw - return _mm512_kunpackb(__A, __B); + // CHECK: bitcast <16 x i1> %{{.*}} to i16 + // CHECK: bitcast <16 x i1> %{{.*}} to i16 + // CHECK: and i32 %{{.*}}, 255 + // CHECK: shl i32 %{{.*}}, 8 + // CHECK: or i32 %{{.*}}, %{{.*}} + // CHECK: bitcast i16 %{{.*}} to <16 x i1> + return _mm512_mask_cmpneq_epu32_mask(_mm512_kunpackb(_mm512_cmpneq_epu32_mask(__A, __B), + _mm512_cmpneq_epu32_mask(__C, __D)), + __E, __F); } __mmask16 test_mm512_kxnor(__mmask16 __A, __mmask16 __B) { Index: test/CodeGen/avx512bw-builtins.c === --- test/CodeGen/avx512bw-builtins.c +++ test/CodeGen/avx512bw-builtins.c @@ -1626,16 +1626,26 @@ return _mm512_maskz_set1_epi8(__M, __A); } -__mmask64 test_mm512_kunpackd(__mmask64 __A, __mmask64 __B) { +__mmask64 test_mm512_kunpackd(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { // CHECK-LABEL: @test_mm512_kunpackd - // CHECK: @llvm.x86.avx512.kunpck.dq - return _mm512_kunpackd(__A, __B); + // CHECK: bitcast <64 x i1> %{{.*}} to i64 + // CHECK: bitcast <64 x i1> %{{.*}} to i64 + // CHECK: and i64 %{{.*}}, 4294967295 + // CHECK: shl i64 %{{.*}}, 32 + // CHECK: or i64 %{{.*}}, %{{.*}} + // CHECK: bitcast i64 %{{.*}} to <64 x i1> + return _mm512_mask_cmpneq_epu8_mask(_mm512_kunpackd(_mm512_cmpneq_epu8_mask(__B, __A),_mm512_cmpneq_epu8_mask(__C, __D)), __E, __F); } -__mmask32 test_mm512_kunpackw(__mmask32 __A, __mmask32 __B) { +__mmask32 test_mm512_kunpackw(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { // CHECK-LABEL: @test_mm512_kunpackw - // CHECK: @llvm.x86.avx512.kunpck.wd - return _mm512_kunpackw(__A, __B); + // CHECK: bitcast <32 x i1> %{{.*}} to i32 + // CHECK: bitcast <32 x i1> %{{.*}} to i32 + // CHECK: and i32 %{{.*}}, 65535 + // CHECK: shl i32 %{{.*}}, 16 + // CHECK: or i32 %{{.*}}, %{{.*}} + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + return _mm512_mask_cmpneq_epu16_mask(_mm512_kunpackw(_mm512_cmpneq_epu16_mask(__B, __A),_mm512_cmpneq_epu16_mask(__C, __D)), __E, __F); } __m512i test_mm512_mask_loadu_epi16(__m512i __W, __mmask32 __U, void const *__P) { Index: lib/Headers/avx512fintrin.h === --- lib/Headers/avx512fintrin.h +++ lib/Headers/avx512fintrin.h @@ -8823,7 +8823,7 @@ static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm512_kunpackb (__mmask16 __A, __mmask16 __B) { - return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); + return (__mmask16) (( __A & 0xFF) | ( __B << 8)); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS Index: lib/Headers/avx512bwintrin.h === --- lib/Headers/avx512bwintrin.h +++ lib/Headers/avx512bwintrin.h @@ -1854,15 +1854,13 @@ static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_kunpackd (__mmask64 __A, __mmask64 __B) { - return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, -(__mmask64) __B); + return (__mmask64) (( __A & 0x) | ( __B << 32)); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_kunpackw (__mmask32 __A, 
__mmask32 __B) { - return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, -(__mmask32) __B); +return (__mmask32) (( __A & 0xFFFF) | ( __B << 16)); } static __inline__ __m512i __DEFAULT_FN_ATTRS
[PATCH] D38672: [X86][AVX512] lowering shuffle f/i intrinsic - clang part
This revision was automatically updated to reflect the committed changes. Closed by commit rL318025: [x86][AVX512] Lowering shuffle i/f intrinsics to LLVM IR (authored by jina.nahias). Changed prior to commit: https://reviews.llvm.org/D38672?vs=121908&id=122619#toc Repository: rL LLVM https://reviews.llvm.org/D38672 Files: cfe/trunk/lib/Headers/avx512fintrin.h cfe/trunk/lib/Headers/avx512vlintrin.h cfe/trunk/test/CodeGen/avx512f-builtins.c cfe/trunk/test/CodeGen/avx512vl-builtins.c Index: cfe/trunk/test/CodeGen/avx512f-builtins.c === --- cfe/trunk/test/CodeGen/avx512f-builtins.c +++ cfe/trunk/test/CodeGen/avx512f-builtins.c @@ -4484,73 +4484,81 @@ __m512 test_mm512_shuffle_f32x4(__m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_shuffle_f32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.f32x4 + // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> return _mm512_shuffle_f32x4(__A, __B, 4); } __m512 test_mm512_mask_shuffle_f32x4(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_mask_shuffle_f32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.f32x4 + // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_shuffle_f32x4(__W, __U, __A, __B, 4); } __m512 test_mm512_maskz_shuffle_f32x4(__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_maskz_shuffle_f32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.f32x4 + // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_shuffle_f32x4(__U, __A, __B, 4); } __m512d test_mm512_shuffle_f64x2(__m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_shuffle_f64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.f64x2 + // CHECK: shufflevector <8 x double> %0, <8 x double> %{{.*}}, <8 x i32> return _mm512_shuffle_f64x2(__A, __B, 4); } __m512d test_mm512_mask_shuffle_f64x2(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_mask_shuffle_f64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.f64x2 + // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_shuffle_f64x2(__W, __U, __A, __B, 4); } __m512d test_mm512_maskz_shuffle_f64x2(__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_maskz_shuffle_f64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.f64x2 + // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_shuffle_f64x2(__U, __A, __B, 4); } __m512i test_mm512_shuffle_i32x4(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_shuffle_i32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.i32x4 + // CHECK: shufflevector <8 x i64> %0, <8 x i64> %{{.*}}, <8 x i32> return _mm512_shuffle_i32x4(__A, __B, 4); } __m512i test_mm512_mask_shuffle_i32x4(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_shuffle_i32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.i32x4 + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_shuffle_i32x4(__W, __U, __A, __B, 4); } __m512i test_mm512_maskz_shuffle_i32x4(__mmask16 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: 
@test_mm512_maskz_shuffle_i32x4 - // CHECK: @llvm.x86.avx512.mask.shuf.i32x4 + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> + // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_shuffle_i32x4(__U, __A, __B, 4); } __m512i test_mm512_shuffle_i64x2(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_shuffle_i64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.i64x2 + // CHECK: shufflevector <8 x i64> %0, <8 x i64> %{{.*}}, <8 x i32> return _mm512_shuffle_i64x2(__A, __B, 4); } __m512i test_mm512_mask_shuffle_i64x2(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_shuffle_i64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.i64x2 + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> + // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_shuffle_i64x2(__W, __U, __A, __B, 4); } __m512i test_mm512_maskz_shuffle_i64x2(__mmask8 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_shuffle_i64x2 - // CHECK: @llvm.x86.avx512.mask.shuf.i64x2 + // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> + // CHECK: select <8 x i1> %