[PATCH] D38231: fixing a bug in mask[z]_set1

2017-09-25 Thread jina via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rL314102: fixing a bug in mask[z]_set1 intrinsic (authored by 
jina.nahias).

Changed prior to commit:
  https://reviews.llvm.org/D38231?vs=116513&id=116545#toc

Repository:
  rL LLVM

https://reviews.llvm.org/D38231

Files:
  cfe/trunk/lib/Headers/avx512vlintrin.h
  cfe/trunk/test/CodeGen/avx512vl-builtins.c
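
For context (an added illustration, not part of the original message): the bug was that the masked 64-bit broadcast was built with _mm_set1_epi8, which truncates the 64-bit scalar to 8 bits and broadcasts it per byte; the fix switches it to _mm_set1_epi64x. A minimal usage sketch of the corrected intrinsic (the demo function and values are hypothetical), assuming AVX512F+AVX512VL on a 64-bit target:

  #include <immintrin.h>

  __m128i demo(void) {
    __m128i old = _mm_set1_epi64x(7);           /* both 64-bit elements == 7      */
    /* Only bit 0 of the mask is set, so element 0 takes the broadcast value
       and element 1 keeps the value from old.                                    */
    return _mm_mask_set1_epi64(old, 0x1, 42);   /* element 0 == 42, element 1 == 7 */
  }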


Index: cfe/trunk/lib/Headers/avx512vlintrin.h
===
--- cfe/trunk/lib/Headers/avx512vlintrin.h
+++ cfe/trunk/lib/Headers/avx512vlintrin.h
@@ -5761,15 +5761,15 @@
 _mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A)
 {
   return (__m128i) __builtin_ia32_selectq_128(__M,
-  (__v2di) _mm_set1_epi8(__A),
+  (__v2di) _mm_set1_epi64x(__A),
   (__v2di) __O);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_set1_epi64 (__mmask8 __M, long long __A)
 {
   return (__m128i) __builtin_ia32_selectq_128(__M,
-  (__v2di) _mm_set1_epi8(__A),
+  (__v2di) _mm_set1_epi64x(__A),
   (__v2di) _mm_setzero_si128());
 }
 
Index: cfe/trunk/test/CodeGen/avx512vl-builtins.c
===
--- cfe/trunk/test/CodeGen/avx512vl-builtins.c
+++ cfe/trunk/test/CodeGen/avx512vl-builtins.c
@@ -4563,45 +4563,17 @@
 #ifdef __x86_64__
 __m128i test_mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) {
   // CHECK-LABEL: @test_mm_mask_set1_epi64
-  // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
   // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> 
   // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_mask_set1_epi64(__O, __M, __A); 
 }
 
 __m128i test_mm_maskz_set1_epi64(__mmask8 __M, long long __A) {
   // CHECK-LABEL: @test_mm_maskz_set1_epi64
-  // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
   // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> 
   // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_maskz_set1_epi64(__M, __A); 



[PATCH] D38672: lowering shuffle f/i intrinsic - clang part

2017-10-08 Thread jina via Phabricator via cfe-commits
jina.nahias created this revision.

https://reviews.llvm.org/D38672

Files:
  lib/Headers/avx512fintrin.h
  lib/Headers/avx512vlintrin.h
  test/CodeGen/avx512f-builtins.c
  test/CodeGen/avx512vl-builtins.c
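
For reference (an added illustration, not part of the patch): _mm256_shuffle_f64x2(A, B, imm) builds its result from two 128-bit lanes, the low lane selected from A by bit 0 of imm and the high lane selected from B by bit 1, which is why it can be lowered to a plain shufflevector. A minimal sketch (the demo function is hypothetical), assuming AVX512F+AVX512VL:

  #include <immintrin.h>

  __m256d demo(__m256d a, __m256d b) {
    /* imm = 3: low 128 bits  = upper lane of a (elements 2,3),
                high 128 bits = upper lane of b (elements 2,3 of b).
       Over the concatenation <a, b> that is the shuffle mask <2, 3, 6, 7>. */
    return _mm256_shuffle_f64x2(a, b, 3);
  }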

Index: test/CodeGen/avx512vl-builtins.c
===
--- test/CodeGen/avx512vl-builtins.c
+++ test/CodeGen/avx512vl-builtins.c
@@ -5630,73 +5630,85 @@
 }
 __m256 test_mm256_shuffle_f32x4(__m256 __A, __m256 __B) {
   // CHECK-LABEL: @test_mm256_shuffle_f32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
   return _mm256_shuffle_f32x4(__A, __B, 3); 
 }
 
 __m256 test_mm256_mask_shuffle_f32x4(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
   // CHECK-LABEL: @test_mm256_mask_shuffle_f32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask_shuffle_f32x4(__W, __U, __A, __B, 3); 
 }
 
 __m256 test_mm256_maskz_shuffle_f32x4(__mmask8 __U, __m256 __A, __m256 __B) {
   // CHECK-LABEL: @test_mm256_maskz_shuffle_f32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_maskz_shuffle_f32x4(__U, __A, __B, 3); 
 }
 
 __m256d test_mm256_shuffle_f64x2(__m256d __A, __m256d __B) {
   // CHECK-LABEL: @test_mm256_shuffle_f64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
   return _mm256_shuffle_f64x2(__A, __B, 3); 
 }
 
 __m256d test_mm256_mask_shuffle_f64x2(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
   // CHECK-LABEL: @test_mm256_mask_shuffle_f64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> 
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_shuffle_f64x2(__W, __U, __A, __B, 3); 
 }
 
 __m256d test_mm256_maskz_shuffle_f64x2(__mmask8 __U, __m256d __A, __m256d __B) {
   // CHECK-LABEL: @test_mm256_maskz_shuffle_f64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
+  // CHECK: shufflevector <8 x i1> %{{.*}}1, <8 x i1> %{{.*}}, <4 x i32> 
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_shuffle_f64x2(__U, __A, __B, 3); 
 }
 
 __m256i test_mm256_shuffle_i32x4(__m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_shuffle_i32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> 
   return _mm256_shuffle_i32x4(__A, __B, 3); 
 }
 
 __m256i test_mm256_mask_shuffle_i32x4(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_shuffle_i32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_shuffle_i32x4(__W, __U, __A, __B, 3); 
 }
 
 __m256i test_mm256_maskz_shuffle_i32x4(__mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_shuffle_i32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_shuffle_i32x4(__U, __A, __B, 3); 
 }
 
 __m256i test_mm256_shuffle_i64x2(__m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_shuffle_i64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> 
   return _mm256_shuffle_i64x2(__A, __B, 3); 
 }
 
 __m256i test_mm256_mask_shuffle_i64x2(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_shuffle_i64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> 
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_mask_shuffle_i64x2(__W, __U, __A, __B, 3); 
 }
 
 __m256i test_mm256_maskz_shuffle_i64x2(__mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_shuffle_i64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> 
+  // CHECK: select <4 x i1> %{{.*}},

[PATCH] D38672: [X86][AVX512] lowering shuffle f/i intrinsic - clang part

2017-10-08 Thread jina via Phabricator via cfe-commits
jina.nahias updated this revision to Diff 118166.

https://reviews.llvm.org/D38672

Files:
  lib/Headers/avx512fintrin.h
  lib/Headers/avx512vlintrin.h
  test/CodeGen/avx512f-builtins.c
  test/CodeGen/avx512vl-builtins.c

Index: test/CodeGen/avx512vl-builtins.c
===
--- test/CodeGen/avx512vl-builtins.c
+++ test/CodeGen/avx512vl-builtins.c
@@ -5630,73 +5630,85 @@
 }
 __m256 test_mm256_shuffle_f32x4(__m256 __A, __m256 __B) {
   // CHECK-LABEL: @test_mm256_shuffle_f32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> 
   return _mm256_shuffle_f32x4(__A, __B, 3); 
 }
 
 __m256 test_mm256_mask_shuffle_f32x4(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
   // CHECK-LABEL: @test_mm256_mask_shuffle_f32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> 
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask_shuffle_f32x4(__W, __U, __A, __B, 3); 
 }
 
 __m256 test_mm256_maskz_shuffle_f32x4(__mmask8 __U, __m256 __A, __m256 __B) {
   // CHECK-LABEL: @test_mm256_maskz_shuffle_f32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> 
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_maskz_shuffle_f32x4(__U, __A, __B, 3); 
 }
 
 __m256d test_mm256_shuffle_f64x2(__m256d __A, __m256d __B) {
   // CHECK-LABEL: @test_mm256_shuffle_f64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
   return _mm256_shuffle_f64x2(__A, __B, 3); 
 }
 
 __m256d test_mm256_mask_shuffle_f64x2(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
   // CHECK-LABEL: @test_mm256_mask_shuffle_f64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> 
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_shuffle_f64x2(__W, __U, __A, __B, 3); 
 }
 
 __m256d test_mm256_maskz_shuffle_f64x2(__mmask8 __U, __m256d __A, __m256d __B) {
   // CHECK-LABEL: @test_mm256_maskz_shuffle_f64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> 
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_shuffle_f64x2(__U, __A, __B, 3); 
 }
 
 __m256i test_mm256_shuffle_i32x4(__m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_shuffle_i32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> 
   return _mm256_shuffle_i32x4(__A, __B, 3); 
 }
 
 __m256i test_mm256_mask_shuffle_i32x4(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_shuffle_i32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> 
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_shuffle_i32x4(__W, __U, __A, __B, 3); 
 }
 
 __m256i test_mm256_maskz_shuffle_i32x4(__mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_shuffle_i32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> 
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_shuffle_i32x4(__U, __A, __B, 3); 
 }
 
 __m256i test_mm256_shuffle_i64x2(__m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_shuffle_i64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> 
   return _mm256_shuffle_i64x2(__A, __B, 3); 
 }
 
 __m256i test_mm256_mask_shuffle_i64x2(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_shuffle_i64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> 
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> 
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_mask_shuffle_i64x2(__W, __U, __A, __B, 3); 
 }
 
 __m256i test_mm256_maskz_shuffle_i64x2(__mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_shuffle_i64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> 
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> 
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*

[PATCH] D38683: [X86][AVX512] lowering broadcastm intrinsic - clang part

2017-10-09 Thread jina via Phabricator via cfe-commits
jina.nahias created this revision.

https://reviews.llvm.org/D38683

Files:
  include/clang/Basic/BuiltinsX86.def
  lib/Headers/avx512cdintrin.h
  lib/Headers/avx512vlcdintrin.h
  test/CodeGen/avx512cdintrin.c
  test/CodeGen/avx512vlcd-builtins.c
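
For reference (an added illustration, not part of the patch): _mm_broadcastmb_epi64(k) zero-extends the 8-bit mask k to 64 bits and broadcasts that integer to every 64-bit element, which matches the zext + insertelement pattern checked in the tests below. A minimal sketch (the demo function is hypothetical), assuming AVX512F+AVX512CD+AVX512VL:

  #include <immintrin.h>

  __m128i demo(__m128i a, __m128i b) {
    __mmask8 k = _mm_cmpeq_epi32_mask(a, b);  /* low 4 bits hold the compare result */
    return _mm_broadcastmb_epi64(k);          /* both i64 elements == (unsigned)k   */
  }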

Index: test/CodeGen/avx512vlcd-builtins.c
===
--- test/CodeGen/avx512vlcd-builtins.c
+++ test/CodeGen/avx512vlcd-builtins.c
@@ -3,28 +3,56 @@
 
 #include <immintrin.h>
 
-__m128i test_mm_broadcastmb_epi64(__mmask8 __A) {
+__m128i test_mm_broadcastmb_epi64(__m128i a,__m128i b) {
   // CHECK-LABEL: @test_mm_broadcastmb_epi64
-  // CHECK: @llvm.x86.avx512.broadcastmb.128
-  return _mm_broadcastmb_epi64(__A); 
+  // CHECK: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> 
+  // CHECK: bitcast <8 x i1> %{{.*}} to i8
+  // CHECK: zext i8 %{{.*}} to i64
+  // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
+  return _mm_broadcastmb_epi64(_mm_cmpeq_epi32_mask (a, b)); 
 }
 
-__m256i test_mm256_broadcastmb_epi64(__mmask8 __A) {
+__m256i test_mm256_broadcastmb_epi64(__m256i a, __m256i b) {
   // CHECK-LABEL: @test_mm256_broadcastmb_epi64
-  // CHECK: @llvm.x86.avx512.broadcastmb.256
-  return _mm256_broadcastmb_epi64(__A); 
-}
-
-__m128i test_mm_broadcastmw_epi32(__mmask16 __A) {
+  // CHECK: icmp eq <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> 
+  // CHECK: bitcast <8 x i1> %{{.*}} to i8
+  // CHECK: zext i8 %{{.*}} to i64
+  // CHECK: insertelement <4 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 1
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 2
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 3
+  return _mm256_broadcastmb_epi64(_mm256_cmpeq_epi64_mask ( a, b)); 
+}
+
+__m128i test_mm_broadcastmw_epi32(__m512i a, __m512i b) {
   // CHECK-LABEL: @test_mm_broadcastmw_epi32
-  // CHECK: @llvm.x86.avx512.broadcastmw.128
-  return _mm_broadcastmw_epi32(__A); 
+  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: bitcast <16 x i1> %{{.*}} to i16
+  // CHECK: zext i16 %{{.*}} to i32
+  // CHECK: insertelement <4 x i32> undef, i32 %{{.*}}, i32 0
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 1
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 2
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 3
+  return _mm_broadcastmw_epi32(_mm512_cmpeq_epi32_mask ( a, b));
 }
 
-__m256i test_mm256_broadcastmw_epi32(__mmask16 __A) {
+__m256i test_mm256_broadcastmw_epi32(__m512i a, __m512i b) {
   // CHECK-LABEL: @test_mm256_broadcastmw_epi32
-  // CHECK: @llvm.x86.avx512.broadcastmw.256
-  return _mm256_broadcastmw_epi32(__A); 
+  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: bitcast <16 x i1> %{{.*}} to i16
+  // CHECK: zext i16 %{{.*}} to i32
+  // CHECK: insertelement <8 x i32> undef, i32 %{{.*}}, i32 0
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 1
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 2
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 3
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 4
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 5
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 6
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 7
+  return _mm256_broadcastmw_epi32(_mm512_cmpeq_epi32_mask ( a, b)); 
 }
 
 __m128i test_mm_conflict_epi64(__m128i __A) {
Index: test/CodeGen/avx512cdintrin.c
===
--- test/CodeGen/avx512cdintrin.c
+++ test/CodeGen/avx512cdintrin.c
@@ -68,14 +68,40 @@
   return _mm512_maskz_lzcnt_epi64(__U,__A); 
 }
 
-__m512i test_mm512_broadcastmb_epi64(__mmask8 __A) {
+__m512i test_mm512_broadcastmb_epi64(__m512i a, __m512i b) {
   // CHECK-LABEL: @test_mm512_broadcastmb_epi64
-  // CHECK: @llvm.x86.avx512.broadcastmb.512
-  return _mm512_broadcastmb_epi64(__A); 
+  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: zext i8 %{{.*}} to i64
+  // CHECK: insertelement <8 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 1
+  // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 2
+  // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 3
+  // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 4
+  // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 5
+  // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 6
+  // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 7
+  return _mm512_broadcastmb_epi64(_mm512_cmpeq_epu64_mask ( a, b)); 
 }
 
-__m512i test_mm512_broadcastmw_epi32(__mmask16 __A) {
+__m512i test_mm512_broadcastmw_epi32(__m512i a, __m512i b) {
   // CHECK-LABEL: @test_mm512_broadcastmw_epi32
-  // CHECK: @

[PATCH] D38683: [X86][AVX512] lowering broadcastm intrinsic - clang part

2017-10-23 Thread jina via Phabricator via cfe-commits
jina.nahias added inline comments.



Comment at: test/CodeGen/avx512cdintrin.c:106
+  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}
+  return _mm512_broadcastmw_epi32(_mm512_cmpeq_epi32_mask ( a, b)); 
 }

RKSimon wrote:
> Any reason why you can't use the actual insertion indices instead of regexps?
Yes. The cmpeq intrinsic returns a mask type; if we don't pass a mask type to
the broadcastm intrinsic it would do a regular broadcast (not a mask
broadcast).
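
In other words (an added illustration, not part of the review): the test feeds the intrinsic a value that genuinely has mask type, produced by a compare, rather than an arbitrary integer parameter. A sketch of the intended usage (the demo function is hypothetical), assuming AVX512F+AVX512CD+AVX512VL:

  #include <immintrin.h>

  __m256i demo(__m512i a, __m512i b) {
    /* The compare yields a __mmask16; broadcastmw then replicates that mask
       value (zero-extended to 32 bits) into every 32-bit element.           */
    __mmask16 k = _mm512_cmpeq_epi32_mask(a, b);
    return _mm256_broadcastmw_epi32(k);
  }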


https://reviews.llvm.org/D38683





[PATCH] D37668: [X86][intrinsics] lower _mm[256|512]_mask[z]_set1_epi[8|16|32|64] intrinsic to IR

2017-09-10 Thread jina via Phabricator via cfe-commits
jina.nahias created this revision.

This is the Clang part; the LLVM part is
https://reviews.llvm.org/differential/diff/114515/


https://reviews.llvm.org/D37668

Files:
  lib/Headers/avx512bwintrin.h
  lib/Headers/avx512fintrin.h
  lib/Headers/avx512vlbwintrin.h
  lib/Headers/avx512vlintrin.h
  test/CodeGen/avx512bw-builtins.c
  test/CodeGen/avx512f-builtins.c
  test/CodeGen/avx512vl-builtins.c
  test/CodeGen/avx512vlbw-builtins.c
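
For context (an added illustration, not from the patch text): the change replaces the target-specific _gpr_mask builtins with a broadcast plus a per-element select, so these intrinsics lower to generic IR. A minimal sketch of the behaviour being tested (the demo function is hypothetical), assuming AVX512F+AVX512VL:

  #include <immintrin.h>

  __m128i demo(__m128i old, __mmask8 m, int v) {
    /* Per element i (i = 0..3): result[i] = ((m >> i) & 1) ? v : old[i].
       After this patch Clang emits this as an insertelement chain (the
       broadcast) followed by a select on the mask.                        */
    return _mm_mask_set1_epi32(old, m, v);
  }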

Index: test/CodeGen/avx512vlbw-builtins.c
===
--- test/CodeGen/avx512vlbw-builtins.c
+++ test/CodeGen/avx512vlbw-builtins.c
@@ -2602,28 +2602,196 @@
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_broadcastw_epi16(__M, __A);
 }
+  __m128i test_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A){
+// CHECK-LABEL: @test_mm_mask_set1_epi8
+// CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+	return _mm_mask_set1_epi8(__O, __M, __A);
+	}
+
+  __m128i test_mm_maskz_set1_epi8 ( __mmask16 __M, char __A){
+// CHECK-LABEL: @test_mm_maskz_set1_epi8
+	// CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+	return _mm_maskz_set1_epi8( __M, __A);
+	}
+
+__m256i test_mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) {
+  // CHECK-LABEL: @test_mm256_mask_set1_epi8
+// CHECK: insertelement <32 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 16
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 17
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 18
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 19
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 20
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 21
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 22
+  // CHECK: in

[PATCH] D37668: [X86][intrinsics] lower _mm[256|512]_mask[z]_set1_epi[8|16|32|64] intrinsic to IR

2017-09-11 Thread jina via Phabricator via cfe-commits
jina.nahias updated this revision to Diff 114594.
jina.nahias added a comment.

Deleted entries from include/clang/Basic/BuiltinsX86.def and
include/clang/Basic/BuiltinsX86_64.def.


https://reviews.llvm.org/D37668

Files:
  include/clang/Basic/BuiltinsX86.def
  include/clang/Basic/BuiltinsX86_64.def


Index: include/clang/Basic/BuiltinsX86_64.def
===
--- include/clang/Basic/BuiltinsX86_64.def
+++ include/clang/Basic/BuiltinsX86_64.def
@@ -71,9 +71,6 @@
 TARGET_BUILTIN(__builtin_ia32_bextri_u64, "ULLiULLiIULLi", "", "tbm")
 TARGET_BUILTIN(__builtin_ia32_lwpins64, "UcULLiUiUi", "", "lwp")
 TARGET_BUILTIN(__builtin_ia32_lwpval64, "vULLiUiUi", "", "lwp")
-TARGET_BUILTIN(__builtin_ia32_pbroadcastq512_gpr_mask, "V8LLiLLiV8LLiUc", "", 
"avx512f")
-TARGET_BUILTIN(__builtin_ia32_pbroadcastq128_gpr_mask, 
"V2LLiULLiV2LLiUc","","avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pbroadcastq256_gpr_mask, 
"V4LLiULLiV4LLiUc","","avx512vl")
 TARGET_BUILTIN(__builtin_ia32_vcvtsd2si64, "LLiV2dIi","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_vcvtsd2usi64, "ULLiV2dIi","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_vcvtss2si64, "LLiV4fIi","","avx512f")
Index: include/clang/Basic/BuiltinsX86.def
===
--- include/clang/Basic/BuiltinsX86.def
+++ include/clang/Basic/BuiltinsX86.def
@@ -977,7 +977,6 @@
 TARGET_BUILTIN(__builtin_ia32_pmuludq512, "V8LLiV16iV16i", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_ptestmd512, "UsV16iV16iUs", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_ptestmq512, "UcV8LLiV8LLiUc", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_pbroadcastd512_gpr_mask, "V16iiV16iUs", "", 
"avx512f")
 TARGET_BUILTIN(__builtin_ia32_pbroadcastq512_mem_mask, "V8LLiLLiV8LLiUc", "", 
"avx512f")
 TARGET_BUILTIN(__builtin_ia32_loaddqusi512_mask, "V16iiC*V16iUs", "", 
"avx512f")
 TARGET_BUILTIN(__builtin_ia32_loaddqudi512_mask, "V8LLiLLiC*V8LLiUc", "", 
"avx512f")
@@ -1381,11 +1380,6 @@
 TARGET_BUILTIN(__builtin_ia32_movdqa64load256_mask, 
"V4LLiV4LLiC*V4LLiUc","","avx512vl")
 TARGET_BUILTIN(__builtin_ia32_movdqa64store128_mask, 
"vV2LLi*V2LLiUc","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_movdqa64store256_mask, 
"vV4LLi*V4LLiUc","","avx512f")
-TARGET_BUILTIN(__builtin_ia32_pbroadcastb512_gpr_mask, 
"V64ccV64cULLi","","avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pbroadcastb128_gpr_mask, 
"V16ccV16cUs","","avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pbroadcastb256_gpr_mask, 
"V32ccV32cUi","","avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pbroadcastd128_gpr_mask, 
"V4iiV4iUc","","avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pbroadcastd256_gpr_mask, 
"V8iiV8iUc","","avx512vl")
 TARGET_BUILTIN(__builtin_ia32_vpmadd52huq512_mask, 
"V8LLiV8LLiV8LLiV8LLiUc","","avx512ifma")
 TARGET_BUILTIN(__builtin_ia32_vpmadd52huq512_maskz, 
"V8LLiV8LLiV8LLiV8LLiUc","","avx512ifma")
 TARGET_BUILTIN(__builtin_ia32_vpmadd52luq512_mask, 
"V8LLiV8LLiV8LLiV8LLiUc","","avx512ifma")
@@ -1596,9 +1590,6 @@
 TARGET_BUILTIN(__builtin_ia32_broadcastmb256, "V4LLiUc","","avx512cd,avx512vl")
 TARGET_BUILTIN(__builtin_ia32_broadcastmw128, "V4iUs","","avx512cd,avx512vl")
 TARGET_BUILTIN(__builtin_ia32_broadcastmw256, "V8iUs","","avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pbroadcastw512_gpr_mask, 
"V32shV32sUi","","avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pbroadcastw256_gpr_mask, 
"V16shV16sUs","","avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pbroadcastw128_gpr_mask, 
"V8ssV8sUc","","avx512bw,avx512vl")
 TARGET_BUILTIN(__builtin_ia32_pmovsdb512_mask, "V16cV16iV16cUs","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_pmovsdb512mem_mask, "vV16c*V16iUs","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_pmovswb512mem_mask, "vV32c*V32sUi","","avx512bw")



[PATCH] D37668: [X86][intrinsics] lower _mm[256|512]_mask[z]_set1_epi[8|16|32|64] intrinsic to IR

2017-09-11 Thread jina via Phabricator via cfe-commits
jina.nahias updated this revision to Diff 114765.

https://reviews.llvm.org/D37668

Files:
  include/clang/Basic/BuiltinsX86.def
  include/clang/Basic/BuiltinsX86_64.def
  lib/Headers/avx512bwintrin.h
  lib/Headers/avx512fintrin.h
  lib/Headers/avx512vlbwintrin.h
  lib/Headers/avx512vlintrin.h
  test/CodeGen/avx512bw-builtins.c
  test/CodeGen/avx512f-builtins.c
  test/CodeGen/avx512vl-builtins.c
  test/CodeGen/avx512vlbw-builtins.c

Index: test/CodeGen/avx512vlbw-builtins.c
===
--- test/CodeGen/avx512vlbw-builtins.c
+++ test/CodeGen/avx512vlbw-builtins.c
@@ -2602,28 +2602,196 @@
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_broadcastw_epi16(__M, __A);
 }
+  __m128i test_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A){
+// CHECK-LABEL: @test_mm_mask_set1_epi8
+// CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+	return _mm_mask_set1_epi8(__O, __M, __A);
+	}
+
+  __m128i test_mm_maskz_set1_epi8 ( __mmask16 __M, char __A){
+// CHECK-LABEL: @test_mm_maskz_set1_epi8
+	// CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+// CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+	return _mm_maskz_set1_epi8( __M, __A);
+	}
+
+__m256i test_mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) {
+  // CHECK-LABEL: @test_mm256_mask_set1_epi8
+// CHECK: insertelement <32 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 16
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 17
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 18
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 19
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 20
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 21
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 22
+  // CHECK: 

[PATCH] D37668: [X86][intrinsics] lower _mm[256|512]_mask[z]_set1_epi[8|16|32|64] intrinsic to IR

2017-09-12 Thread jina via Phabricator via cfe-commits
jina.nahias updated this revision to Diff 114836.

https://reviews.llvm.org/D37668

Files:
  include/clang/Basic/BuiltinsX86.def
  include/clang/Basic/BuiltinsX86_64.def
  lib/Headers/avx512bwintrin.h
  lib/Headers/avx512fintrin.h
  lib/Headers/avx512vlbwintrin.h
  lib/Headers/avx512vlintrin.h
  test/CodeGen/avx512bw-builtins.c
  test/CodeGen/avx512f-builtins.c
  test/CodeGen/avx512vl-builtins.c
  test/CodeGen/avx512vlbw-builtins.c

Index: test/CodeGen/avx512vlbw-builtins.c
===
--- test/CodeGen/avx512vlbw-builtins.c
+++ test/CodeGen/avx512vlbw-builtins.c
@@ -2602,28 +2602,195 @@
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_broadcastw_epi16(__M, __A);
 }
+__m128i test_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A){
+  // CHECK-LABEL: @test_mm_mask_set1_epi8
+  // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  return _mm_mask_set1_epi8(__O, __M, __A);
+}
+__m128i test_mm_maskz_set1_epi8 ( __mmask16 __M, char __A){
+  // CHECK-LABEL: @test_mm_maskz_set1_epi8
+  // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  return _mm_maskz_set1_epi8( __M, __A);
+}
+
+__m256i test_mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) {
+  // CHECK-LABEL: @test_mm256_mask_set1_epi8
+  // CHECK: insertelement <32 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 16
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 17
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 18
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 19
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 20
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 21
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 22
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 23
+  // CHECK: insertelemen

[PATCH] D37668: [X86][intrinsics] lower _mm[256|512]_mask[z]_set1_epi[8|16|32|64] intrinsic to IR

2017-09-12 Thread jina via Phabricator via cfe-commits
jina.nahias updated this revision to Diff 114978.

https://reviews.llvm.org/D37668

Files:
  include/clang/Basic/BuiltinsX86.def
  include/clang/Basic/BuiltinsX86_64.def
  lib/Headers/avx512bwintrin.h
  lib/Headers/avx512fintrin.h
  lib/Headers/avx512vlbwintrin.h
  lib/Headers/avx512vlintrin.h
  test/CodeGen/avx512bw-builtins.c
  test/CodeGen/avx512f-builtins.c
  test/CodeGen/avx512vl-builtins.c
  test/CodeGen/avx512vlbw-builtins.c

Index: test/CodeGen/avx512vlbw-builtins.c
===
--- test/CodeGen/avx512vlbw-builtins.c
+++ test/CodeGen/avx512vlbw-builtins.c
@@ -2602,28 +2602,195 @@
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_broadcastw_epi16(__M, __A);
 }
+__m128i test_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A){
+  // CHECK-LABEL: @test_mm_mask_set1_epi8
+  // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  return _mm_mask_set1_epi8(__O, __M, __A);
+}
+__m128i test_mm_maskz_set1_epi8 ( __mmask16 __M, char __A){
+  // CHECK-LABEL: @test_mm_maskz_set1_epi8
+  // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  return _mm_maskz_set1_epi8( __M, __A);
+}
+
+__m256i test_mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) {
+  // CHECK-LABEL: @test_mm256_mask_set1_epi8
+  // CHECK: insertelement <32 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 16
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 17
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 18
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 19
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 20
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 21
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 22
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 23
+  // CHECK: insertelemen

[PATCH] D37668: [X86][intrinsics] lower _mm[256|512]_mask[z]_set1_epi[8|16|32|64] intrinsic to IR

2017-09-13 Thread jina via Phabricator via cfe-commits
jina.nahias added inline comments.



Comment at: include/clang/Basic/BuiltinsX86.def:981
-TARGET_BUILTIN(__builtin_ia32_pbroadcastd512_gpr_mask, "V16iiV16iUs", "", 
"avx512f")
 TARGET_BUILTIN(__builtin_ia32_pbroadcastq512_mem_mask, "V8LLiLLiV8LLiUc", "", 
"avx512f")
 TARGET_BUILTIN(__builtin_ia32_loaddqusi512_mask, "V16iiC*V16iUs", "", 
"avx512f")

craig.topper wrote:
> I think your patch removed the only use of
> __builtin_ia32_pbroadcastq512_mem_mask, right? Does your change work properly
> in 32-bit mode?
Yes to both questions. I have deleted __builtin_ia32_pbroadcastq512_mem_mask;
you can see it in the new update.


https://reviews.llvm.org/D37668





[PATCH] D37668: [X86][intrinsics] lower _mm[256|512]_mask[z]_set1_epi[8|16|32|64] intrinsic to IR

2017-09-18 Thread jina via Phabricator via cfe-commits
jina.nahias updated this revision to Diff 115622.
jina.nahias added a comment.

Rebased on @craig.topper's commit.


https://reviews.llvm.org/D37668

Files:
  include/clang/Basic/BuiltinsX86.def
  include/clang/Basic/BuiltinsX86_64.def
  lib/Headers/avx512bwintrin.h
  lib/Headers/avx512fintrin.h
  lib/Headers/avx512vlbwintrin.h
  lib/Headers/avx512vlintrin.h
  test/CodeGen/avx512bw-builtins.c
  test/CodeGen/avx512f-builtins.c
  test/CodeGen/avx512vl-builtins.c
  test/CodeGen/avx512vlbw-builtins.c

Index: test/CodeGen/avx512vlbw-builtins.c
===
--- test/CodeGen/avx512vlbw-builtins.c
+++ test/CodeGen/avx512vlbw-builtins.c
@@ -2670,28 +2670,195 @@
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_broadcastw_epi16(__M, __A);
 }
+__m128i test_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A){
+  // CHECK-LABEL: @test_mm_mask_set1_epi8
+  // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  return _mm_mask_set1_epi8(__O, __M, __A);
+}
+__m128i test_mm_maskz_set1_epi8 ( __mmask16 __M, char __A){
+  // CHECK-LABEL: @test_mm_maskz_set1_epi8
+  // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  return _mm_maskz_set1_epi8( __M, __A);
+}
+
+__m256i test_mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) {
+  // CHECK-LABEL: @test_mm256_mask_set1_epi8
+  // CHECK: insertelement <32 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 16
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 17
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 18
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 19
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 20
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 21
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 22
+  // CHECK: insertelement

[PATCH] D37668: [X86][intrinsics] lower _mm[256|512]_mask[z]_set1_epi[8|16|32|64] intrinsic to IR

2017-09-18 Thread jina via Phabricator via cfe-commits
jina.nahias added inline comments.



Comment at: lib/Headers/avx512fintrin.h:9742
 
 #ifdef __x86_64__
 static __inline__ __m512i __DEFAULT_FN_ATTRS

craig.topper wrote:
> Please remove the #ifdef __x86_64__ from this. It should work in 32-bit mode
> as well.
The currently generated code for 32-bit is not optimal; 32-bit needs some more
work, which will come in a follow-up patch.
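
For background (an added illustration and an assumption on my part, not from the review): on a 32-bit target there is no 64-bit GPR, so the 64-bit scalar first has to be assembled from two 32-bit halves before it can be broadcast, roughly:

  #include <immintrin.h>

  /* Hypothetical helper, for illustration only: what a 64-bit broadcast
     amounts to when the scalar arrives as two 32-bit halves.              */
  static inline __m128i broadcast_u64_from_halves(unsigned int lo, unsigned int hi) {
    long long v = (long long)(((unsigned long long)hi << 32) | lo);  /* rebuild the 64-bit value */
    return _mm_set1_epi64x(v);                                       /* then broadcast it        */
  }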


https://reviews.llvm.org/D37668





[PATCH] D37668: [X86][intrinsics] lower _mm[256|512]_mask[z]_set1_epi[8|16|32|64] intrinsic to IR

2017-09-19 Thread jina via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rL313624: Lowering Mask Set1 intrinsics to LLVM IR (authored 
by jina.nahias).

Changed prior to commit:
  https://reviews.llvm.org/D37668?vs=115622&id=115823#toc

Repository:
  rL LLVM

https://reviews.llvm.org/D37668

Files:
  cfe/trunk/include/clang/Basic/BuiltinsX86.def
  cfe/trunk/include/clang/Basic/BuiltinsX86_64.def
  cfe/trunk/lib/Headers/avx512bwintrin.h
  cfe/trunk/lib/Headers/avx512fintrin.h
  cfe/trunk/lib/Headers/avx512vlbwintrin.h
  cfe/trunk/lib/Headers/avx512vlintrin.h
  cfe/trunk/test/CodeGen/avx512bw-builtins.c
  cfe/trunk/test/CodeGen/avx512f-builtins.c
  cfe/trunk/test/CodeGen/avx512vl-builtins.c
  cfe/trunk/test/CodeGen/avx512vlbw-builtins.c

Index: cfe/trunk/lib/Headers/avx512vlintrin.h
===
--- cfe/trunk/lib/Headers/avx512vlintrin.h
+++ cfe/trunk/lib/Headers/avx512vlintrin.h
@@ -5723,59 +5723,72 @@
   (__v4df)_mm256_setzero_pd());
 }
 
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A)
+{
+   return (__m128i)__builtin_ia32_selectd_128(__M,
+  (__v4si) _mm_set1_epi32(__A),
+  (__v4si)__O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_set1_epi32( __mmask8 __M, int __A)
+{
+   return (__m128i)__builtin_ia32_selectd_128(__M,
+  (__v4si) _mm_set1_epi32(__A),
+  (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A)
+{
+   return (__m256i)__builtin_ia32_selectd_256(__M,
+  (__v8si) _mm256_set1_epi32(__A),
+  (__v8si)__O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_set1_epi32( __mmask8 __M, int __A)
+{
+   return (__m256i)__builtin_ia32_selectd_256(__M,
+  (__v8si) _mm256_set1_epi32(__A),
+  (__v8si)_mm256_setzero_si256());
+}
 
-#define _mm_mask_set1_epi32(O, M, A) __extension__ ({ \
-  (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask((int)(A), \
-  (__v4si)(__m128i)(O), \
-  (__mmask8)(M)); })
-
-#define _mm_maskz_set1_epi32(M, A) __extension__ ({ \
-  (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask((int)(A), \
-  (__v4si)_mm_setzero_si128(), \
-  (__mmask8)(M)); })
-
-#define _mm256_mask_set1_epi32(O, M, A) __extension__ ({ \
-  (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask((int)(A), \
-  (__v8si)(__m256i)(O), \
-  (__mmask8)(M)); })
-
-#define _mm256_maskz_set1_epi32(M, A) __extension__ ({ \
-  (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask((int)(A), \
-  (__v8si)_mm256_setzero_si256(), \
-  (__mmask8)(M)); })
 
 #ifdef __x86_64__
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A)
 {
-  return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A, (__v2di) __O,
- __M);
+  return (__m128i) __builtin_ia32_selectq_128(__M,
+  (__v2di) _mm_set1_epi8(__A),
+  (__v2di) __O);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_set1_epi64 (__mmask8 __M, long long __A)
 {
-  return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A,
- (__v2di)
- _mm_setzero_si128 (),
- __M);
+  return (__m128i) __builtin_ia32_selectq_128(__M,
+  (__v2di) _mm_set1_epi8(__A),
+  (__v2di) _mm_setzero_si128());
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A)
 {
-  return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, (__v4di) __O,
- __M);
+  return (__m256i) __builtin_ia32_selectq_256(__M,
+  (__v4di) _mm256_set1_epi64x(__A),
+  (__v4di) __O) ;
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_set1_epi64 (__mmask8 __M, long long __A)
 {
-  return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A,
- (__v4di)
- _mm256_setzero_si256 (),
- __M);
+   return (__m256i) __builtin_i

[PATCH] D38231: fixing a bug in mask[z]_set1

2017-09-25 Thread jina via Phabricator via cfe-commits
jina.nahias created this revision.

https://reviews.llvm.org/D38231

Files:
  lib/Headers/avx512vlintrin.h


Index: lib/Headers/avx512vlintrin.h
===
--- lib/Headers/avx512vlintrin.h
+++ lib/Headers/avx512vlintrin.h
@@ -5761,15 +5761,15 @@
 _mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A)
 {
   return (__m128i) __builtin_ia32_selectq_128(__M,
-  (__v2di) _mm_set1_epi8(__A),
+  (__v2di) _mm_set1_epi64(__A),
   (__v2di) __O);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_set1_epi64 (__mmask8 __M, long long __A)
 {
   return (__m128i) __builtin_ia32_selectq_128(__M,
-  (__v2di) _mm_set1_epi8(__A),
+  (__v2di) _mm_set1_epi64(__A),
   (__v2di) _mm_setzero_si128());
 }
 


Index: lib/Headers/avx512vlintrin.h
===
--- lib/Headers/avx512vlintrin.h
+++ lib/Headers/avx512vlintrin.h
@@ -5761,15 +5761,15 @@
 _mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A)
 {
   return (__m128i) __builtin_ia32_selectq_128(__M,
-  (__v2di) _mm_set1_epi8(__A),
+  (__v2di) _mm_set1_epi64(__A),
   (__v2di) __O);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_set1_epi64 (__mmask8 __M, long long __A)
 {
   return (__m128i) __builtin_ia32_selectq_128(__M,
-  (__v2di) _mm_set1_epi8(__A),
+  (__v2di) _mm_set1_epi64(__A),
   (__v2di) _mm_setzero_si128());
 }
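To see why the element width matters here: `_mm_set1_epi8` splats only the low byte across sixteen 8-bit lanes, whereas the intrinsic is meant to broadcast the full 64-bit value into two qword lanes and then blend by mask. A minimal scalar sketch of the intended semantics, not part of the patch (the `ref_*` names are illustrative):

```
#include <stdint.h>
#include <stdio.h>

/* Reference model: broadcast the 64-bit value, then select per qword lane. */
static void ref_mask_set1_epi64(int64_t dst[2], const int64_t src[2],
                                uint8_t mask, int64_t a) {
  for (int i = 0; i < 2; ++i)
    dst[i] = ((mask >> i) & 1) ? a : src[i];
}

static void ref_maskz_set1_epi64(int64_t dst[2], uint8_t mask, int64_t a) {
  for (int i = 0; i < 2; ++i)
    dst[i] = ((mask >> i) & 1) ? a : 0;   /* zeroing form */
}

int main(void) {
  int64_t src[2] = {-1, -2}, out[2];
  ref_mask_set1_epi64(out, src, 0x2, 0x0123456789abcdefLL);
  printf("%llx %llx\n", (unsigned long long)out[0], (unsigned long long)out[1]);
  /* ffffffffffffffff 123456789abcdef : only lane 1 is overwritten */
  ref_maskz_set1_epi64(out, 0x1, 42);
  printf("%lld %lld\n", (long long)out[0], (long long)out[1]);   /* 42 0 */
  return 0;
}
```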
 
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38231: fixing a bug in mask[z]_set1

2017-09-25 Thread jina via Phabricator via cfe-commits
jina.nahias updated this revision to Diff 116513.

https://reviews.llvm.org/D38231

Files:
  lib/Headers/avx512vlintrin.h
  test/CodeGen/avx512vl-builtins.c


Index: test/CodeGen/avx512vl-builtins.c
===
--- test/CodeGen/avx512vl-builtins.c
+++ test/CodeGen/avx512vl-builtins.c
@@ -4563,45 +4563,17 @@
 #ifdef __x86_64__
 __m128i test_mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) {
   // CHECK-LABEL: @test_mm_mask_set1_epi64
-  // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
   // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> 
   // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_mask_set1_epi64(__O, __M, __A); 
 }
 
 __m128i test_mm_maskz_set1_epi64(__mmask8 __M, long long __A) {
   // CHECK-LABEL: @test_mm_maskz_set1_epi64
-  // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
   // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> 
   // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_maskz_set1_epi64(__M, __A); 
Index: lib/Headers/avx512vlintrin.h
===
--- lib/Headers/avx512vlintrin.h
+++ lib/Headers/avx512vlintrin.h
@@ -5761,15 +5761,15 @@
 _mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A)
 {
   return (__m128i) __builtin_ia32_selectq_128(__M,
-  (__v2di) _mm_set1_epi8(__A),
+  (__v2di) _mm_set1_epi64x(__A),
   (__v2di) __O);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maskz_set1_epi64 (__mmask8 __M, long long __A)
 {
   return (__m128i) __builtin_ia32_selectq_128(__M,
-  (__v2di) _mm_set1_epi8(__A),
+  (__v2di) _mm_set1_epi64x(__A),
   (__v2di) _mm_setzero_si128());
 }
 


Index: test/CodeGen/avx512vl-builtins.c
===
--- test/CodeGen/avx512vl-builtins.c
+++ test/CodeGen/avx512vl-builtins.c
@@ -4563,45 +4563,17 @@
 #ifdef __x86_64__
 __m128i test_mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) {
   // CHECK-LABEL: @test_mm_mask_set1_epi64
-  // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
-  // CHECK: insertelemen

[PATCH] D39719: [X86][AVX512] lowering kunpack intrinsic - clang part

2017-12-05 Thread jina via Phabricator via cfe-commits
jina.nahias added a comment.

ping


https://reviews.llvm.org/D39719





[PATCH] D39719: [X86][AVX512] lowering kunpack intrinsic - clang part

2017-12-05 Thread jina via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rL319777: [x86][AVX512] Lowering kunpack intrinsics to LLVM IR 
(authored by jina.nahias).

Changed prior to commit:
  https://reviews.llvm.org/D39719?vs=122617&id=125525#toc

Repository:
  rL LLVM

https://reviews.llvm.org/D39719

Files:
  cfe/trunk/lib/Headers/avx512bwintrin.h
  cfe/trunk/lib/Headers/avx512fintrin.h
  cfe/trunk/test/CodeGen/avx512bw-builtins.c
  cfe/trunk/test/CodeGen/avx512f-builtins.c


Index: cfe/trunk/lib/Headers/avx512fintrin.h
===
--- cfe/trunk/lib/Headers/avx512fintrin.h
+++ cfe/trunk/lib/Headers/avx512fintrin.h
@@ -8787,7 +8787,7 @@
 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_kunpackb (__mmask16 __A, __mmask16 __B)
 {
-  return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) 
__B);
+  return (__mmask16)  (( __A  & 0xFF) | ( __B << 8));
 }
 
 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
Index: cfe/trunk/lib/Headers/avx512bwintrin.h
===
--- cfe/trunk/lib/Headers/avx512bwintrin.h
+++ cfe/trunk/lib/Headers/avx512bwintrin.h
@@ -1854,15 +1854,13 @@
 static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_kunpackd (__mmask64 __A, __mmask64 __B)
 {
-  return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
-(__mmask64) __B);
+  return (__mmask64)  (( __A  & 0xFFFFFFFF) | ( __B << 32));
 }
 
 static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_kunpackw (__mmask32 __A, __mmask32 __B)
 {
-  return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
-(__mmask32) __B);
+return (__mmask32)  (( __A  & 0xFFFF) | ( __B << 16));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
Index: cfe/trunk/test/CodeGen/avx512bw-builtins.c
===
--- cfe/trunk/test/CodeGen/avx512bw-builtins.c
+++ cfe/trunk/test/CodeGen/avx512bw-builtins.c
@@ -1626,16 +1626,26 @@
   return _mm512_maskz_set1_epi8(__M, __A); 
 }
 
-__mmask64 test_mm512_kunpackd(__mmask64 __A, __mmask64 __B) {
+__mmask64 test_mm512_kunpackd(__m512i __A, __m512i __B, __m512i __C, __m512i 
__D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: @test_mm512_kunpackd
-  // CHECK: @llvm.x86.avx512.kunpck.dq
-  return _mm512_kunpackd(__A, __B); 
+  // CHECK: bitcast <64 x i1> %{{.*}} to i64
+  // CHECK: bitcast <64 x i1> %{{.*}} to i64
+  // CHECK: and i64 %{{.*}}, 4294967295
+  // CHECK: shl i64 %{{.*}}, 32
+  // CHECK: or i64 %{{.*}}, %{{.*}}
+  // CHECK: bitcast i64 %{{.*}} to <64 x i1>
+  return 
_mm512_mask_cmpneq_epu8_mask(_mm512_kunpackd(_mm512_cmpneq_epu8_mask(__B, 
__A),_mm512_cmpneq_epu8_mask(__C, __D)), __E, __F); 
 }
 
-__mmask32 test_mm512_kunpackw(__mmask32 __A, __mmask32 __B) {
+__mmask32 test_mm512_kunpackw(__m512i __A, __m512i __B, __m512i __C, __m512i 
__D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: @test_mm512_kunpackw
-  // CHECK: @llvm.x86.avx512.kunpck.wd
-  return _mm512_kunpackw(__A, __B); 
+  // CHECK: bitcast <32 x i1> %{{.*}} to i32
+  // CHECK: bitcast <32 x i1> %{{.*}} to i32
+  // CHECK: and i32 %{{.*}}, 65535
+  // CHECK: shl i32 %{{.*}}, 16
+  // CHECK: or i32 %{{.*}}, %{{.*}}
+  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+  return 
_mm512_mask_cmpneq_epu16_mask(_mm512_kunpackw(_mm512_cmpneq_epu16_mask(__B, 
__A),_mm512_cmpneq_epu16_mask(__C, __D)), __E, __F); 
 }
 
 __m512i test_mm512_mask_loadu_epi16(__m512i __W, __mmask32 __U, void const 
*__P) {
Index: cfe/trunk/test/CodeGen/avx512f-builtins.c
===
--- cfe/trunk/test/CodeGen/avx512f-builtins.c
+++ cfe/trunk/test/CodeGen/avx512f-builtins.c
@@ -6241,10 +6241,17 @@
   return _mm512_kortestz(__A, __B); 
 }
 
-__mmask16 test_mm512_kunpackb(__mmask16 __A, __mmask16 __B) {
+__mmask16 test_mm512_kunpackb(__m512i __A, __m512i __B, __m512i __C, __m512i 
__D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: @test_mm512_kunpackb
-  // CHECK: @llvm.x86.avx512.kunpck.bw
-  return _mm512_kunpackb(__A, __B); 
+  // CHECK: bitcast <16 x i1> %{{.*}} to i16
+  // CHECK: bitcast <16 x i1> %{{.*}} to i16
+  // CHECK: and i32 %{{.*}}, 255
+  // CHECK: shl i32 %{{.*}}, 8
+  // CHECK: or i32 %{{.*}}, %{{.*}}
+  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+  return 
_mm512_mask_cmpneq_epu32_mask(_mm512_kunpackb(_mm512_cmpneq_epu32_mask(__A, 
__B),
+   
_mm512_cmpneq_epu32_mask(__C, __D)),
+   __E, __F);
 }
 
 __mmask16 test_mm512_kxnor(__mmask16 __A, __mmask16 __B) {


Index: cfe/trunk/lib/Headers/avx512fintrin.h
===
--- cfe/trunk/lib/Headers/avx512fintrin.h
+++ cfe/trunk/lib/Headers/avx512fintrin.h
@@ -8787,7 +8787,7 @@
 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_kunpackb (__mmask16 __A, __mmask16 __B)
 {
-  

[PATCH] D39719: [X86][AVX512] lowering kunpack intrinsic - clang part

2017-12-05 Thread jina via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rC319777: [x86][AVX512] Lowering kunpack intrinsics to LLVM IR 
(authored by jina.nahias).

Repository:
  rC Clang

https://reviews.llvm.org/D39719

Files:
  lib/Headers/avx512bwintrin.h
  lib/Headers/avx512fintrin.h
  test/CodeGen/avx512bw-builtins.c
  test/CodeGen/avx512f-builtins.c


Index: lib/Headers/avx512bwintrin.h
===
--- lib/Headers/avx512bwintrin.h
+++ lib/Headers/avx512bwintrin.h
@@ -1854,15 +1854,13 @@
 static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_kunpackd (__mmask64 __A, __mmask64 __B)
 {
-  return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
-(__mmask64) __B);
+  return (__mmask64)  (( __A  & 0xFFFFFFFF) | ( __B << 32));
 }
 
 static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_kunpackw (__mmask32 __A, __mmask32 __B)
 {
-  return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
-(__mmask32) __B);
+return (__mmask32)  (( __A  & 0xFFFF) | ( __B << 16));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
Index: lib/Headers/avx512fintrin.h
===
--- lib/Headers/avx512fintrin.h
+++ lib/Headers/avx512fintrin.h
@@ -8787,7 +8787,7 @@
 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_kunpackb (__mmask16 __A, __mmask16 __B)
 {
-  return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) 
__B);
+  return (__mmask16)  (( __A  & 0xFF) | ( __B << 8));
 }
 
 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
Index: test/CodeGen/avx512f-builtins.c
===
--- test/CodeGen/avx512f-builtins.c
+++ test/CodeGen/avx512f-builtins.c
@@ -6241,10 +6241,17 @@
   return _mm512_kortestz(__A, __B); 
 }
 
-__mmask16 test_mm512_kunpackb(__mmask16 __A, __mmask16 __B) {
+__mmask16 test_mm512_kunpackb(__m512i __A, __m512i __B, __m512i __C, __m512i 
__D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: @test_mm512_kunpackb
-  // CHECK: @llvm.x86.avx512.kunpck.bw
-  return _mm512_kunpackb(__A, __B); 
+  // CHECK: bitcast <16 x i1> %{{.*}} to i16
+  // CHECK: bitcast <16 x i1> %{{.*}} to i16
+  // CHECK: and i32 %{{.*}}, 255
+  // CHECK: shl i32 %{{.*}}, 8
+  // CHECK: or i32 %{{.*}}, %{{.*}}
+  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+  return 
_mm512_mask_cmpneq_epu32_mask(_mm512_kunpackb(_mm512_cmpneq_epu32_mask(__A, 
__B),
+   
_mm512_cmpneq_epu32_mask(__C, __D)),
+   __E, __F);
 }
 
 __mmask16 test_mm512_kxnor(__mmask16 __A, __mmask16 __B) {
Index: test/CodeGen/avx512bw-builtins.c
===
--- test/CodeGen/avx512bw-builtins.c
+++ test/CodeGen/avx512bw-builtins.c
@@ -1626,16 +1626,26 @@
   return _mm512_maskz_set1_epi8(__M, __A); 
 }
 
-__mmask64 test_mm512_kunpackd(__mmask64 __A, __mmask64 __B) {
+__mmask64 test_mm512_kunpackd(__m512i __A, __m512i __B, __m512i __C, __m512i 
__D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: @test_mm512_kunpackd
-  // CHECK: @llvm.x86.avx512.kunpck.dq
-  return _mm512_kunpackd(__A, __B); 
+  // CHECK: bitcast <64 x i1> %{{.*}} to i64
+  // CHECK: bitcast <64 x i1> %{{.*}} to i64
+  // CHECK: and i64 %{{.*}}, 4294967295
+  // CHECK: shl i64 %{{.*}}, 32
+  // CHECK: or i64 %{{.*}}, %{{.*}}
+  // CHECK: bitcast i64 %{{.*}} to <64 x i1>
+  return 
_mm512_mask_cmpneq_epu8_mask(_mm512_kunpackd(_mm512_cmpneq_epu8_mask(__B, 
__A),_mm512_cmpneq_epu8_mask(__C, __D)), __E, __F); 
 }
 
-__mmask32 test_mm512_kunpackw(__mmask32 __A, __mmask32 __B) {
+__mmask32 test_mm512_kunpackw(__m512i __A, __m512i __B, __m512i __C, __m512i 
__D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: @test_mm512_kunpackw
-  // CHECK: @llvm.x86.avx512.kunpck.wd
-  return _mm512_kunpackw(__A, __B); 
+  // CHECK: bitcast <32 x i1> %{{.*}} to i32
+  // CHECK: bitcast <32 x i1> %{{.*}} to i32
+  // CHECK: and i32 %{{.*}}, 65535
+  // CHECK: shl i32 %{{.*}}, 16
+  // CHECK: or i32 %{{.*}}, %{{.*}}
+  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+  return 
_mm512_mask_cmpneq_epu16_mask(_mm512_kunpackw(_mm512_cmpneq_epu16_mask(__B, 
__A),_mm512_cmpneq_epu16_mask(__C, __D)), __E, __F); 
 }
 
 __m512i test_mm512_mask_loadu_epi16(__m512i __W, __mmask32 __U, void const 
*__P) {


Index: lib/Headers/avx512bwintrin.h
===
--- lib/Headers/avx512bwintrin.h
+++ lib/Headers/avx512bwintrin.h
@@ -1854,15 +1854,13 @@
 static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_kunpackd (__mmask64 __A, __mmask64 __B)
 {
-  return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
-(__mmask64) __B);
+  return (__mmask64)  (( __A  & 0xFFFFFFFF) | ( __B << 32));
 }
 
 static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_kunpackw (__mmask32 __A, __mmask32 __B)
 {
-  ret

[PATCH] D38683: [X86][AVX512] lowering broadcastm intrinsic - clang part

2017-11-05 Thread jina via Phabricator via cfe-commits
jina.nahias added a comment.

Committed in https://reviews.llvm.org/rL317456.


https://reviews.llvm.org/D38683





[PATCH] D39719: [X86][AVX512] lowering kunpack intrinsic - clang part

2017-11-07 Thread jina via Phabricator via cfe-commits
jina.nahias created this revision.

https://reviews.llvm.org/D39719

Files:
  lib/Headers/avx512bwintrin.h
  lib/Headers/avx512fintrin.h
  test/CodeGen/avx512bw-builtins.c
  test/CodeGen/avx512f-builtins.c


Index: test/CodeGen/avx512f-builtins.c
===
--- test/CodeGen/avx512f-builtins.c
+++ test/CodeGen/avx512f-builtins.c
@@ -6224,10 +6224,15 @@
   return _mm512_kortestz(__A, __B); 
 }
 
-__mmask16 test_mm512_kunpackb(__mmask16 __A, __mmask16 __B) {
+__mmask16 test_mm512_kunpackb(__m512i __A, __m512i __B, __m512i __C, __m512i 
__D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: @test_mm512_kunpackb
-  // CHECK: @llvm.x86.avx512.kunpck.bw
-  return _mm512_kunpackb(__A, __B); 
+  // CHECK: bitcast <16 x i1> %{{.*}} to i16
+  // CHECK: bitcast <16 x i1> %{{.*}} to i16
+  // CHECK: and i32 %{{.*}}, 255
+  // CHECK: shl i32 %{{.*}}, 8
+  // CHECK: or i32 %{{.*}}, %{{.*}}
+  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+  return 
_mm512_mask_cmpneq_epu32_mask(_mm512_kunpackb(_mm512_cmpneq_epu32_mask(__A,__B),_mm512_cmpneq_epu32_mask(__C,__D)),__E,
 __F);
 }
 
 __mmask16 test_mm512_kxnor(__mmask16 __A, __mmask16 __B) {
Index: test/CodeGen/avx512bw-builtins.c
===
--- test/CodeGen/avx512bw-builtins.c
+++ test/CodeGen/avx512bw-builtins.c
@@ -1626,16 +1626,26 @@
   return _mm512_maskz_set1_epi8(__M, __A); 
 }
 
-__mmask64 test_mm512_kunpackd(__mmask64 __A, __mmask64 __B) {
+__mmask64 test_mm512_kunpackd(__m512i __A, __m512i __B, __m512i __C, __m512i 
__D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: @test_mm512_kunpackd
-  // CHECK: @llvm.x86.avx512.kunpck.dq
-  return _mm512_kunpackd(__A, __B); 
+  // CHECK: bitcast <64 x i1> %{{.*}} to i64
+  // CHECK: bitcast <64 x i1> %{{.*}} to i64
+  // CHECK: and i64 %{{.*}}, 4294967295
+  // CHECK: shl i64 %{{.*}}, 32
+  // CHECK: or i64 %{{.*}}, %{{.*}}
+  // CHECK: bitcast i64 %{{.*}} to <64 x i1>
+  return 
_mm512_mask_cmpneq_epu8_mask(_mm512_kunpackd(_mm512_cmpneq_epu8_mask(__B, 
__A),_mm512_cmpneq_epu8_mask(__C, __D)), __E, __F); 
 }
 
-__mmask32 test_mm512_kunpackw(__mmask32 __A, __mmask32 __B) {
+__mmask32 test_mm512_kunpackw(__m512i __A, __m512i __B, __m512i __C, __m512i 
__D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: @test_mm512_kunpackw
-  // CHECK: @llvm.x86.avx512.kunpck.wd
-  return _mm512_kunpackw(__A, __B); 
+  // CHECK: bitcast <32 x i1> %{{.*}} to i32
+  // CHECK: bitcast <32 x i1> %{{.*}} to i32
+  // CHECK: and i32 %{{.*}}, 65535
+  // CHECK: shl i32 %{{.*}}, 16
+  // CHECK: or i32 %{{.*}}, %{{.*}}
+  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+  return 
_mm512_mask_cmpneq_epu16_mask(_mm512_kunpackw(_mm512_cmpneq_epu16_mask(__B, 
__A),_mm512_cmpneq_epu16_mask(__C, __D)), __E, __F); 
 }
 
 __m512i test_mm512_mask_loadu_epi16(__m512i __W, __mmask32 __U, void const 
*__P) {
Index: lib/Headers/avx512fintrin.h
===
--- lib/Headers/avx512fintrin.h
+++ lib/Headers/avx512fintrin.h
@@ -9011,7 +9011,7 @@
 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_kunpackb (__mmask16 __A, __mmask16 __B)
 {
-  return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) 
__B);
+  return (__mmask16)  (( __B  & 0xFF) | ((__mmask16) __A << 8));
 }
 
 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
Index: lib/Headers/avx512bwintrin.h
===
--- lib/Headers/avx512bwintrin.h
+++ lib/Headers/avx512bwintrin.h
@@ -2042,15 +2042,13 @@
 static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_kunpackd (__mmask64 __A, __mmask64 __B)
 {
-  return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
-(__mmask64) __B);
+  return (__mmask64)  (( __B  & 0xFFFFFFFF) | ((__mmask64) __A << 32));
 }
 
 static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_kunpackw (__mmask32 __A, __mmask32 __B)
 {
-  return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
-(__mmask32) __B);
+return (__mmask32)  (( __B  & 0xFFFF) | ((__mmask32) __A << 16));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS


Index: test/CodeGen/avx512f-builtins.c
===
--- test/CodeGen/avx512f-builtins.c
+++ test/CodeGen/avx512f-builtins.c
@@ -6224,10 +6224,15 @@
   return _mm512_kortestz(__A, __B); 
 }
 
-__mmask16 test_mm512_kunpackb(__mmask16 __A, __mmask16 __B) {
+__mmask16 test_mm512_kunpackb(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: @test_mm512_kunpackb
-  // CHECK: @llvm.x86.avx512.kunpck.bw
-  return _mm512_kunpackb(__A, __B); 
+  // CHECK: bitcast <16 x i1> %{{.*}} to i16
+  // CHECK: bitcast <16 x i1> %{{.*}} to i16
+  // CHECK: and i32 %{{.*}}, 255
+  // CHECK: shl i32 %{{.*}}, 8
+  // CHECK: or i32 %{{.*}}, %{{.*}}
+  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+  return _mm512_mask_cmpneq_epu32_mask(

[PATCH] D39719: [X86][AVX512] lowering kunpack intrinsic - clang part

2017-11-07 Thread jina via Phabricator via cfe-commits
jina.nahias updated this revision to Diff 121899.

https://reviews.llvm.org/D39719

Files:
  lib/Headers/avx512bwintrin.h
  lib/Headers/avx512fintrin.h
  test/CodeGen/avx512bw-builtins.c
  test/CodeGen/avx512f-builtins.c


Index: test/CodeGen/avx512f-builtins.c
===
--- test/CodeGen/avx512f-builtins.c
+++ test/CodeGen/avx512f-builtins.c
@@ -6224,10 +6224,15 @@
   return _mm512_kortestz(__A, __B); 
 }
 
-__mmask16 test_mm512_kunpackb(__mmask16 __A, __mmask16 __B) {
+__mmask16 test_mm512_kunpackb(__m512i __A, __m512i __B, __m512i __C, __m512i 
__D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: @test_mm512_kunpackb
-  // CHECK: @llvm.x86.avx512.kunpck.bw
-  return _mm512_kunpackb(__A, __B); 
+  // CHECK: bitcast <16 x i1> %{{.*}} to i16
+  // CHECK: bitcast <16 x i1> %{{.*}} to i16
+  // CHECK: and i32 %{{.*}}, 255
+  // CHECK: shl i32 %{{.*}}, 8
+  // CHECK: or i32 %{{.*}}, %{{.*}}
+  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+  return 
_mm512_mask_cmpneq_epu32_mask(_mm512_kunpackb(_mm512_cmpneq_epu32_mask(__A,__B),_mm512_cmpneq_epu32_mask(__C,__D)),__E,
 __F);
 }
 
 __mmask16 test_mm512_kxnor(__mmask16 __A, __mmask16 __B) {
Index: test/CodeGen/avx512bw-builtins.c
===
--- test/CodeGen/avx512bw-builtins.c
+++ test/CodeGen/avx512bw-builtins.c
@@ -1626,16 +1626,26 @@
   return _mm512_maskz_set1_epi8(__M, __A); 
 }
 
-__mmask64 test_mm512_kunpackd(__mmask64 __A, __mmask64 __B) {
+__mmask64 test_mm512_kunpackd(__m512i __A, __m512i __B, __m512i __C, __m512i 
__D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: @test_mm512_kunpackd
-  // CHECK: @llvm.x86.avx512.kunpck.dq
-  return _mm512_kunpackd(__A, __B); 
+  // CHECK: bitcast <64 x i1> %{{.*}} to i64
+  // CHECK: bitcast <64 x i1> %{{.*}} to i64
+  // CHECK: and i64 %{{.*}}, 4294967295
+  // CHECK: shl i64 %{{.*}}, 32
+  // CHECK: or i64 %{{.*}}, %{{.*}}
+  // CHECK: bitcast i64 %{{.*}} to <64 x i1>
+  return 
_mm512_mask_cmpneq_epu8_mask(_mm512_kunpackd(_mm512_cmpneq_epu8_mask(__B, 
__A),_mm512_cmpneq_epu8_mask(__C, __D)), __E, __F); 
 }
 
-__mmask32 test_mm512_kunpackw(__mmask32 __A, __mmask32 __B) {
+__mmask32 test_mm512_kunpackw(__m512i __A, __m512i __B, __m512i __C, __m512i 
__D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: @test_mm512_kunpackw
-  // CHECK: @llvm.x86.avx512.kunpck.wd
-  return _mm512_kunpackw(__A, __B); 
+  // CHECK: bitcast <32 x i1> %{{.*}} to i32
+  // CHECK: bitcast <32 x i1> %{{.*}} to i32
+  // CHECK: and i32 %{{.*}}, 65535
+  // CHECK: shl i32 %{{.*}}, 16
+  // CHECK: or i32 %{{.*}}, %{{.*}}
+  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+  return 
_mm512_mask_cmpneq_epu16_mask(_mm512_kunpackw(_mm512_cmpneq_epu16_mask(__B, 
__A),_mm512_cmpneq_epu16_mask(__C, __D)), __E, __F); 
 }
 
 __m512i test_mm512_mask_loadu_epi16(__m512i __W, __mmask32 __U, void const 
*__P) {
Index: lib/Headers/avx512fintrin.h
===
--- lib/Headers/avx512fintrin.h
+++ lib/Headers/avx512fintrin.h
@@ -9011,7 +9011,7 @@
 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_kunpackb (__mmask16 __A, __mmask16 __B)
 {
-  return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) 
__B);
+  return (__mmask16)  (( __A  & 0xFF) | ( __B << 8));
 }
 
 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
Index: lib/Headers/avx512bwintrin.h
===
--- lib/Headers/avx512bwintrin.h
+++ lib/Headers/avx512bwintrin.h
@@ -2042,15 +2042,13 @@
 static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_kunpackd (__mmask64 __A, __mmask64 __B)
 {
-  return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
-(__mmask64) __B);
+  return (__mmask64)  (( __A  & 0xFFFFFFFF) | ( __B << 32));
 }
 
 static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_kunpackw (__mmask32 __A, __mmask32 __B)
 {
-  return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
-(__mmask32) __B);
+return (__mmask32)  (( __A  & 0xFFFF) | ( __B << 16));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS


Index: test/CodeGen/avx512f-builtins.c
===
--- test/CodeGen/avx512f-builtins.c
+++ test/CodeGen/avx512f-builtins.c
@@ -6224,10 +6224,15 @@
   return _mm512_kortestz(__A, __B); 
 }
 
-__mmask16 test_mm512_kunpackb(__mmask16 __A, __mmask16 __B) {
+__mmask16 test_mm512_kunpackb(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: @test_mm512_kunpackb
-  // CHECK: @llvm.x86.avx512.kunpck.bw
-  return _mm512_kunpackb(__A, __B); 
+  // CHECK: bitcast <16 x i1> %{{.*}} to i16
+  // CHECK: bitcast <16 x i1> %{{.*}} to i16
+  // CHECK: and i32 %{{.*}}, 255
+  // CHECK: shl i32 %{{.*}}, 8
+  // CHECK: or i32 %{{.*}}, %{{.*}}
+  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+  return _mm512_mask_cmpneq_epu32_mask(_mm512_kunpackb(_m

[PATCH] D39719: [X86][AVX512] lowering kunpack intrinsic - clang part

2017-11-07 Thread jina via Phabricator via cfe-commits
jina.nahias added inline comments.



Comment at: lib/Headers/avx512bwintrin.h:2045
 {
-  return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
-(__mmask64) __B);
+  return (__mmask64)  (( __B  & 0xFFFFFFFF) | ((__mmask64) __A << 32));
 }

RKSimon wrote:
> Is this right? The Intel docs says it should be:
> ```
> k[31:0] := a[31:0]
> k[63:32] := b[31:0]
> k[MAX:64] := 0
> ```
> 
> Also, is the cast on __A necessary?
> 
> Same for the others.
You are right, I fixed it.
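To make the agreed bit layout concrete, a small self-contained check of the expression the header ends up with (plain C, no intrinsics; `kunpackd_ref` is an illustrative name, not part of the header):

```
#include <assert.h>
#include <stdint.h>

/* k[31:0] := a[31:0], k[63:32] := b[31:0], per the pseudocode quoted above */
static uint64_t kunpackd_ref(uint64_t a, uint64_t b) {
  return (a & 0xFFFFFFFFu) | (b << 32);
}

int main(void) {
  assert(kunpackd_ref(0x1111111122222222ULL, 0x3333333344444444ULL)
         == 0x4444444422222222ULL);
  return 0;
}
```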


https://reviews.llvm.org/D39719





[PATCH] D38672: [X86][AVX512] lowering shuffle f/i intrinsic - clang part

2017-11-07 Thread jina via Phabricator via cfe-commits
jina.nahias updated this revision to Diff 121908.

https://reviews.llvm.org/D38672

Files:
  lib/Headers/avx512fintrin.h
  lib/Headers/avx512vlintrin.h
  test/CodeGen/avx512f-builtins.c
  test/CodeGen/avx512vl-builtins.c

Index: test/CodeGen/avx512vl-builtins.c
===
--- test/CodeGen/avx512vl-builtins.c
+++ test/CodeGen/avx512vl-builtins.c
@@ -5602,73 +5602,85 @@
 }
 __m256 test_mm256_shuffle_f32x4(__m256 __A, __m256 __B) {
   // CHECK-LABEL: @test_mm256_shuffle_f32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> 
   return _mm256_shuffle_f32x4(__A, __B, 3); 
 }
 
 __m256 test_mm256_mask_shuffle_f32x4(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
   // CHECK-LABEL: @test_mm256_mask_shuffle_f32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> 
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask_shuffle_f32x4(__W, __U, __A, __B, 3); 
 }
 
 __m256 test_mm256_maskz_shuffle_f32x4(__mmask8 __U, __m256 __A, __m256 __B) {
   // CHECK-LABEL: @test_mm256_maskz_shuffle_f32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> 
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_maskz_shuffle_f32x4(__U, __A, __B, 3); 
 }
 
 __m256d test_mm256_shuffle_f64x2(__m256d __A, __m256d __B) {
   // CHECK-LABEL: @test_mm256_shuffle_f64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
   return _mm256_shuffle_f64x2(__A, __B, 3); 
 }
 
 __m256d test_mm256_mask_shuffle_f64x2(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
   // CHECK-LABEL: @test_mm256_mask_shuffle_f64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> 
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_shuffle_f64x2(__W, __U, __A, __B, 3); 
 }
 
 __m256d test_mm256_maskz_shuffle_f64x2(__mmask8 __U, __m256d __A, __m256d __B) {
   // CHECK-LABEL: @test_mm256_maskz_shuffle_f64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> 
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_shuffle_f64x2(__U, __A, __B, 3); 
 }
 
 __m256i test_mm256_shuffle_i32x4(__m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_shuffle_i32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> 
   return _mm256_shuffle_i32x4(__A, __B, 3); 
 }
 
 __m256i test_mm256_mask_shuffle_i32x4(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_shuffle_i32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> 
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_shuffle_i32x4(__W, __U, __A, __B, 3); 
 }
 
 __m256i test_mm256_maskz_shuffle_i32x4(__mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_shuffle_i32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> 
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_shuffle_i32x4(__U, __A, __B, 3); 
 }
 
 __m256i test_mm256_shuffle_i64x2(__m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_shuffle_i64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> 
   return _mm256_shuffle_i64x2(__A, __B, 3); 
 }
 
 __m256i test_mm256_mask_shuffle_i64x2(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_shuffle_i64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> 
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> 
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_mask_shuffle_i64x2(__W, __U, __A, __B, 3); 
 }
 
 __m256i test_mm256_maskz_shuffle_i64x2(__mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_shuffle_i64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> 
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> 
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*

[PATCH] D39719: [X86][AVX512] lowering kunpack intrinsic - clang part

2017-11-12 Thread jina via Phabricator via cfe-commits
jina.nahias added inline comments.



Comment at: test/CodeGen/avx512f-builtins.c:6231
+  // CHECK: bitcast <16 x i1> %{{.*}} to i16
+  // CHECK: and i32 %{{.*}}, 255
+  // CHECK: shl i32 %{{.*}}, 8

craig.topper wrote:
> Does this really produce kunpackb in the backend? The type promotion here 
> makes me skeptic
Yes, the code we get is:
vpcmpneqd   %zmm1, %zmm0, %k0
vpcmpneqd   %zmm3, %zmm2, %k1
kunpckbw%k1, %k0, %k1
vpcmpneqd   %zmm5, %zmm4, %k0 {%k1}
kmovd   %k0, %eax
# kill: %AX %AX %EAX
vzeroupper
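
For reference, a standalone translation unit that yields essentially this sequence is the updated test function from the diff (assumes `-mavx512f -O2`; the wrapper name is illustrative):

```
#include <immintrin.h>

/* Two dword compares feed kunpackb; the combined mask then predicates a third compare. */
__mmask16 cmp_unpack_cmp(__m512i a, __m512i b, __m512i c, __m512i d,
                         __m512i e, __m512i f) {
  return _mm512_mask_cmpneq_epu32_mask(
      _mm512_kunpackb(_mm512_cmpneq_epu32_mask(a, b),
                      _mm512_cmpneq_epu32_mask(c, d)),
      e, f);
}
```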



https://reviews.llvm.org/D39719





[PATCH] D39719: [X86][AVX512] lowering kunpack intrinsic - clang part

2017-11-13 Thread jina via Phabricator via cfe-commits
jina.nahias updated this revision to Diff 122617.

https://reviews.llvm.org/D39719

Files:
  lib/Headers/avx512bwintrin.h
  lib/Headers/avx512fintrin.h
  test/CodeGen/avx512bw-builtins.c
  test/CodeGen/avx512f-builtins.c


Index: test/CodeGen/avx512f-builtins.c
===
--- test/CodeGen/avx512f-builtins.c
+++ test/CodeGen/avx512f-builtins.c
@@ -6224,10 +6224,17 @@
   return _mm512_kortestz(__A, __B); 
 }
 
-__mmask16 test_mm512_kunpackb(__mmask16 __A, __mmask16 __B) {
+__mmask16 test_mm512_kunpackb(__m512i __A, __m512i __B, __m512i __C, __m512i 
__D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: @test_mm512_kunpackb
-  // CHECK: @llvm.x86.avx512.kunpck.bw
-  return _mm512_kunpackb(__A, __B); 
+  // CHECK: bitcast <16 x i1> %{{.*}} to i16
+  // CHECK: bitcast <16 x i1> %{{.*}} to i16
+  // CHECK: and i32 %{{.*}}, 255
+  // CHECK: shl i32 %{{.*}}, 8
+  // CHECK: or i32 %{{.*}}, %{{.*}}
+  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+  return 
_mm512_mask_cmpneq_epu32_mask(_mm512_kunpackb(_mm512_cmpneq_epu32_mask(__A, 
__B),
+   
_mm512_cmpneq_epu32_mask(__C, __D)),
+   __E, __F);
 }
 
 __mmask16 test_mm512_kxnor(__mmask16 __A, __mmask16 __B) {
Index: test/CodeGen/avx512bw-builtins.c
===
--- test/CodeGen/avx512bw-builtins.c
+++ test/CodeGen/avx512bw-builtins.c
@@ -1626,16 +1626,26 @@
   return _mm512_maskz_set1_epi8(__M, __A); 
 }
 
-__mmask64 test_mm512_kunpackd(__mmask64 __A, __mmask64 __B) {
+__mmask64 test_mm512_kunpackd(__m512i __A, __m512i __B, __m512i __C, __m512i 
__D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: @test_mm512_kunpackd
-  // CHECK: @llvm.x86.avx512.kunpck.dq
-  return _mm512_kunpackd(__A, __B); 
+  // CHECK: bitcast <64 x i1> %{{.*}} to i64
+  // CHECK: bitcast <64 x i1> %{{.*}} to i64
+  // CHECK: and i64 %{{.*}}, 4294967295
+  // CHECK: shl i64 %{{.*}}, 32
+  // CHECK: or i64 %{{.*}}, %{{.*}}
+  // CHECK: bitcast i64 %{{.*}} to <64 x i1>
+  return 
_mm512_mask_cmpneq_epu8_mask(_mm512_kunpackd(_mm512_cmpneq_epu8_mask(__B, 
__A),_mm512_cmpneq_epu8_mask(__C, __D)), __E, __F); 
 }
 
-__mmask32 test_mm512_kunpackw(__mmask32 __A, __mmask32 __B) {
+__mmask32 test_mm512_kunpackw(__m512i __A, __m512i __B, __m512i __C, __m512i 
__D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: @test_mm512_kunpackw
-  // CHECK: @llvm.x86.avx512.kunpck.wd
-  return _mm512_kunpackw(__A, __B); 
+  // CHECK: bitcast <32 x i1> %{{.*}} to i32
+  // CHECK: bitcast <32 x i1> %{{.*}} to i32
+  // CHECK: and i32 %{{.*}}, 65535
+  // CHECK: shl i32 %{{.*}}, 16
+  // CHECK: or i32 %{{.*}}, %{{.*}}
+  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+  return 
_mm512_mask_cmpneq_epu16_mask(_mm512_kunpackw(_mm512_cmpneq_epu16_mask(__B, 
__A),_mm512_cmpneq_epu16_mask(__C, __D)), __E, __F); 
 }
 
 __m512i test_mm512_mask_loadu_epi16(__m512i __W, __mmask32 __U, void const 
*__P) {
Index: lib/Headers/avx512fintrin.h
===
--- lib/Headers/avx512fintrin.h
+++ lib/Headers/avx512fintrin.h
@@ -8823,7 +8823,7 @@
 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_kunpackb (__mmask16 __A, __mmask16 __B)
 {
-  return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) 
__B);
+  return (__mmask16)  (( __A  & 0xFF) | ( __B << 8));
 }
 
 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
Index: lib/Headers/avx512bwintrin.h
===
--- lib/Headers/avx512bwintrin.h
+++ lib/Headers/avx512bwintrin.h
@@ -1854,15 +1854,13 @@
 static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_kunpackd (__mmask64 __A, __mmask64 __B)
 {
-  return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
-(__mmask64) __B);
+  return (__mmask64)  (( __A  & 0xFFFFFFFF) | ( __B << 32));
 }
 
 static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_kunpackw (__mmask32 __A, __mmask32 __B)
 {
-  return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
-(__mmask32) __B);
+return (__mmask32)  (( __A  & 0xFFFF) | ( __B << 16));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS


Index: test/CodeGen/avx512f-builtins.c
===
--- test/CodeGen/avx512f-builtins.c
+++ test/CodeGen/avx512f-builtins.c
@@ -6224,10 +6224,17 @@
   return _mm512_kortestz(__A, __B); 
 }
 
-__mmask16 test_mm512_kunpackb(__mmask16 __A, __mmask16 __B) {
+__mmask16 test_mm512_kunpackb(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: @test_mm512_kunpackb
-  // CHECK: @llvm.x86.avx512.kunpck.bw
-  return _mm512_kunpackb(__A, __B); 
+  // CHECK: bitcast <16 x i1> %{{.*}} to i16
+  // CHECK: bitcast <16 x i1> %{{.*}} to i16
+  // CHECK: and i32 %{{.*}}, 255
+  // CHECK: shl i32 %{{.*}}, 8
+  // CHECK: or i32 %{{.

[PATCH] D38672: [X86][AVX512] lowering shuffle f/i intrinsic - clang part

2017-11-13 Thread jina via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rL318025: [x86][AVX512] Lowering shuffle i/f intrinsics to 
LLVM IR (authored by jina.nahias).

Changed prior to commit:
  https://reviews.llvm.org/D38672?vs=121908&id=122619#toc

Repository:
  rL LLVM

https://reviews.llvm.org/D38672

Files:
  cfe/trunk/lib/Headers/avx512fintrin.h
  cfe/trunk/lib/Headers/avx512vlintrin.h
  cfe/trunk/test/CodeGen/avx512f-builtins.c
  cfe/trunk/test/CodeGen/avx512vl-builtins.c

Index: cfe/trunk/test/CodeGen/avx512f-builtins.c
===
--- cfe/trunk/test/CodeGen/avx512f-builtins.c
+++ cfe/trunk/test/CodeGen/avx512f-builtins.c
@@ -4484,73 +4484,81 @@
 
 __m512 test_mm512_shuffle_f32x4(__m512 __A, __m512 __B) {
   // CHECK-LABEL: @test_mm512_shuffle_f32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> 
   return _mm512_shuffle_f32x4(__A, __B, 4); 
 }
 
 __m512 test_mm512_mask_shuffle_f32x4(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
   // CHECK-LABEL: @test_mm512_mask_shuffle_f32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> 
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_shuffle_f32x4(__W, __U, __A, __B, 4); 
 }
 
 __m512 test_mm512_maskz_shuffle_f32x4(__mmask16 __U, __m512 __A, __m512 __B) {
   // CHECK-LABEL: @test_mm512_maskz_shuffle_f32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> 
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_shuffle_f32x4(__U, __A, __B, 4); 
 }
 
 __m512d test_mm512_shuffle_f64x2(__m512d __A, __m512d __B) {
   // CHECK-LABEL: @test_mm512_shuffle_f64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  // CHECK: shufflevector <8 x double> %0, <8 x double> %{{.*}}, <8 x i32> 
   return _mm512_shuffle_f64x2(__A, __B, 4); 
 }
 
 __m512d test_mm512_mask_shuffle_f64x2(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK-LABEL: @test_mm512_mask_shuffle_f64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> 
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_shuffle_f64x2(__W, __U, __A, __B, 4); 
 }
 
 __m512d test_mm512_maskz_shuffle_f64x2(__mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK-LABEL: @test_mm512_maskz_shuffle_f64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> 
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_maskz_shuffle_f64x2(__U, __A, __B, 4); 
 }
 
 __m512i test_mm512_shuffle_i32x4(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_shuffle_i32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  // CHECK: shufflevector <8 x i64> %0, <8 x i64> %{{.*}}, <8 x i32> 
   return _mm512_shuffle_i32x4(__A, __B, 4); 
 }
 
 __m512i test_mm512_mask_shuffle_i32x4(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_shuffle_i32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_shuffle_i32x4(__W, __U, __A, __B, 4); 
 }
 
 __m512i test_mm512_maskz_shuffle_i32x4(__mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_shuffle_i32x4
-  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_shuffle_i32x4(__U, __A, __B, 4); 
 }
 
 __m512i test_mm512_shuffle_i64x2(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_shuffle_i64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  // CHECK: shufflevector <8 x i64> %0, <8 x i64> %{{.*}}, <8 x i32> 
   return _mm512_shuffle_i64x2(__A, __B, 4); 
 }
 
 __m512i test_mm512_mask_shuffle_i64x2(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_shuffle_i64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_mask_shuffle_i64x2(__W, __U, __A, __B, 4); 
 }
 
 __m512i test_mm512_maskz_shuffle_i64x2(__mmask8 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_shuffle_i64x2
-  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> 
+  // CHECK: select <8 x i1> %