craig.topper created this revision.
craig.topper added reviewers: RKSimon, spatel.
Herald added a subscriber: cfe-commits.

I believe all the pieces are now in place in the backend to do this 
correctly. We can truncate the vXi64 type to vXi32, extend it back up to the 
original width and multiply.

In the backend the truncate+extend will become 
sign_extend_inreg/zero_extend_inreg (really an and). Then those will be combined 
with the mul to PMULDQ/PMULUDQ. Then SimplifyDemandedBits will strip the 
sign_extend_inreg/zero_extend_inreg out.

The only question I have is whether it's ok to emit the v2i32 intermediate type 
for the 128-bit version. I wasn't sure of any examples where we use an illegal 
type in our intrinsic/builtin handling. At least not a narrower type. I know 
pavg uses a wider type.

I think I could probably do this all in the header file using 
__builtin_convertvector if that's desired.


Repository:
  rC Clang

https://reviews.llvm.org/D45421

Files:
  lib/CodeGen/CGBuiltin.cpp
  test/CodeGen/avx2-builtins.c
  test/CodeGen/avx512f-builtins.c
  test/CodeGen/avx512vl-builtins.c
  test/CodeGen/sse2-builtins.c
  test/CodeGen/sse41-builtins.c

Index: test/CodeGen/sse41-builtins.c
===================================================================
--- test/CodeGen/sse41-builtins.c
+++ test/CodeGen/sse41-builtins.c
@@ -312,7 +312,11 @@
 
 __m128i test_mm_mul_epi32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_mul_epi32
-  // CHECK: call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  // CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  // CHECK: sext <2 x i32> %{{.*}} to <2 x i64>
+  // CHECK: sext <2 x i32> %{{.*}} to <2 x i64>
+  // CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mul_epi32(x, y);
 }
 
Index: test/CodeGen/sse2-builtins.c
===================================================================
--- test/CodeGen/sse2-builtins.c
+++ test/CodeGen/sse2-builtins.c
@@ -816,7 +816,11 @@
 
 __m128i test_mm_mul_epu32(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_mul_epu32
-  // CHECK: call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  // CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  // CHECK: zext <2 x i32> %{{.*}} to <2 x i64>
+  // CHECK: zext <2 x i32> %{{.*}} to <2 x i64>
+  // CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mul_epu32(A, B);
 }
 
Index: test/CodeGen/avx512vl-builtins.c
===================================================================
--- test/CodeGen/avx512vl-builtins.c
+++ test/CodeGen/avx512vl-builtins.c
@@ -727,60 +727,92 @@
 __m256i test_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
            __m256i __Y) {
   //CHECK-LABEL: @test_mm256_mask_mul_epi32
-  //CHECK: @llvm.x86.avx2.pmul.dq
+  //CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  //CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  //CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
+  //CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
+  //CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_mask_mul_epi32(__W, __M, __X, __Y);
 }
 
 __m256i test_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) {
   //CHECK-LABEL: @test_mm256_maskz_mul_epi32
-  //CHECK: @llvm.x86.avx2.pmul.dq
+  //CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  //CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  //CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
+  //CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
+  //CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_maskz_mul_epi32(__M, __X, __Y);
 }
 
 
 __m128i test_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X,
         __m128i __Y) {
   //CHECK-LABEL: @test_mm_mask_mul_epi32
-  //CHECK: @llvm.x86.sse41.pmuldq
+  //CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  //CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  //CHECK: sext <2 x i32> %{{.*}} to <2 x i64>
+  //CHECK: sext <2 x i32> %{{.*}} to <2 x i64>
+  //CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_mask_mul_epi32(__W, __M, __X, __Y);
 }
 
 __m128i test_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y) {
   //CHECK-LABEL: @test_mm_maskz_mul_epi32
-  //CHECK: @llvm.x86.sse41.pmuldq
+  //CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  //CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  //CHECK: sext <2 x i32> %{{.*}} to <2 x i64>
+  //CHECK: sext <2 x i32> %{{.*}} to <2 x i64>
+  //CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_maskz_mul_epi32(__M, __X, __Y);
 }
 
 __m256i test_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X,
            __m256i __Y) {
   //CHECK-LABEL: @test_mm256_mask_mul_epu32
-  //CHECK: @llvm.x86.avx2.pmulu.dq
+  //CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  //CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  //CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
+  //CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
+  //CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_mask_mul_epu32(__W, __M, __X, __Y);
 }
 
 __m256i test_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y) {
   //CHECK-LABEL: @test_mm256_maskz_mul_epu32
-  //CHECK: @llvm.x86.avx2.pmulu.dq
+  //CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  //CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  //CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
+  //CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
+  //CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_maskz_mul_epu32(__M, __X, __Y);
 }
 
 __m128i test_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X,
         __m128i __Y) {
   //CHECK-LABEL: @test_mm_mask_mul_epu32
-  //CHECK: @llvm.x86.sse2.pmulu.dq
+  //CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  //CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  //CHECK: zext <2 x i32> %{{.*}} to <2 x i64>
+  //CHECK: zext <2 x i32> %{{.*}} to <2 x i64>
+  //CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_mask_mul_epu32(__W, __M, __X, __Y);
 }
 
 __m128i test_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y) {
   //CHECK-LABEL: @test_mm_maskz_mul_epu32
-  //CHECK: @llvm.x86.sse2.pmulu.dq
+  //CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  //CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  //CHECK: zext <2 x i32> %{{.*}} to <2 x i64>
+  //CHECK: zext <2 x i32> %{{.*}} to <2 x i64>
+  //CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_maskz_mul_epu32(__M, __X, __Y);
 }
Index: test/CodeGen/avx512f-builtins.c
===================================================================
--- test/CodeGen/avx512f-builtins.c
+++ test/CodeGen/avx512f-builtins.c
@@ -1874,42 +1874,66 @@
 
 __m512i test_mm512_mul_epi32(__m512i __A, __m512i __B) {
   //CHECK-LABEL: @test_mm512_mul_epi32
-  //CHECK: @llvm.x86.avx512.pmul.dq.512
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: sext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: sext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
   return _mm512_mul_epi32(__A,__B);
 }
 
 __m512i test_mm512_maskz_mul_epi32 (__mmask16 __k,__m512i __A, __m512i __B) {
   //CHECK-LABEL: @test_mm512_maskz_mul_epi32
-  //CHECK: @llvm.x86.avx512.pmul.dq.512
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: sext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: sext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_maskz_mul_epi32(__k,__A,__B);
 }
 
 __m512i test_mm512_mask_mul_epi32 (__mmask16 __k,__m512i __A, __m512i __B,
                                    __m512i __src) {
   //CHECK-LABEL: @test_mm512_mask_mul_epi32
-  //CHECK: @llvm.x86.avx512.pmul.dq.512
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: sext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: sext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_mask_mul_epi32(__src,__k,__A,__B);
 }
 
 __m512i test_mm512_mul_epu32 (__m512i __A, __m512i __B) {
   //CHECK-LABEL: @test_mm512_mul_epu32
-  //CHECK: @llvm.x86.avx512.pmulu.dq.512
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: zext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: zext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
   return _mm512_mul_epu32(__A,__B);
 }
 
 __m512i test_mm512_maskz_mul_epu32 (__mmask16 __k,__m512i __A, __m512i __B) {
   //CHECK-LABEL: @test_mm512_maskz_mul_epu32
-  //CHECK: @llvm.x86.avx512.pmulu.dq.512
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: zext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: zext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_maskz_mul_epu32(__k,__A,__B);
 }
 
 __m512i test_mm512_mask_mul_epu32 (__mmask16 __k,__m512i __A, __m512i __B, 
                                    __m512i __src) {
   //CHECK-LABEL: @test_mm512_mask_mul_epu32
-  //CHECK: @llvm.x86.avx512.pmulu.dq.512
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+  //CHECK: zext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: zext <8 x i32> %{{.*}} to <8 x i64>
+  //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
   //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_mask_mul_epu32(__src,__k,__A,__B);
 }
Index: test/CodeGen/avx2-builtins.c
===================================================================
--- test/CodeGen/avx2-builtins.c
+++ test/CodeGen/avx2-builtins.c
@@ -835,13 +835,21 @@
 
 __m256i test_mm256_mul_epi32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_mul_epi32
-  // CHECK: call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  // CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  // CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
+  // CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
+  // CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
   return _mm256_mul_epi32(a, b);
 }
 
 __m256i test_mm256_mul_epu32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_mul_epu32
-  // CHECK: call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  // CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+  // CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
+  // CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
+  // CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
   return _mm256_mul_epu32(a, b);
 }
 
Index: lib/CodeGen/CGBuiltin.cpp
===================================================================
--- lib/CodeGen/CGBuiltin.cpp
+++ lib/CodeGen/CGBuiltin.cpp
@@ -8264,6 +8264,26 @@
   return EmitX86Select(CGF, Ops[3], Res, Ops[2]);
 }
 
+static Value *EmitX86Muldq(CodeGenFunction &CGF, bool IsSigned,
+                           ArrayRef<Value *> Ops) {
+  llvm::Type *Ty = Ops[0]->getType();
+  // Arguments have a vXi32 type so cast to vXi64.
+  llvm::Type *CastTy = llvm::VectorType::get(CGF.Int64Ty,
+                                             Ty->getVectorNumElements() / 2);
+  Value *LHS = CGF.Builder.CreateBitCast(Ops[0], CastTy);
+  Value *RHS = CGF.Builder.CreateBitCast(Ops[1], CastTy);
+
+  // Truncate and then extend.
+  llvm::Type *TruncTy = llvm::VectorType::get(CGF.Int32Ty,
+                                              Ty->getVectorNumElements() / 2);
+  LHS = CGF.Builder.CreateTrunc(LHS, TruncTy);
+  RHS = CGF.Builder.CreateTrunc(RHS, TruncTy);
+  LHS = CGF.Builder.CreateIntCast(LHS, CastTy, IsSigned);
+  RHS = CGF.Builder.CreateIntCast(RHS, CastTy, IsSigned);
+
+  return CGF.Builder.CreateMul(LHS, RHS);
+}
+
 static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op, 
                               llvm::Type *DstTy) {
   unsigned NumberOfElements = DstTy->getVectorNumElements();
@@ -8968,6 +8988,16 @@
   case X86::BI__builtin_ia32_pminuq512_mask:
     return EmitX86MinMax(*this, ICmpInst::ICMP_ULT, Ops);
 
+  case X86::BI__builtin_ia32_pmuludq128:
+  case X86::BI__builtin_ia32_pmuludq256:
+  case X86::BI__builtin_ia32_pmuludq512:
+    return EmitX86Muldq(*this, /*IsSigned*/false, Ops);
+
+  case X86::BI__builtin_ia32_pmuldq128:
+  case X86::BI__builtin_ia32_pmuldq256:
+  case X86::BI__builtin_ia32_pmuldq512:
+    return EmitX86Muldq(*this, /*IsSigned*/true, Ops);
+
   // 3DNow!
   case X86::BI__builtin_ia32_pswapdsf:
   case X86::BI__builtin_ia32_pswapdsi: {
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to