Hi,

For complex scalar intrinsics like _mm_mask_fcmadd_sch, the
mask should be ANDed with 1 to ensure that only the lowest bit of the
mask is used.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.

Ok for master?

gcc/ChangeLog:

        PR target/104978
        * config/i386/sse.md
        (avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>):
        Generate mask & 1 before moving to dest under TARGET_AVX512VL.
        (avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>): Likewise.

gcc/testsuite/ChangeLog:

        PR target/104978
        * gcc.target/i386/pr104978.c: New test.
---
 gcc/config/i386/sse.md                   | 16 ++++++++++------
 gcc/testsuite/gcc.target/i386/pr104978.c | 18 ++++++++++++++++++
 2 files changed, 28 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr104978.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index ed98120be59..cc4c5542ee6 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -6576,7 +6576,7 @@ (define_expand "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>"
    (match_operand:QI 4 "register_operand")]
   "TARGET_AVX512FP16 && <round_mode512bit_condition>"
 {
-  rtx op0, op1;
+  rtx op0, op1, mask;
 
   if (<round_embedded_complex>)
     emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask<round_expand_name> (
@@ -6590,11 +6590,13 @@ (define_expand "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>"
   {
     op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
     op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
+    mask = gen_reg_rtx (QImode);
+    emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1)));
+    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask));
   }
   else
   {
-    rtx mask, tmp, vec_mask;
+    rtx tmp, vec_mask;
     mask = lowpart_subreg (SImode, operands[4], QImode),
     tmp = gen_reg_rtx (SImode);
     emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
@@ -6631,7 +6633,7 @@ (define_expand "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>"
    (match_operand:QI 4 "register_operand")]
   "TARGET_AVX512FP16 && <round_mode512bit_condition>"
 {
-  rtx op0, op1;
+  rtx op0, op1, mask;
 
   if (<round_embedded_complex>)
     emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask<round_expand_name> (
@@ -6645,11 +6647,13 @@ (define_expand "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>"
   {
     op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
     op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
+    mask = gen_reg_rtx (QImode);
+    emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1)));
+    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask));
   }
   else
   {
-    rtx mask, tmp, vec_mask;
+    rtx tmp, vec_mask;
     mask = lowpart_subreg (SImode, operands[4], QImode),
     tmp = gen_reg_rtx (SImode);
     emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
diff --git a/gcc/testsuite/gcc.target/i386/pr104978.c b/gcc/testsuite/gcc.target/i386/pr104978.c
new file mode 100644
index 00000000000..fd22a6c3f43
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr104978.c
@@ -0,0 +1,18 @@
+/* PR target/104978 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+/* { dg-final { scan-assembler-times "and\[^\\n\\r\]*\\\$1" 2 } } */
+
+#include<immintrin.h>
+
+__m128h
+foo (__m128h a, __m128h b, __m128h c, __mmask8 m)
+{ 
+  return _mm_mask_fmadd_round_sch (a, m, b, c, 8);
+}
+
+__m128h
+foo2 (__m128h a, __m128h b, __m128h c, __mmask8 m)
+{ 
+  return _mm_mask_fcmadd_round_sch (a, m, b, c, 8);
+}
-- 
2.18.1

Reply via email to