Hi, For complex scalar intrinsics like _mm_mask_fcmadd_sch, the mask should be ANDed with 1 to ensure that only its lowest bit is applied.
Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde. Ok for master? gcc/ChangeLog: PR target/104978 * config/i386/sse.md (avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>): Generate mask & 1 before move to dest under TARGET_AVX512VL. (avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>): Likewise. gcc/testsuite/ChangeLog: PR target/104978 * gcc.target/i386/pr104978.c: New test. --- gcc/config/i386/sse.md | 16 ++++++++++------ gcc/testsuite/gcc.target/i386/pr104978.c | 18 ++++++++++++++++++ 2 files changed, 28 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr104978.c diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index ed98120be59..cc4c5542ee6 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -6576,7 +6576,7 @@ (define_expand "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>" (match_operand:QI 4 "register_operand")] "TARGET_AVX512FP16 && <round_mode512bit_condition>" { - rtx op0, op1; + rtx op0, op1, mask; if (<round_embedded_complex>) emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask<round_expand_name> ( @@ -6590,11 +6590,13 @@ (define_expand "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>" { op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode); op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); - emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4])); + mask = gen_reg_rtx (QImode); + emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1))); + emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask)); } else { - rtx mask, tmp, vec_mask; + rtx tmp, vec_mask; mask = lowpart_subreg (SImode, operands[4], QImode), tmp = gen_reg_rtx (SImode); emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31))); @@ -6631,7 +6633,7 @@ (define_expand "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>" (match_operand:QI 4 "register_operand")] "TARGET_AVX512FP16 && <round_mode512bit_condition>" { - rtx op0, op1; + rtx op0, op1, mask; if (<round_embedded_complex>) emit_insn 
(gen_avx512fp16_fcmaddcsh_v8hf_mask<round_expand_name> ( @@ -6645,11 +6647,13 @@ (define_expand "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>" { op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode); op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); - emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4])); + mask = gen_reg_rtx (QImode); + emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1))); + emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask)); } else { - rtx mask, tmp, vec_mask; + rtx tmp, vec_mask; mask = lowpart_subreg (SImode, operands[4], QImode), tmp = gen_reg_rtx (SImode); emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31))); diff --git a/gcc/testsuite/gcc.target/i386/pr104978.c b/gcc/testsuite/gcc.target/i386/pr104978.c new file mode 100644 index 00000000000..fd22a6c3f43 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr104978.c @@ -0,0 +1,18 @@ +/* PR target/104978 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */ +/* { dg-final { scan-assembler-times "and\[^\\n\\r\]*\\\$1" 2 } } */ + +#include<immintrin.h> + +__m128h +foo (__m128h a, __m128h b, __m128h c, __mmask8 m) +{ + return _mm_mask_fmadd_round_sch (a, m, b, c, 8); +} + +__m128h +foo2 (__m128h a, __m128h b, __m128h c, __mmask8 m) +{ + return _mm_mask_fcmadd_round_sch (a, m, b, c, 8); +} -- 2.18.1