Re: [PATCH] AVX512FP16: Fix wrong code for _mm_mask_f[c]madd.*sch [PR 104978]

Hongtao Liu via Gcc-patches Sun, 20 Mar 2022 18:08:23 -0700

On Sat, Mar 19, 2022 at 8:09 AM Hongyu Wang via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi,
>
> For complex scalar intrinsics like _mm_mask_fcmadd_sch, the
> mask should be ANDed with 1 to ensure the mask is bound to the lowest byte.
>
> Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.
>
> Ok for master?
>
> gcc/ChangeLog:
>
>         PR target/104978
>         * config/i386/sse.md
>         (avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>):
>         Generate mask & 1 before move to dest under TARGET_AVX512VL.
>         (avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>): Likewise.
>
> gcc/testsuite/ChangeLog:
>
>         PR target/104978
>         * gcc.target/i386/pr104978.c: New test.
> ---
>  gcc/config/i386/sse.md                   | 16 ++++++++++------
>  gcc/testsuite/gcc.target/i386/pr104978.c | 18 ++++++++++++++++++
>  2 files changed, 28 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr104978.c
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index ed98120be59..cc4c5542ee6 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -6576,7 +6576,7 @@ (define_expand "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>"
>     (match_operand:QI 4 "register_operand")]
>    "TARGET_AVX512FP16 && <round_mode512bit_condition>"
>  {
> -  rtx op0, op1;
> +  rtx op0, op1, mask;
>
>    if (<round_embedded_complex>)
>      emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask<round_expand_name> (
> @@ -6590,11 +6590,13 @@ (define_expand "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>"
>    {
>      op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
>      op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> -    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
> +    mask = gen_reg_rtx (QImode);
> +    emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1)));
> +    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask));
>    }
>    else
>    {
> -    rtx mask, tmp, vec_mask;
> +    rtx tmp, vec_mask;
>      mask = lowpart_subreg (SImode, operands[4], QImode),
>      tmp = gen_reg_rtx (SImode);
>      emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
> @@ -6631,7 +6633,7 @@ (define_expand "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>"
>     (match_operand:QI 4 "register_operand")]
>    "TARGET_AVX512FP16 && <round_mode512bit_condition>"
>  {
> -  rtx op0, op1;
> +  rtx op0, op1, mask;
>
>    if (<round_embedded_complex>)
>      emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask<round_expand_name> (
> @@ -6645,11 +6647,13 @@ (define_expand "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>"
>    {
>      op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
>      op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> -    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
> +    mask = gen_reg_rtx (QImode);
> +    emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1)));
> +    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask));
Would it be better to use vmovss under avx512vl, without the & 1 for the mask?
>    }
>    else
>    {
> -    rtx mask, tmp, vec_mask;
> +    rtx tmp, vec_mask;
>      mask = lowpart_subreg (SImode, operands[4], QImode),
>      tmp = gen_reg_rtx (SImode);
>      emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
> diff --git a/gcc/testsuite/gcc.target/i386/pr104978.c b/gcc/testsuite/gcc.target/i386/pr104978.c
> new file mode 100644
> index 00000000000..fd22a6c3f43
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr104978.c
> @@ -0,0 +1,18 @@
> +/* PR target/104978 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
> +/* { dg-final { scan-assembler-times "and\[^\\n\\r\]*\\\$1" 2 } } */
> +
> +#include<immintrin.h>
> +
> +__m128h
> +foo (__m128h a, __m128h b, __m128h c, __mmask8 m)
> +{
> +  return _mm_mask_fmadd_round_sch (a, m, b, c, 8);
> +}
> +
> +__m128h
> +foo2 (__m128h a, __m128h b, __m128h c, __mmask8 m)
> +{
> +  return _mm_mask_fcmadd_round_sch (a, m, b, c, 8);
> +}
> --
> 2.18.1
>



-- 
BR,
Hongtao

Re: [PATCH] AVX512FP16: Fix wrong code for _mm_mask_f[c]madd.*sch [PR 104978]

Reply via email to