> > > Would it be better to use vmovss under avx512vl, without the & 1 for the mask? > > > > vmovss clears the upper bits, but the intrinsic requires src1. We > > still need either a mask move or a blend for the high part. > Not for __m128 _mm_mask_move_ss (__m128 src, __mmask8 k, __m128 a, __m128 b) > https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=vmovss&ig_expand=3807,3081,3082,3084,3083,4837,4838
Oh, if this works, the non-avx512vl part could also be adjusted. Will try this, thanks. Hongtao Liu <crazy...@gmail.com> 于2022年3月21日周一 09:48写道: > > On Mon, Mar 21, 2022 at 9:22 AM Hongyu Wang <wwwhhhyyy...@gmail.com> wrote: > > > > > Would it be better to use vmovss under avx512vl, without the & 1 for the mask? > > > > vmovss clears the upper bits, but the intrinsic requires src1. We > > still need either a mask move or a blend for the high part. > Not for __m128 _mm_mask_move_ss (__m128 src, __mmask8 k, __m128 a, __m128 b) > https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=vmovss&ig_expand=3807,3081,3082,3084,3083,4837,4838 > > > > LLVM generates mask & 1 for these intrinsics. > > > > Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org> 于2022年3月21日周一 09:08写道: > > > > > > On Sat, Mar 19, 2022 at 8:09 AM Hongyu Wang via Gcc-patches > > > <gcc-patches@gcc.gnu.org> wrote: > > > > > > > > Hi, > > > > > > > > For complex scalar intrinsics like _mm_mask_fcmadd_sch, the > > > > mask should be ANDed with 1 to ensure that only its lowest bit is used. > > > > > > > > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde. > > > > > > > > Ok for master? > > > > > > > > gcc/ChangeLog: > > > > > > > > PR target/104978 > > > > * config/i386/sse.md > > > > (avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>): > > > > Generate mask & 1 before the move to dest under TARGET_AVX512VL. > > > > (avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>): Likewise. > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > PR target/104978 > > > > * gcc.target/i386/pr104978.c: New test. 
> > > > --- > > > > gcc/config/i386/sse.md | 16 ++++++++++------ > > > > gcc/testsuite/gcc.target/i386/pr104978.c | 18 ++++++++++++++++++ > > > > 2 files changed, 28 insertions(+), 6 deletions(-) > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr104978.c > > > > > > > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > > > > index ed98120be59..cc4c5542ee6 100644 > > > > --- a/gcc/config/i386/sse.md > > > > +++ b/gcc/config/i386/sse.md > > > > @@ -6576,7 +6576,7 @@ (define_expand > > > > "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>" > > > > (match_operand:QI 4 "register_operand")] > > > > "TARGET_AVX512FP16 && <round_mode512bit_condition>" > > > > { > > > > - rtx op0, op1; > > > > + rtx op0, op1, mask; > > > > > > > > if (<round_embedded_complex>) > > > > emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask<round_expand_name> ( > > > > @@ -6590,11 +6590,13 @@ (define_expand > > > > "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>" > > > > { > > > > op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode); > > > > op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); > > > > - emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, > > > > operands[4])); > > > > + mask = gen_reg_rtx (QImode); > > > > + emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1))); > > > > + emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask)); > > > > } > > > > else > > > > { > > > > - rtx mask, tmp, vec_mask; > > > > + rtx tmp, vec_mask; > > > > mask = lowpart_subreg (SImode, operands[4], QImode), > > > > tmp = gen_reg_rtx (SImode); > > > > emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31))); > > > > @@ -6631,7 +6633,7 @@ (define_expand > > > > "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>" > > > > (match_operand:QI 4 "register_operand")] > > > > "TARGET_AVX512FP16 && <round_mode512bit_condition>" > > > > { > > > > - rtx op0, op1; > > > > + rtx op0, op1, mask; > > > > > > > > if (<round_embedded_complex>) > > > > emit_insn 
(gen_avx512fp16_fcmaddcsh_v8hf_mask<round_expand_name> ( > > > > @@ -6645,11 +6647,13 @@ (define_expand > > > > "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>" > > > > { > > > > op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode); > > > > op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); > > > > - emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, > > > > operands[4])); > > > > + mask = gen_reg_rtx (QImode); > > > > + emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1))); > > > > + emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask)); > > > Would it be better to use vmovss under avx512vl without & 1 for mask. > > > > } > > > > else > > > > { > > > > - rtx mask, tmp, vec_mask; > > > > + rtx tmp, vec_mask; > > > > mask = lowpart_subreg (SImode, operands[4], QImode), > > > > tmp = gen_reg_rtx (SImode); > > > > emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31))); > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr104978.c > > > > b/gcc/testsuite/gcc.target/i386/pr104978.c > > > > new file mode 100644 > > > > index 00000000000..fd22a6c3f43 > > > > --- /dev/null > > > > +++ b/gcc/testsuite/gcc.target/i386/pr104978.c > > > > @@ -0,0 +1,18 @@ > > > > +/* PR target/104978 */ > > > > +/* { dg-do compile } */ > > > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */ > > > > +/* { dg-final { scan-assembler-times "and\[^\\n\\r\]*\\\$1" 2 } } */ > > > > + > > > > +#include<immintrin.h> > > > > + > > > > +__m128h > > > > +foo (__m128h a, __m128h b, __m128h c, __mmask8 m) > > > > +{ > > > > + return _mm_mask_fmadd_round_sch (a, m, b, c, 8); > > > > +} > > > > + > > > > +__m128h > > > > +foo2 (__m128h a, __m128h b, __m128h c, __mmask8 m) > > > > +{ > > > > + return _mm_mask_fcmadd_round_sch (a, m, b, c, 8); > > > > +} > > > > -- > > > > 2.18.1 > > > > > > > > > > > > > -- > > > BR, > > > Hongtao > > > > -- > BR, > Hongtao