On Mon, May 9, 2022 at 4:28 PM Uros Bizjak <ubiz...@gmail.com> wrote: > > On Mon, May 9, 2022 at 4:03 AM liuhongt <hongtao....@intel.com> wrote: > > > > Similarly optimize movl + vmovq to vmovd. > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > > Ok for trunk? > > > > gcc/ChangeLog: > > > > PR target/104915 > > * config/i386/sse.md (*vec_set<mode>_0_zero_extendhi): New > > pre_reload define_insn_and_split. > > (*vec_setv2di_0_zero_extendhi_1): Ditto. > > (*vec_set<mode>_0_zero_extendsi): Ditto. > > (*vec_setv2di_0_zero_extendsi_1): Ditto. > > (ssewvecmode): New mode attr. > > (ssewvecmodelower): Ditto. > > (ssepackmodelower): Ditto. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/i386/pr104915-vmovd.c: New test. > > * gcc.target/i386/pr104915-vmovw.c: New test. > > I wonder if these define_insn_and_splits can instead be implemented > via combine splitter (which has the unfortunate limitation that the > output sequence has to be exactly two instructions, which is true in > your case). Combine splitter is preferred, since it splits immediately > and the resulting insns can be combined further during the combine > pass.
try_combine only reaches combine_split_insns when it is combining at least 3 insns (note the `i1 &&` check in the excerpt below); here we have just 2 insns, so the combine splitter is never tried. -----cut from combine.cc-------- 3545 /* If we were combining three insns and the result is a simple SET 3546 with no ASM_OPERANDS that wasn't recognized, try to split it into two 3547 insns. There are two ways to do this. It can be split using a 3548 machine-specific method (like when you have an addition of a large 3549 constant) or by combine in the function find_split_point. */ 3550 3551=>if (i1 && insn_code_number < 0 && GET_CODE (newpat) == SET 3552 && asm_noperands (newpat) < 0) -------cut end------------- > > Uros. > > > --- > > gcc/config/i386/sse.md | 94 +++++++++++++++++++ > > .../gcc.target/i386/pr104915-vmovd.c | 25 +++++ > > .../gcc.target/i386/pr104915-vmovw.c | 45 +++++++++ > > 3 files changed, 164 insertions(+) > > create mode 100644 gcc/testsuite/gcc.target/i386/pr104915-vmovd.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr104915-vmovw.c > > > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > > index 7b791def542..2ad8a2b46b8 100644 > > --- a/gcc/config/i386/sse.md > > +++ b/gcc/config/i386/sse.md > > @@ -985,6 +985,15 @@ (define_mode_attr sseintvecmode > > (V32HI "V32HI") (V64QI "V64QI") > > (V32QI "V32QI") (V16QI "V16QI")]) > > > > +;; Mapping of vector modes to an V*HImode of the same size > > +(define_mode_attr ssewvecmode > > + [(V8DI "V32HI") (V4DI "V16HI") (V2DI "V8HI") > > + (V16SI "V32HI") (V8SI "V16HI") (V4SI "V8HI")]) > > + > > +(define_mode_attr ssewvecmodelower > > + [(V8DI "v32hi") (V4DI "v16hi") (V2DI "v8hi") > > + (V16SI "v32hi") (V8SI "v16hi") (V4SI "v8hi")]) > > + > > (define_mode_attr sseintvecmode2 > > [(V8DF "XI") (V4DF "OI") (V2DF "TI") > > (V8SF "OI") (V4SF "TI") > > @@ -1194,6 +1203,11 @@ (define_mode_attr ssepackmode > > (V16HI "V32QI") (V8SI "V16HI") (V4DI "V8SI") > > (V32HI "V64QI") (V16SI "V32HI") (V8DI "V16SI")]) > > > > +(define_mode_attr ssepackmodelower > > + [(V8HI "v16qi") (V4SI 
"v8hi") (V2DI "v4si") > > + (V16HI "v32qi") (V8SI "v16hi") (V4DI "v8si") > > + (V32HI "v64qi") (V16SI "v32hi") (V8DI "v16si")]) > > + > > ;; Mapping of the max integer size for xop rotate immediate constraint > > (define_mode_attr sserotatemax > > [(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")]) > > @@ -10681,6 +10695,46 @@ (define_insn "vec_set<mode>_0" > > (set_attr "prefix" "evex") > > (set_attr "mode" "HF")]) > > > > +(define_insn_and_split "*vec_set<mode>_0_zero_extendhi" > > + [(set (match_operand:VI48_AVX512F 0 "register_operand") > > + (vec_merge:VI48_AVX512F > > + (vec_duplicate:VI48_AVX512F > > + (zero_extend:<ssescalarmode> > > + (match_operand:HI 1 "nonimmediate_operand"))) > > + (match_operand:VI48_AVX512F 2 "const0_operand") > > + (const_int 1)))] > > + "TARGET_AVX512FP16 && ix86_pre_reload_split ()" > > + "#" > > + "&& 1" > > + [(const_int 0)] > > +{ > > + rtx dest = gen_reg_rtx (<ssewvecmode>mode); > > + emit_insn (gen_vec_set<ssewvecmodelower>_0 (dest, > > + CONST0_RTX > > (<ssewvecmode>mode), > > + operands[1])); > > + emit_move_insn (operands[0], > > + lowpart_subreg (<MODE>mode, dest, <ssewvecmode>mode)); > > + DONE; > > +}) > > + > > +(define_insn_and_split "*vec_setv2di_0_zero_extendhi_1" > > + [(set (match_operand:V2DI 0 "register_operand") > > + (vec_concat:V2DI > > + (zero_extend:DI > > + (match_operand:HI 1 "nonimmediate_operand")) > > + (const_int 0)))] > > + "TARGET_AVX512FP16 && ix86_pre_reload_split ()" > > + "#" > > + "&& 1" > > + [(const_int 0)] > > +{ > > + rtx dest = gen_reg_rtx (V8HImode); > > + emit_insn (gen_vec_setv8hi_0 (dest, CONST0_RTX (V8HImode), operands[1])); > > + emit_move_insn (operands[0], > > + lowpart_subreg (V2DImode, dest, V8HImode)); > > + DONE; > > +}) > > + > > (define_insn "avx512fp16_movsh" > > [(set (match_operand:V8HF 0 "register_operand" "=v") > > (vec_merge:V8HF > > @@ -10750,6 +10804,46 @@ (define_insn "vec_set<mode>_0" > > ] > > (symbol_ref "true")))]) > > > > +(define_insn_and_split 
"*vec_set<mode>_0_zero_extendsi" > > + [(set (match_operand:VI8 0 "register_operand") > > + (vec_merge:VI8 > > + (vec_duplicate:VI8 > > + (zero_extend:DI > > + (match_operand:SI 1 "nonimmediate_operand"))) > > + (match_operand:VI8 2 "const0_operand") > > + (const_int 1)))] > > + "TARGET_SSE2 && ix86_pre_reload_split ()" > > + "#" > > + "&& 1" > > + [(const_int 0)] > > +{ > > + rtx dest = gen_reg_rtx (<ssepackmode>mode); > > + emit_insn (gen_vec_set<ssepackmodelower>_0 (dest, > > + CONST0_RTX > > (<ssepackmode>mode), > > + operands[1])); > > + emit_move_insn (operands[0], > > + lowpart_subreg (<MODE>mode, dest, <ssepackmode>mode)); > > + DONE; > > +}) > > + > > +(define_insn_and_split "*vec_setv2di_0_zero_extendsi_1" > > + [(set (match_operand:V2DI 0 "register_operand") > > + (vec_concat:V2DI > > + (zero_extend:DI > > + (match_operand:SI 1 "nonimmediate_operand")) > > + (const_int 0)))] > > + "TARGET_SSE2 && ix86_pre_reload_split ()" > > + "#" > > + "&& 1" > > + [(const_int 0)] > > +{ > > + rtx dest = gen_reg_rtx (V4SImode); > > + emit_insn (gen_vec_setv4si_0 (dest, CONST0_RTX (V4SImode), operands[1])); > > + emit_move_insn (operands[0], > > + lowpart_subreg (V2DImode, dest, V4SImode)); > > + DONE; > > +}) > > + > > (define_insn "sse4_1_insertps" > > [(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v") > > (unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" > > "Yrm,*xm,vm") > > diff --git a/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c > > b/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c > > new file mode 100644 > > index 00000000000..913ff8806f1 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c > > @@ -0,0 +1,25 @@ > > +/* { dg-do compile { target { ! 
ia32 } } } */ > > +/* { dg-options "-mavx512f -O2" } */ > > +/* { dg-final { scan-assembler-times {(?n)vmovd[ \t]+} 3 } } */ > > +/* { dg-final { scan-assembler-not {(?n)movq[ \t]+} } } */ > > + > > +#include<immintrin.h> > > + > > +__m128i > > +foo1 (int* p) > > +{ > > + return _mm_set_epi64x (0, (unsigned int) ((*(__m32_u *)p)[0])); > > +} > > + > > +__m256i > > +foo3 (int* p) > > +{ > > + return _mm256_set_epi64x (0, 0, 0, (unsigned int) ((*(__m32_u *)p)[0])); > > +} > > + > > +__m512i > > +foo5 (int* p) > > +{ > > + return _mm512_set_epi64 (0, 0, 0, 0, 0, 0, 0, > > + (unsigned int) ((*(__m32_u *)p)[0])); > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c > > b/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c > > new file mode 100644 > > index 00000000000..ac47865d17a > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c > > @@ -0,0 +1,45 @@ > > +/* { dg-do compile { target { ! ia32 } } } */ > > +/* { dg-options "-mavx512fp16 -O2" } */ > > +/* { dg-final { scan-assembler-times {(?n)vmovw[ \t]+} 6 } } */ > > +/* { dg-final { scan-assembler-not {(?n)mov[dq][ \t]+} } } */ > > + > > +#include<immintrin.h> > > +__m128i > > +foo (short* p) > > +{ > > + return _mm_set_epi32 (0, 0, 0, (unsigned short) ((*(__m16_u *)p)[0])); > > +} > > + > > +__m128i > > +foo1 (short* p) > > +{ > > + return _mm_set_epi64x (0, (unsigned short) ((*(__m16_u *)p)[0])); > > +} > > + > > +__m256i > > +foo2 (short* p) > > +{ > > + return _mm256_set_epi32 (0, 0, 0, 0, 0, 0, 0, > > + (unsigned short) ((*(__m16_u *)p)[0])); > > +} > > + > > +__m256i > > +foo3 (short* p) > > +{ > > + return _mm256_set_epi64x (0, 0, 0, (unsigned short) ((*(__m16_u > > *)p)[0])); > > +} > > + > > +__m512i > > +foo4 (short* p) > > +{ > > + return _mm512_set_epi32 (0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, > > + (unsigned short) ((*(__m16_u *)p)[0])); > > +} > > + > > +__m512i > > +foo5 (short* p) > > +{ > > + return _mm512_set_epi64 (0, 0, 0, 0, 0, 0, 0, > > + (unsigned 
short) ((*(__m16_u *)p)[0])); > > +} > > -- > > 2.18.1 > > -- BR, Hongtao