Committed after fixing the comments. https://gcc.gnu.org/g:a79cf858b39e01c80537bc5d47a5e9004418c267
Thanks Gui Haochen 在 2023/8/14 15:47, Kewen.Lin 写道: > Hi Haochen, > > on 2023/8/14 10:18, HAO CHEN GUI wrote: >> Hi, >> This patch modifies vsx extract expand and generates mfvsrwz/stxsiwx >> for all sub targets when the mode is V4SI and the extracted element is word >> 1 from BE order. Also this patch adds a insn pattern for mfvsrwz which >> helps eliminate redundant zero extend. >> >> Compared to last version, the main change is to put the word index >> checking in the split condition of "*vsx_extract_v4si_w023". Also modified >> some comments. >> https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625380.html >> >> Bootstrapped and tested on powerpc64-linux BE and LE with no regressions. >> >> Thanks >> Gui Haochen >> >> ChangeLog >> rs6000: Generate mfvsrwz for all platform and remove redundant zero extend >> >> mfvsrwz has lower latency than xxextractuw or vextuw[lr]x. So it should be >> generated even with p9 vector enabled. Also the instruction is already >> zero extended. A combine pattern is needed to eliminate redundant zero >> extend instructions. >> >> gcc/ >> PR target/106769 >> * config/rs6000/vsx.md (expand vsx_extract_<mode>): Set it only >> for V8HI and V16QI. >> (vsx_extract_v4si): New expand for V4SI extraction. >> (vsx_extract_v4si_w1): New insn pattern for V4SI extraction on >> word 1 from BE order. >> (*mfvsrwz): New insn pattern for mfvsrwz. >> (*vsx_extract_<mode>_di_p9): Assert that it won't be generated on >> word 1 from BE order. >> (*vsx_extract_si): Remove. >> (*vsx_extract_v4si_w023): New insn and split pattern on word 0, 2, >> 3 from BE order. >> >> gcc/testsuite/ >> PR target/106769 >> * gcc.target/powerpc/pr106769.h: New. >> * gcc.target/powerpc/pr106769-p8.c: New. >> * gcc.target/powerpc/pr106769-p9.c: New. >> >> patch.diff >> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md >> index 0a34ceebeb5..1cbdc2f1c01 100644 >> --- a/gcc/config/rs6000/vsx.md >> +++ b/gcc/config/rs6000/vsx.md >> @@ -3722,9 +3722,9 @@ (define_insn "vsx_xxpermdi2_<mode>_1" >> (define_expand "vsx_extract_<mode>" >> [(parallel [(set (match_operand:<VEC_base> 0 "gpc_reg_operand") >> (vec_select:<VEC_base> >> - (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand") >> + (match_operand:VSX_EXTRACT_I2 1 "gpc_reg_operand") >> (parallel [(match_operand:QI 2 "const_int_operand")]))) >> - (clobber (match_scratch:VSX_EXTRACT_I 3))])] >> + (clobber (match_scratch:VSX_EXTRACT_I2 3))])] >> "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_DIRECT_MOVE_64BIT" >> { >> /* If we have ISA 3.0, we can do a xxextractuw/vextractu{b,h}. */ >> @@ -3736,6 +3736,63 @@ (define_expand "vsx_extract_<mode>" >> } >> }) >> >> +(define_expand "vsx_extract_v4si" >> + [(parallel [(set (match_operand:SI 0 "gpc_reg_operand") >> + (vec_select:SI >> + (match_operand:V4SI 1 "gpc_reg_operand") >> + (parallel [(match_operand:QI 2 "const_0_to_3_operand")]))) >> + (clobber (match_scratch:V4SI 3))])] >> + "TARGET_DIRECT_MOVE_64BIT" >> +{ >> + /* The word 1 (BE order) can be extracted by mfvsrwz/stxsiwx. So just >> + fall through to vsx_extract_v4si_w1. */ >> + if (TARGET_P9_VECTOR >> + && INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2)) >> + { >> + emit_insn (gen_vsx_extract_v4si_p9 (operands[0], operands[1], >> + operands[2])); >> + DONE; >> + } >> +}) >> + >> +/* Extract from word 1 (BE order); */ > > Nit: I guessed I requested this before, please use ";" instead of > "/* ... */" for the comments, to align with the existing ones. > >> +(define_insn "vsx_extract_v4si_w1" >> + [(set (match_operand:SI 0 "nonimmediate_operand" "=r,wa,Z,wa") >> + (vec_select:SI >> + (match_operand:V4SI 1 "gpc_reg_operand" "v,v,v,0") >> + (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n,n")]))) >> + (clobber (match_scratch:V4SI 3 "=v,v,v,v"))] >> + "TARGET_DIRECT_MOVE_64BIT >> + && INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2)" >> +{ >> + if (which_alternative == 0) >> + return "mfvsrwz %0,%x1"; >> + >> + if (which_alternative == 1) >> + return "xxlor %x0,%x1,%x1"; >> + >> + if (which_alternative == 2) >> + return "stxsiwx %x1,%y0"; >> + >> + return ASM_COMMENT_START " vec_extract to same register"; >> +} >> + [(set_attr "type" "mfvsr,veclogical,fpstore,*") >> + (set_attr "length" "4,4,4,0") >> + (set_attr "isa" "p8v,*,p8v,*")]) >> + >> +(define_insn "*mfvsrwz" >> + [(set (match_operand:DI 0 "register_operand" "=r") >> + (zero_extend:DI >> + (vec_select:SI >> + (match_operand:V4SI 1 "vsx_register_operand" "wa") >> + (parallel [(match_operand:QI 2 "const_int_operand" "n")])))) >> + (clobber (match_scratch:V4SI 3 "=v"))] >> + "TARGET_DIRECT_MOVE_64BIT >> + && INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2)" >> + "mfvsrwz %0,%x1" >> + [(set_attr "type" "mfvsr") >> + (set_attr "isa" "p8v")]) >> + >> (define_insn "vsx_extract_<mode>_p9" >> [(set (match_operand:<VEC_base> 0 "gpc_reg_operand" "=r,<VSX_EX>") >> (vec_select:<VEC_base> >> @@ -3807,6 +3864,9 @@ (define_insn_and_split "*vsx_extract_<mode>_di_p9" >> (parallel [(match_dup 2)]))) >> (clobber (match_dup 3))])] >> { >> + gcc_assert (<MODE>mode != V4SImode >> + || INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2)); >> + >> operands[4] = gen_rtx_REG (<VEC_base>mode, REGNO (operands[0])); >> } >> [(set_attr "isa" "p9v,*")]) >> @@ -3830,58 +3890,42 @@ (define_insn_and_split "*vsx_extract_<mode>_store_p9" >> (set (match_dup 0) >> (match_dup 3))]) >> >> -(define_insn_and_split "*vsx_extract_si" >> +/* Extract from word 0, 2, 3 (BE order); */ > > Ditto. > > Okay for trunk with these nits fixed, thanks! > > BR, > Kewen > >> +(define_insn_and_split "*vsx_extract_v4si_w023" >> [(set (match_operand:SI 0 "nonimmediate_operand" "=r,wa,Z") >> (vec_select:SI >> (match_operand:V4SI 1 "gpc_reg_operand" "v,v,v") >> (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n")]))) >> (clobber (match_scratch:V4SI 3 "=v,v,v"))] >> - "VECTOR_MEM_VSX_P (V4SImode) && TARGET_DIRECT_MOVE_64BIT && >> !TARGET_P9_VECTOR" >> + "TARGET_DIRECT_MOVE_64BIT" >> "#" >> - "&& reload_completed" >> + "&& INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2)" >> [(const_int 0)] >> { >> + gcc_assert (!TARGET_P9_VECTOR); >> + >> rtx dest = operands[0]; >> rtx src = operands[1]; >> rtx element = operands[2]; >> - rtx vec_tmp = operands[3]; >> - int value; >> + rtx vec_tmp; >> + >> + if (GET_CODE (operands[3]) == SCRATCH) >> + vec_tmp = gen_reg_rtx (V4SImode); >> + else >> + vec_tmp = operands[3]; >> >> /* Adjust index for LE element ordering, the below minuend 3 is computed >> by >> GET_MODE_NUNITS (V4SImode) - 1. */ >> if (!BYTES_BIG_ENDIAN) >> element = GEN_INT (3 - INTVAL (element)); >> >> - /* If the value is in the correct position, we can avoid doing the >> VSPLT<x> >> - instruction. */ >> - value = INTVAL (element); >> - if (value != 1) >> - emit_insn (gen_altivec_vspltw_direct (vec_tmp, src, element)); >> - else >> - vec_tmp = src; >> + emit_insn (gen_altivec_vspltw_direct (vec_tmp, src, element)); >> >> - if (MEM_P (operands[0])) >> - { >> - if (can_create_pseudo_p ()) >> - dest = rs6000_force_indexed_or_indirect_mem (dest); >> - >> - if (TARGET_P8_VECTOR) >> - emit_move_insn (dest, gen_rtx_REG (SImode, REGNO (vec_tmp))); >> - else >> - emit_insn (gen_stfiwx (dest, gen_rtx_REG (DImode, REGNO (vec_tmp)))); >> - } >> - >> - else if (TARGET_P8_VECTOR) >> - emit_move_insn (dest, gen_rtx_REG (SImode, REGNO (vec_tmp))); >> - else >> - emit_move_insn (gen_rtx_REG (DImode, REGNO (dest)), >> - gen_rtx_REG (DImode, REGNO (vec_tmp))); >> + int value = BYTES_BIG_ENDIAN ? 1 : 2; >> + emit_insn (gen_vsx_extract_v4si_w1 (dest, vec_tmp, GEN_INT (value))); >> >> DONE; >> -} >> - [(set_attr "type" "mfvsr,vecperm,fpstore") >> - (set_attr "length" "8") >> - (set_attr "isa" "*,p8v,*")]) >> +}) >> >> (define_insn_and_split "*vsx_extract_<mode>_p8" >> [(set (match_operand:<VEC_base> 0 "nonimmediate_operand" "=r") >> diff --git a/gcc/testsuite/gcc.target/powerpc/pr106769-p8.c >> b/gcc/testsuite/gcc.target/powerpc/pr106769-p8.c >> new file mode 100644 >> index 00000000000..e7cdbc76298 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/powerpc/pr106769-p8.c >> @@ -0,0 +1,11 @@ >> +/* { dg-do compile } */ >> +/* { dg-skip-if "" { powerpc*-*-darwin* } } */ >> +/* { dg-require-effective-target powerpc_p8vector_ok } */ >> +/* { dg-options "-mdejagnu-cpu=power8 -O2" } */ >> +/* { dg-require-effective-target has_arch_ppc64 } */ >> + >> +#include "pr106769.h" >> + >> +/* { dg-final { scan-assembler {\mmfvsrwz\M} } } */ >> +/* { dg-final { scan-assembler {\mstxsiwx\M} } } */ >> +/* { dg-final { scan-assembler-not {\mrldicl\M} } } */ >> diff --git a/gcc/testsuite/gcc.target/powerpc/pr106769-p9.c >> b/gcc/testsuite/gcc.target/powerpc/pr106769-p9.c >> new file mode 100644 >> index 00000000000..2248b525d43 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/powerpc/pr106769-p9.c >> @@ -0,0 +1,13 @@ >> +/* { dg-do compile } */ >> +/* { dg-skip-if "" { powerpc*-*-darwin* } } */ >> +/* { dg-require-effective-target powerpc_p9vector_ok } */ >> +/* { dg-options "-mdejagnu-cpu=power9 -O2" } */ >> +/* { dg-require-effective-target has_arch_ppc64 } */ >> + >> +#include "pr106769.h" >> + >> +/* { dg-final { scan-assembler {\mmfvsrwz\M} } } */ >> +/* { dg-final { scan-assembler {\mstxsiwx\M} } } */ >> +/* { dg-final { scan-assembler-not {\mrldicl\M} } } */ >> +/* { dg-final { scan-assembler-not {\mxxextractuw\M} } } */ >> +/* { dg-final { scan-assembler-not {\mvextuw[rl]x\M} } } */ >> diff --git a/gcc/testsuite/gcc.target/powerpc/pr106769.h >> b/gcc/testsuite/gcc.target/powerpc/pr106769.h >> new file mode 100644 >> index 00000000000..1c8c8a024f3 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/powerpc/pr106769.h >> @@ -0,0 +1,17 @@ >> +#include <altivec.h> >> + >> +#ifdef __BIG_ENDIAN__ >> +#define LANE 1 >> +#else >> +#define LANE 2 >> +#endif >> + >> +unsigned int foo1 (vector unsigned int v) >> +{ >> + return vec_extract(v, LANE); >> +} >> + >> +void foo2 (vector unsigned int v, unsigned int* p) >> +{ >> + *p = vec_extract(v, LANE); >> +}