On Fri, 2022-05-13 at 12:13 -0400, Michael Meissner wrote: > Add zero_extendditi2. Improve lxvr*x code generation. >
Content here matches what I commented on in the prior email with subject "Delay splitting addti3...". > This pattern adds zero_extendditi2 so that if we are extending DImode > that > is in a GPR register to TImode in a vector register, the compiler can > generate MTVSRDDD. > > In addition the patterns for generating lxvr{b,h,w,d}x were tuned to > allow > loading to gpr registers. This prevents needlessly doing direct > moves to > get the value into the vector registers if the gpr register was > already > selected. > > In updating the insn counts for two tests due to these changes, I > noticed > the tests were done at -O0. I changed this so that the tests are now > done > at the normal -O2 optimization level. > > This patch will be needed for an upcoming patch for PR target/103109. > > I have built this patch on little endian power10, little endian > power9, > and big endian power8 systems. There were no regressions with this > patch. Can I install this on the GCC 13 trunk? > > 2022-05-013 Michael Meissner <meiss...@linux.ibm.com> > > gcc/ > * config/rs6000/vsx.md (vsx_lxvr<wd>x): Add support for loading > to > GPR registers. > (vsx_stxvr<wd>x): Add support for storing from GPR registers. > (zero_extendditi2): New insn. > > gcc/testsuite/ > * gcc.target/powerpc/vsx-load-element-extend-int.c: Use -O2 > instead of -O0 and update insn counts. > * gcc.target/powerpc/vsx-load-element-extend-short.c: Likewise. > * gcc.target/powerpc/zero-extend-di-ti.c: New test. > --- > gcc/config/rs6000/vsx.md | 82 > +++++++++++++++++-- > .../powerpc/vsx-load-element-extend-int.c | 36 ++++---- > .../powerpc/vsx-load-element-extend-short.c | 35 ++++---- > .../gcc.target/powerpc/zero-extend-di-ti.c | 62 ++++++++++++++ > 4 files changed, 164 insertions(+), 51 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/powerpc/zero-extend-di- > ti.c > > diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md > index c091e5e2f47..ad971e3a1de 100644 > --- a/gcc/config/rs6000/vsx.md > +++ b/gcc/config/rs6000/vsx.md > @@ -1315,14 +1315,32 @@ (define_expand "vsx_store_<mode>" > } > }) > > -;; Load rightmost element from load_data > -;; using lxvrbx, lxvrhx, lxvrwx, lxvrdx. > -(define_insn "vsx_lxvr<wd>x" > - [(set (match_operand:TI 0 "vsx_register_operand" "=wa") > - (zero_extend:TI (match_operand:INT_ISA3 1 "memory_operand" > "Z")))] > - "TARGET_POWER10" > - "lxvr<wd>x %x0,%y1" > - [(set_attr "type" "vecload")]) > +;; Load rightmost element from load_data using lxvrbx, lxvrhx, > lxvrwx, lxvrdx. > +;; Support TImode being in a GPR register to prevent generating > lvxr{d,w,b}x > +;; and then two direct moves if we ultimately need the value in a > GPR register. > +(define_insn_and_split "vsx_lxvr<wd>x" > + [(set (match_operand:TI 0 "register_operand" "=r,wa") > + (zero_extend:TI (match_operand:INT_ISA3 1 "memory_operand" > "m,Z")))] > + "TARGET_POWERPC64 && TARGET_POWER10" > + "@ > + # > + lxvr<wd>x %x0,%y1" > + "&& reload_completed && int_reg_operand (operands[0], TImode)" > + [(set (match_dup 2) (match_dup 3)) > + (set (match_dup 4) (const_int 0))] > +{ > + rtx op0 = operands[0]; > + rtx op1 = operands[1]; > + > + operands[2] = gen_lowpart (DImode, op0); > + operands[3] = (<MODE>mode == DImode > + ? op1 > + : gen_rtx_ZERO_EXTEND (DImode, op1)); > + > + operands[4] = gen_highpart (DImode, op0); > +} > + [(set_attr "type" "load,vecload") > + (set_attr "num_insns" "2,*")]) > > ;; Store rightmost element into store_data > ;; using stxvrbx, stxvrhx, strvxwx, strvxdx. > @@ -5019,6 +5037,54 @@ (define_expand "vsignextend_si_v2di" > DONE; > }) > > +;; Zero extend DI to TI. If we don't have the MTVSRDD instruction > (and LXVRDX > +;; in the case of power10), we use the machine independent code. If > we are > +;; loading up GPRs, we fall back to the old code. > +(define_insn_and_split "zero_extendditi2" > + [(set (match_operand:TI 0 > "register_operand" "=r,r, wa,&wa") > + (zero_extend:TI (match_operand:DI 1 > "register_operand" "r,wa,r, wa")))] > + "TARGET_POWERPC64 && TARGET_P9_VECTOR" > + "@ > + # > + # > + mtvsrdd %x0,0,%1 > + #" > + "&& reload_completed > + && (int_reg_operand (operands[0], TImode) > + || vsx_register_operand (operands[1], DImode))" > + [(pc)] > +{ > + rtx dest = operands[0]; > + rtx src = operands[1]; > + int dest_regno = reg_or_subregno (dest); > + > + /* Handle conversion to GPR registers. Load up the low part and > then do > + zero out the upper part. */ > + if (INT_REGNO_P (dest_regno)) > + { > + rtx dest_hi = gen_highpart (DImode, dest); > + rtx dest_lo = gen_lowpart (DImode, dest); > + > + emit_move_insn (dest_lo, src); > + emit_move_insn (dest_hi, const0_rtx); > + DONE; > + } > + > + /* For settomg a VSX register from another VSX register, clear the > result > + register, and use XXPERMDI to shift the value into the lower > 64-bits. */ > + rtx dest_v2di = gen_rtx_REG (V2DImode, dest_regno); > + rtx dest_di = gen_rtx_REG (DImode, dest_regno); > + > + emit_move_insn (dest_v2di, CONST0_RTX (V2DImode)); > + if (BYTES_BIG_ENDIAN) > + emit_insn (gen_vsx_concat_v2di (dest_v2di, dest_di, src)); > + else > + emit_insn (gen_vsx_concat_v2di (dest_v2di, src, dest_di)); > + DONE; > +} > + [(set_attr "type" "integer,mfvsr,vecmove,vecperm") > + (set_attr "length" "8, 8, *, 8")]) > + > ;; Sign extend DI to TI. We provide both GPR targets and Altivec > targets on > ;; power10. On earlier systems, the machine independent code will > generate a > ;; shift left to sign extend the 64-bit value to 128-bit. > diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element- > extend-int.c b/gcc/testsuite/gcc.target/powerpc/vsx-load-element- > extend-int.c > index c40e1a3a0f7..1f1281d6b75 100644 > --- a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c > +++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c > @@ -6,33 +6,25 @@ > /* { dg-do compile { target { ! power10_hw } } } */ > /* { dg-require-effective-target power10_ok } */ > /* { dg-require-effective-target int128 } */ > - > -/* Deliberately set optization to zero for this test to confirm > - the lxvr*x instruction is generated. At higher optimization > levels > - the instruction we are looking for is sometimes replaced by other > - load instructions. */ > -/* { dg-options "-mdejagnu-cpu=power10 -O0 -save-temps" } */ > - > +/* { dg-options "-mdejagnu-cpu=power10 -O2 -save-temps" } */ > /* { dg-final { scan-assembler-times {\mlxvrwx\M} 2 } } */ > > #define NUM_VEC_ELEMS 4 > #define ITERS 16 > > -/* > -Codegen at time of writing is a single lxvrwx for the zero > -extended test, and a lwax,mtvsrdd,vextsd2q for the sign > -extended test. > - > -0000000010000c90 <test_sign_extended_load>: > - 10000c90: aa 1a 24 7d lwax r9,r4,r3 > - 10000c94: 67 4b 40 7c mtvsrdd vs34,0,r9 > - 10000c98: 02 16 5b 10 vextsd2q v2,v2 > - 10000c9c: 20 00 80 4e blr > - > -0000000010000cb0 <test_zero_extended_unsigned_load>: > - 10000cb0: 9b 18 44 7c lxvrwx vs34,r4,r3 > - 10000cb4: 20 00 80 4e blr > -*/ > +/* Codegen at time of writing is a single lxvrwx for the zero > extended test, > + and a lxvrwx + vexts* sign extension instructions for the sign > extended > + test. > + > + 0000000000000000 <test_sign_extended_load>: > + 0: 9b 18 44 7c lxvrwx vs34,r4,r3 > + 4: 02 16 5a 10 vextsw2d v2,v2 > + 8: 02 16 5b 10 vextsd2q v2,v2 > + c: 20 00 80 4e blr > + > + 0000000000000020 <test_zero_extended_unsigned_load>: > + 20: 9b 18 44 7c lxvrwx vs34,r4,r3 > + 24: 20 00 80 4e blr */ > > #include <altivec.h> > #include <stdio.h> > diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element- > extend-short.c b/gcc/testsuite/gcc.target/powerpc/vsx-load-element- > extend-short.c > index 837ba79c9ab..a7721318812 100644 > --- a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend- > short.c > +++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend- > short.c > @@ -6,33 +6,26 @@ > /* { dg-do compile { target { ! power10_hw } } } */ > /* { dg-require-effective-target power10_ok } */ > /* { dg-require-effective-target int128 } */ > - > -/* Deliberately set optization to zero for this test to confirm > - the lxvr*x instruction is generated. At higher optimization > levels > - the instruction we are looking for is sometimes replaced by other > - load instructions. */ > -/* { dg-options "-mdejagnu-cpu=power10 -O0 -save-temps" } */ > +/* { dg-options "-mdejagnu-cpu=power10 -O2 -save-temps" } */ > > /* { dg-final { scan-assembler-times {\mlxvrhx\M} 2 } } */ > > #define NUM_VEC_ELEMS 8 > #define ITERS 16 > > -/* > -Codegen at time of writing uses lxvrhx for the zero > -extension test and lhax,mtvsrdd,vextsd2q for the > -sign extended test. > - > -0000000010001810 <test_sign_extended_load>: > - 10001810: ae 1a 24 7d lhax r9,r4,r3 > - 10001814: 67 4b 40 7c mtvsrdd vs34,0,r9 > - 10001818: 02 16 5b 10 vextsd2q v2,v2 > - 1000181c: 20 00 80 4e blr > - > -0000000010001830 <test_zero_extended_unsigned_load>: > - 10001830: 5b 18 44 7c lxvrhx vs34,r4,r3 > - 10001834: 20 00 80 4e blr > -*/ > +/* Codegen at time of writing is a single lxvrwx for the zero > extended test, > + and a lxvrwx + vexts* sign extension instructions for the sign > extended > + test. > + > + 0000000000000000 <test_sign_extended_load>: > + 0: 5b 18 44 7c lxvrhx vs34,r4,r3 > + 4: 02 16 59 10 vextsh2d v2,v2 > + 8: 02 16 5b 10 vextsd2q v2,v2 > + c: 20 00 80 4e blr > + > + 0000000000000020 <test_zero_extended_unsigned_load>: > + 20: 5b 18 44 7c lxvrhx vs34,r4,r3 > + 24: 20 00 80 4e blr */ > > #include <altivec.h> > #include <stdio.h> > diff --git a/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c > b/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c > new file mode 100644 > index 00000000000..9b3b9c4dbd0 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c > @@ -0,0 +1,62 @@ > +/* { dg-require-effective-target int128 } */ > +/* { dg-require-effective-target power10_ok } */ > +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ > + > +/* This patch makes sure the various optimization and code paths are > done for > + zero extending DImode to TImode on power10. */ > + > +__uint128_t > +gpr_to_gpr (unsigned long long a) > +{ > + /* li 4,0. */ > + return a; > +} > + > +__uint128_t > +mem_to_gpr (unsigned long long *p) > +{ > + /* ld 3,0(3); li 4,0. */ > + return *p; > +} > + > +__uint128_t > +vsx_to_gpr (__uint128_t *p, double d) > +{ > + /* fctiduz 1,1; li 4,0;mfvsrd 3,1. */ > + return (unsigned long long)d; > +} > + > +void > +gpr_to_vsx (__uint128_t *p, unsigned long long a) > +{ > + /* mtvsrdd 0,0,4; stxv 0,0(3). */ > + __uint128_t b = a; > + __asm__ (" # %x0" : "+wa" (b)); > + *p = b; > +} > + > +void > +mem_to_vsx (__uint128_t *p, unsigned long long *q) > +{ > + /* lxvrdx 0,0,4; stxv 0,0(3). */ > + __uint128_t a = *q; > + __asm__ (" # %x0" : "+wa" (a)); > + *p = a; > +} > + > +void > +vsx_to_vsx (__uint128_t *p, double d) > +{ > + /* fctiduz 1,1; xxspltib 0,0; xxpermdi 0,0,1,0; stxv 0,0(3). */ > + __uint128_t a = (unsigned long long)d; > + __asm__ (" # %x0" : "+wa" (a)); > + *p = a; > +} > + > +/* { dg-final { scan-assembler-times {\mli\M} 3 } } */ > +/* { dg-final { scan-assembler-times {\mld\M} 1 } } */ > +/* { dg-final { scan-assembler-times {\mlxvrdx\M} 1 } } */ > +/* { dg-final { scan-assembler-times {\mmfvsrd\M} 1 } } */ > +/* { dg-final { scan-assembler-times {\mmtvsrdd\M} 1 } } */ > +/* { dg-final { scan-assembler-times {\mstxv\M} 3 } } */ > +/* { dg-final { scan-assembler-times {\mxxpermdi\M} 1 } } */ > -- > 2.35.3 > >