On 2020/7/23 04:30, Richard Sandiford wrote:
> 
> I now realise the reason is that the starting mode is too wide.
> I think we should fix that by doing:
> 
>    FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)
>      {
>        …
> 
> and then add:
> 
>        if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
>          continue;
> 
> after your optimisation, so that the shift code proper still only
> considers modes that are wide enough to hold the unshifted value.
> 
> I don't think there are any efficiency concerns with that, since
> smallest_int_mode_for_size does its own similar iteration internally.
> 
> Sorry for not picking up on that first time.
> 

Thanks:), I didn't make it clear that it starts from TImode first...

The updated patch uses "FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)"
to iterate from QImode now, but it still needs the size check
"new_mode <= store_mode" to get a legal subreg; otherwise
"gfortran.dg/shift-kind.f90" will fail.
Even so, the following execution failures remain when running the
regression tests:

gfortran.fortran-torture/execute/equiv_2.f90 execution,  -O2
gfortran.fortran-torture/execute/equiv_2.f90 execution,  -O2 -fbounds-check
gfortran.fortran-torture/execute/equiv_2.f90 execution,  -O2 -fomit-frame-pointer -finline-functions
gfortran.fortran-torture/execute/equiv_2.f90 execution,  -O2 -fomit-frame-pointer -finline-functions -funroll-loops
gfortran.fortran-torture/execute/equiv_2.f90 execution,  -O3 -g
gfortran.fortran-torture/execute/equiv_2.f90 execution,  -O2 -ftree-vectorize -maltivec

Does anyone have a clue why the execution outputs "STOP 2"?  I am still
investigating.

PS: if the order of GET_MODE_BITSIZE (new_mode) and shift in multiple_p is
switched, it generates incorrect ASM for PR71309.


This patch could optimize (works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

The final ASM will be as below, without a partial load after a full store
(stxv+ld):
  ld 10,16(3)
  mr 9,3
  ld 3,24(3)
  std 10,0(9)
  std 3,8(9)
  blr

It can achieve a ~25% performance improvement for typical cases on
Power9.  Bootstrapped and regression tested on Power9-LE.

For AArch64, one ldr is replaced by mov with this patch:

ldp     x2, x3, [x0, 16]
stp     x2, x3, [x0]
ldr     x0, [x0, 8]

=>

mov     x1, x0
ldp     x2, x0, [x0, 16]
stp     x2, x0, [x1]

gcc/ChangeLog:

2020-07-23  Xionghu Luo  <luo...@linux.ibm.com>

        PR rtl-optimization/71309
        * dse.c (find_shift_sequence): Use a subreg of the stored register
        to get the shifted high part, avoiding a load from the address.

gcc/testsuite/ChangeLog:

2020-07-23  Xionghu Luo  <luo...@linux.ibm.com>

        PR rtl-optimization/71309
        * gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c                                  | 22 +++++++++++++--
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++++++++++++++++++++++
 2 files changed, 53 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..afc6ad30623 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1728,8 +1728,7 @@ find_shift_sequence (poly_int64 access_size,
      the machine.  */
 
   opt_scalar_int_mode new_mode_iter;
-  FOR_EACH_MODE_FROM (new_mode_iter,
-                     smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
+  FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)
     {
       rtx target, new_reg, new_lhs;
       rtx_insn *shift_seq, *insn;
@@ -1779,6 +1778,25 @@ find_shift_sequence (poly_int64 access_size,
          && !targetm.modes_tieable_p (new_mode, store_mode))
        continue;
 
+      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
+         && known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)))
+       {
+         /* Try to implement the shift using a subreg.  */
+         poly_int64 offset
+           = subreg_offset_from_lsb (new_mode, store_mode, shift);
+         rtx rhs_subreg
+           = simplify_gen_subreg (new_mode, store_info->rhs, store_mode, 
offset);
+         if (rhs_subreg)
+           {
+             read_reg
+               = extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg));
+             break;
+           }
+       }
+
+      if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+       continue;
+
       new_reg = gen_reg_rtx (new_mode);
 
       start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 00000000000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define TYPE2 void*
+
+struct path {
+    TYPE2 mnt;
+    TYPE dentry;
+};
+
+struct nameidata {
+    struct path path;
+    struct path root;
+};
+
+__attribute__ ((noinline))
+TYPE foo(struct nameidata *nd)
+{
+  TYPE d;
+  TYPE2 d2;
+
+  nd->path = nd->root;
+  d = nd->path.dentry;
+  d2 = nd->path.mnt;
+  return d;
+}
+
+/* { dg-final { scan-assembler-not {\mlxv\M} } } */
+/* { dg-final { scan-assembler-not {\mstxv\M} } } */
+/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */
-- 
2.27.0.90.geebb51ba8c

Reply via email to