On Wed, Nov 2, 2022 at 8:59 AM Kewen.Lin <li...@linux.ibm.com> wrote: > > Hi, > > As the test case in PR107412 shows, we can fold IFN .LEN_{LOAD, > STORE} into a normal vector load/store if the given length is known > to be equal to the length of the whole vector. This helps to > reduce overall cycle counts, as normally the latency of a vector access > with length in bytes is higher than that of a normal vector access, and it > also saves the setup of the length when a constant length cannot > be encoded into the instruction (such as on Power). > > Bootstrapped and regtested on x86_64-redhat-linux, > aarch64-linux-gnu and powerpc64{,le}-linux-gnu. > > Is it ok for trunk?
OK. > > BR, > Kewen > ----- > PR tree-optimization/107412 > > gcc/ChangeLog: > > * gimple-fold.cc (gimple_fold_mask_load_store_mem_ref): Rename to ... > (gimple_fold_partial_load_store_mem_ref): ... this, add one parameter > mask_p indicating it's for mask or length, and add some handlings for > IFN LEN_{LOAD,STORE}. > (gimple_fold_mask_load): Rename to ... > (gimple_fold_partial_load): ... this, add one parameter mask_p. > (gimple_fold_mask_store): Rename to ... > (gimple_fold_partial_store): ... this, add one parameter mask_p. > (gimple_fold_call): Add the handlings for IFN LEN_{LOAD,STORE}, > and adjust calls on gimple_fold_mask_load_store_mem_ref to > gimple_fold_partial_load_store_mem_ref. > > gcc/testsuite/ChangeLog: > > * gcc.target/powerpc/pr107412.c: New test. > * gcc.target/powerpc/p9-vec-length-epil-8.c: Adjust scan times for > folded LEN_LOAD. > --- > gcc/gimple-fold.cc | 57 ++++++++++++++----- > .../gcc.target/powerpc/p9-vec-length-epil-8.c | 2 +- > gcc/testsuite/gcc.target/powerpc/pr107412.c | 19 +++++++ > 3 files changed, 64 insertions(+), 14 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/powerpc/pr107412.c > > diff --git a/gcc/gimple-fold.cc b/gcc/gimple-fold.cc > index a1704784bc9..e3a087defa6 100644 > --- a/gcc/gimple-fold.cc > +++ b/gcc/gimple-fold.cc > @@ -5370,19 +5370,39 @@ arith_overflowed_p (enum tree_code code, const_tree > type, > return wi::min_precision (wres, sign) > TYPE_PRECISION (type); > } > > -/* If IFN_MASK_LOAD/STORE call CALL is unconditional, return a MEM_REF > +/* If IFN_{MASK,LEN}_LOAD/STORE call CALL is unconditional, return a MEM_REF > for the memory it references, otherwise return null. VECTYPE is the > - type of the memory vector. */ > + type of the memory vector. MASK_P indicates it's for MASK if true, > + otherwise it's for LEN. 
*/ > > static tree > -gimple_fold_mask_load_store_mem_ref (gcall *call, tree vectype) > +gimple_fold_partial_load_store_mem_ref (gcall *call, tree vectype, bool > mask_p) > { > tree ptr = gimple_call_arg (call, 0); > tree alias_align = gimple_call_arg (call, 1); > - tree mask = gimple_call_arg (call, 2); > - if (!tree_fits_uhwi_p (alias_align) || !integer_all_onesp (mask)) > + if (!tree_fits_uhwi_p (alias_align)) > return NULL_TREE; > > + if (mask_p) > + { > + tree mask = gimple_call_arg (call, 2); > + if (!integer_all_onesp (mask)) > + return NULL_TREE; > + } else { > + tree basic_len = gimple_call_arg (call, 2); > + if (!tree_fits_uhwi_p (basic_len)) > + return NULL_TREE; > + unsigned int nargs = gimple_call_num_args (call); > + tree bias = gimple_call_arg (call, nargs - 1); > + gcc_assert (tree_fits_uhwi_p (bias)); > + tree biased_len = int_const_binop (MINUS_EXPR, basic_len, bias); > + unsigned int len = tree_to_uhwi (biased_len); > + unsigned int vect_len > + = GET_MODE_SIZE (TYPE_MODE (vectype)).to_constant (); > + if (vect_len != len) > + return NULL_TREE; > + } > + > unsigned HOST_WIDE_INT align = tree_to_uhwi (alias_align); > if (TYPE_ALIGN (vectype) != align) > vectype = build_aligned_type (vectype, align); > @@ -5390,16 +5410,18 @@ gimple_fold_mask_load_store_mem_ref (gcall *call, > tree vectype) > return fold_build2 (MEM_REF, vectype, ptr, offset); > } > > -/* Try to fold IFN_MASK_LOAD call CALL. Return true on success. */ > +/* Try to fold IFN_{MASK,LEN}_LOAD call CALL. Return true on success. > + MASK_P indicates it's for MASK if true, otherwise it's for LEN. 
*/ > > static bool > -gimple_fold_mask_load (gimple_stmt_iterator *gsi, gcall *call) > +gimple_fold_partial_load (gimple_stmt_iterator *gsi, gcall *call, bool > mask_p) > { > tree lhs = gimple_call_lhs (call); > if (!lhs) > return false; > > - if (tree rhs = gimple_fold_mask_load_store_mem_ref (call, TREE_TYPE (lhs))) > + if (tree rhs > + = gimple_fold_partial_load_store_mem_ref (call, TREE_TYPE (lhs), > mask_p)) > { > gassign *new_stmt = gimple_build_assign (lhs, rhs); > gimple_set_location (new_stmt, gimple_location (call)); > @@ -5410,13 +5432,16 @@ gimple_fold_mask_load (gimple_stmt_iterator *gsi, > gcall *call) > return false; > } > > -/* Try to fold IFN_MASK_STORE call CALL. Return true on success. */ > +/* Try to fold IFN_{MASK,LEN}_STORE call CALL. Return true on success. > + MASK_P indicates it's for MASK if true, otherwise it's for LEN. */ > > static bool > -gimple_fold_mask_store (gimple_stmt_iterator *gsi, gcall *call) > +gimple_fold_partial_store (gimple_stmt_iterator *gsi, gcall *call, > + bool mask_p) > { > tree rhs = gimple_call_arg (call, 3); > - if (tree lhs = gimple_fold_mask_load_store_mem_ref (call, TREE_TYPE (rhs))) > + if (tree lhs > + = gimple_fold_partial_load_store_mem_ref (call, TREE_TYPE (rhs), > mask_p)) > { > gassign *new_stmt = gimple_build_assign (lhs, rhs); > gimple_set_location (new_stmt, gimple_location (call)); > @@ -5634,10 +5659,16 @@ gimple_fold_call (gimple_stmt_iterator *gsi, bool > inplace) > cplx_result = true; > break; > case IFN_MASK_LOAD: > - changed |= gimple_fold_mask_load (gsi, stmt); > + changed |= gimple_fold_partial_load (gsi, stmt, true); > break; > case IFN_MASK_STORE: > - changed |= gimple_fold_mask_store (gsi, stmt); > + changed |= gimple_fold_partial_store (gsi, stmt, true); > + break; > + case IFN_LEN_LOAD: > + changed |= gimple_fold_partial_load (gsi, stmt, false); > + break; > + case IFN_LEN_STORE: > + changed |= gimple_fold_partial_store (gsi, stmt, false); > break; > default: > break; > diff --git 
a/gcc/testsuite/gcc.target/powerpc/p9-vec-length-epil-8.c > b/gcc/testsuite/gcc.target/powerpc/p9-vec-length-epil-8.c > index 961df0d5646..8b9c9107814 100644 > --- a/gcc/testsuite/gcc.target/powerpc/p9-vec-length-epil-8.c > +++ b/gcc/testsuite/gcc.target/powerpc/p9-vec-length-epil-8.c > @@ -8,5 +8,5 @@ > > #include "p9-vec-length-8.h" > > -/* { dg-final { scan-assembler-times {\mlxvl\M} 21 } } */ > +/* { dg-final { scan-assembler-times {\mlxvl\M} 16 } } */ > /* { dg-final { scan-assembler-times {\mstxvl\M} 7 } } */ > diff --git a/gcc/testsuite/gcc.target/powerpc/pr107412.c > b/gcc/testsuite/gcc.target/powerpc/pr107412.c > new file mode 100644 > index 00000000000..4526ea8639d > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/pr107412.c > @@ -0,0 +1,19 @@ > +/* { dg-require-effective-target powerpc_p9vector_ok } */ > +/* { dg-require-effective-target lp64 } */ > +/* { dg-options "-mdejagnu-cpu=power9 -O2 -ftree-vectorize > -fno-vect-cost-model -funroll-loops -fno-tree-loop-distribute-patterns > --param vect-partial-vector-usage=2 -fdump-tree-optimized" } */ > + > +/* Verify there is only one IFN call each for LEN_LOAD and LEN_STORE. */ > + > +#define N 16 > +int src[N]; > +int dest[N]; > + > +void > +foo () > +{ > + for (int i = 0; i < (N - 1); i++) > + dest[i] = src[i]; > +} > + > +/* { dg-final { scan-tree-dump-times {\mLEN_LOAD\M} 1 "optimized" } } */ > +/* { dg-final { scan-tree-dump-times {\mLEN_STORE\M} 1 "optimized" } } */ > -- > 2.27.0