On Fri, 11 Aug 2023, Juzhe-Zhong wrote: > This patch adds vec_mask_len_{load_lanes,store_lanes} autovectorization > patterns. > > Here we want to support the following autovectorization: > > #include <stdint.h> > void > foo (int8_t *__restrict a, > int8_t *__restrict b, > int8_t *__restrict cond, > int n) > { > for (intptr_t i = 0; i < n; ++i) > { > if (cond[i]) > a[i] = b[i * 2] + b[i * 2 + 1]; > } > } > > ARM SVE IR: > > https://godbolt.org/z/cro1Eqc6a > > # loop_mask_60 = PHI <next_mask_82(4), max_mask_81(3)> > ... > mask__39.12_63 = vect__3.11_61 != { 0, ... }; > vec_mask_and_66 = loop_mask_60 & mask__39.12_63; > ... > vect_array.15 = .MASK_LOAD_LANES (_57, 8B, vec_mask_and_66); > ... > > For RVV, we would like to see IR: > > loop_len = SELECT_VL; > ... > mask__39.12_63 = vect__3.11_61 != { 0, ... }; > ... > vect_array.15 = .MASK_LEN_LOAD_LANES (_57, 8B, mask__39.12_63, loop_len, > bias); > ... > > Bootstrap and Regression on X86 passed. > > Ok for trunk ?
LGTM. > gcc/ChangeLog: > > * doc/md.texi: Add vec_mask_len_{load_lanes,store_lanes} patterns. > * internal-fn.cc (expand_partial_load_optab_fn): Ditto. > (expand_partial_store_optab_fn): Ditto. > * internal-fn.def (MASK_LEN_LOAD_LANES): Ditto. > (MASK_LEN_STORE_LANES): Ditto. > * optabs.def (OPTAB_CD): Ditto. > > --- > gcc/doc/md.texi | 34 ++++++++++++++++++++++++++++++++++ > gcc/internal-fn.cc | 6 ++++-- > gcc/internal-fn.def | 6 ++++++ > gcc/optabs.def | 2 ++ > 4 files changed, 46 insertions(+), 2 deletions(-) > > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi > index 9693b6bfe79..70590e68ffe 100644 > --- a/gcc/doc/md.texi > +++ b/gcc/doc/md.texi > @@ -4978,6 +4978,23 @@ for (j = 0; j < GET_MODE_NUNITS (@var{n}); j++) > > This pattern is not allowed to @code{FAIL}. > > +@cindex @code{vec_mask_len_load_lanes@var{m}@var{n}} instruction pattern > +@item @samp{vec_mask_len_load_lanes@var{m}@var{n}} > +Like @samp{vec_load_lanes@var{m}@var{n}}, but takes an additional > +mask operand (operand 2), length operand (operand 3) as well as bias operand > (operand 4) > +that specifies which elements of the destination vectors should be loaded. > +Other elements of the destination vectors are undefined. The operation is > equivalent to: > + > +@smallexample > +int c = GET_MODE_SIZE (@var{m}) / GET_MODE_SIZE (@var{n}); > +for (j = 0; j < operand3 + operand4; j++) > + if (operand2[j]) > + for (i = 0; i < c; i++) > + operand0[i][j] = operand1[j * c + i]; > +@end smallexample > + > +This pattern is not allowed to @code{FAIL}. > + > @cindex @code{vec_store_lanes@var{m}@var{n}} instruction pattern > @item @samp{vec_store_lanes@var{m}@var{n}} > Equivalent to @samp{vec_load_lanes@var{m}@var{n}}, with the memory > @@ -5011,6 +5028,23 @@ for (j = 0; j < GET_MODE_NUNITS (@var{n}); j++) > > This pattern is not allowed to @code{FAIL}. 
> > +@cindex @code{vec_mask_len_store_lanes@var{m}@var{n}} instruction pattern > +@item @samp{vec_mask_len_store_lanes@var{m}@var{n}} > +Like @samp{vec_store_lanes@var{m}@var{n}}, but takes an additional > +mask operand (operand 2), length operand (operand 3) as well as bias operand > (operand 4) > +that specifies which elements of the source vectors should be stored. > +The operation is equivalent to: > + > +@smallexample > +int c = GET_MODE_SIZE (@var{m}) / GET_MODE_SIZE (@var{n}); > +for (j = 0; j < operand3 + operand4; j++) > + if (operand2[j]) > + for (i = 0; i < c; i++) > + operand0[j * c + i] = operand1[i][j]; > +@end smallexample > + > +This pattern is not allowed to @code{FAIL}. > + > @cindex @code{gather_load@var{m}@var{n}} instruction pattern > @item @samp{gather_load@var{m}@var{n}} > Load several separate memory locations into a vector of mode @var{m}. > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc > index 7f5ede00c02..4f2b20a79e5 100644 > --- a/gcc/internal-fn.cc > +++ b/gcc/internal-fn.cc > @@ -2931,7 +2931,8 @@ expand_partial_load_optab_fn (internal_fn ifn, gcall > *stmt, convert_optab optab) > type = TREE_TYPE (lhs); > rhs = expand_call_mem_ref (type, stmt, 0); > > - if (optab == vec_mask_load_lanes_optab) > + if (optab == vec_mask_load_lanes_optab > + || optab == vec_mask_len_load_lanes_optab) > icode = get_multi_vector_move (type, optab); > else if (optab == len_load_optab) > icode = direct_optab_handler (optab, TYPE_MODE (type)); > @@ -2973,7 +2974,8 @@ expand_partial_store_optab_fn (internal_fn ifn, gcall > *stmt, convert_optab optab > type = TREE_TYPE (rhs); > lhs = expand_call_mem_ref (type, stmt, 0); > > - if (optab == vec_mask_store_lanes_optab) > + if (optab == vec_mask_store_lanes_optab > + || optab == vec_mask_len_store_lanes_optab) > icode = get_multi_vector_move (type, optab); > else if (optab == len_store_optab) > icode = direct_optab_handler (optab, TYPE_MODE (type)); > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def > 
index b3c410f4b6a..a04d2b36319 100644 > --- a/gcc/internal-fn.def > +++ b/gcc/internal-fn.def > @@ -50,6 +50,7 @@ along with GCC; see the file COPYING3. If not see > - mask_load: currently just maskload > - load_lanes: currently just vec_load_lanes > - mask_load_lanes: currently just vec_mask_load_lanes > + - mask_len_load_lanes: currently just vec_mask_len_load_lanes > - gather_load: used for {mask_,mask_len_,}gather_load > - len_load: currently just len_load > - mask_len_load: currently just mask_len_load > @@ -57,6 +58,7 @@ along with GCC; see the file COPYING3. If not see > - mask_store: currently just maskstore > - store_lanes: currently just vec_store_lanes > - mask_store_lanes: currently just vec_mask_store_lanes > + - mask_len_store_lanes: currently just vec_mask_len_store_lanes > - scatter_store: used for {mask_,mask_len_,}scatter_store > - len_store: currently just len_store > - mask_len_store: currently just mask_len_store > @@ -188,6 +190,8 @@ DEF_INTERNAL_OPTAB_FN (MASK_LOAD, ECF_PURE, maskload, > mask_load) > DEF_INTERNAL_OPTAB_FN (LOAD_LANES, ECF_CONST, vec_load_lanes, load_lanes) > DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE, > vec_mask_load_lanes, mask_load_lanes) > +DEF_INTERNAL_OPTAB_FN (MASK_LEN_LOAD_LANES, ECF_PURE, > + vec_mask_len_load_lanes, mask_load_lanes) > > DEF_INTERNAL_OPTAB_FN (GATHER_LOAD, ECF_PURE, gather_load, gather_load) > DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE, > @@ -208,6 +212,8 @@ DEF_INTERNAL_OPTAB_FN (MASK_STORE, 0, maskstore, > mask_store) > DEF_INTERNAL_OPTAB_FN (STORE_LANES, ECF_CONST, vec_store_lanes, store_lanes) > DEF_INTERNAL_OPTAB_FN (MASK_STORE_LANES, 0, > vec_mask_store_lanes, mask_store_lanes) > +DEF_INTERNAL_OPTAB_FN (MASK_LEN_STORE_LANES, 0, > + vec_mask_len_store_lanes, mask_store_lanes) > > DEF_INTERNAL_OPTAB_FN (VCOND, ECF_CONST | ECF_NOTHROW, vcond, vec_cond) > DEF_INTERNAL_OPTAB_FN (VCONDU, ECF_CONST | ECF_NOTHROW, vcondu, vec_cond) > diff --git a/gcc/optabs.def b/gcc/optabs.def > index 
1ea1947b3b5..d4d7d6c53d4 100644 > --- a/gcc/optabs.def > +++ b/gcc/optabs.def > @@ -82,6 +82,8 @@ OPTAB_CD(vec_load_lanes_optab, "vec_load_lanes$a$b") > OPTAB_CD(vec_store_lanes_optab, "vec_store_lanes$a$b") > OPTAB_CD(vec_mask_load_lanes_optab, "vec_mask_load_lanes$a$b") > OPTAB_CD(vec_mask_store_lanes_optab, "vec_mask_store_lanes$a$b") > +OPTAB_CD(vec_mask_len_load_lanes_optab, "vec_mask_len_load_lanes$a$b") > +OPTAB_CD(vec_mask_len_store_lanes_optab, "vec_mask_len_store_lanes$a$b") > OPTAB_CD(vcond_optab, "vcond$a$b") > OPTAB_CD(vcondu_optab, "vcondu$a$b") > OPTAB_CD(vcondeq_optab, "vcondeq$a$b") > -- Richard Biener <rguent...@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)