On Tue, Jun 22, 2021 at 10:43 AM Hongtao Liu <crazy...@gmail.com> wrote:
>
> On Mon, Jun 21, 2021 at 6:05 PM Richard Biener
> <richard.guent...@gmail.com> wrote:
> >
> > On Thu, Jun 17, 2021 at 8:29 AM liuhongt <hongtao....@intel.com> wrote:
> > >
> > > > The patch removes those pro- and demotions when the backend supports
> > > > the direct optab.
> > >
> > > > For i386, it enables vectorization for vpopcntb/vpopcntw and improves
> > > > code generation for vpopcntq.
> > >
> > > gcc/ChangeLog:
> > >
> > >         PR tree-optimization/97770
> > >         * tree-vect-patterns.c (vect_recog_popcount_pattern):
> > >         New.
> > >         (vect_recog_func vect_vect_recog_func_ptrs): Add new pattern.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         PR tree-optimization/97770
> > >         * gcc.target/i386/avx512bitalg-pr97770-1.c: Remove xfail.
> > >         * gcc.target/i386/avx512vpopcntdq-pr97770-1.c: Remove xfail.
> > > ---
> > >  .../gcc.target/i386/avx512bitalg-pr97770-1.c  |  27 +++--
> > >  .../i386/avx512vpopcntdq-pr97770-1.c          |   9 +-
> > >  gcc/tree-vect-patterns.c                      | 110 ++++++++++++++++++
> > >  3 files changed, 127 insertions(+), 19 deletions(-)
> > >
> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c 
> > > b/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c
> > > index c83a477045c..d1beec4cdb4 100644
> > > --- a/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c
> > > +++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c
> > > @@ -1,19 +1,18 @@
> > >  /* PR target/97770 */
> > >  /* { dg-do compile } */
> > > -/* { dg-options "-O2 -mavx512bitalg -mavx512vl 
> > > -mprefer-vector-width=512" } */
> > > -/* Add xfail since no IFN for QI/HImode popcount */
> > > -/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*xmm" 
> > > 1 {xfail *-*-*} } } */
> > > -/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*xmm" 
> > > 1 {xfail *-*-*} } } */
> > > -/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*ymm" 
> > > 1 {xfail *-*-*} } } */
> > > -/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*ymm" 
> > > 1 {xfail *-*-*} } } */
> > > -/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*zmm" 
> > > 1 {xfail *-*-*} } } */
> > > -/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*zmm" 
> > > 1 {xfail *-*-*} } } */
> > > +/* { dg-options "-O2 -march=icelake-server -mprefer-vector-width=512" } 
> > > */
> > > +/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*xmm" 
> > > 1  } } */
> > > +/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*xmm" 
> > > 1  } } */
> > > +/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*ymm" 
> > > 1  } } */
> > > +/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*ymm" 
> > > 1  } } */
> > > +/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*zmm" 
> > > 1  } } */
> > > +/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*zmm" 
> > > 1  } } */
> > >
> > >  #include <immintrin.h>
> > >
> > >  void
> > >  __attribute__ ((noipa, optimize("-O3")))
> > > -popcountb_128 (char * __restrict dest, char* src)
> > > +popcountb_128 (unsigned char * __restrict dest, unsigned char* src)
> > >  {
> > >    for (int i = 0; i != 16; i++)
> > >      dest[i] = __builtin_popcount (src[i]);
> > > @@ -21,7 +20,7 @@ popcountb_128 (char * __restrict dest, char* src)
> > >
> > >  void
> > >  __attribute__ ((noipa, optimize("-O3")))
> > > -popcountw_128 (short* __restrict dest, short* src)
> > > +popcountw_128 (unsigned short* __restrict dest, unsigned short* src)
> > >  {
> > >    for (int i = 0; i != 8; i++)
> > >      dest[i] = __builtin_popcount (src[i]);
> > > @@ -29,7 +28,7 @@ popcountw_128 (short* __restrict dest, short* src)
> > >
> > >  void
> > >  __attribute__ ((noipa, optimize("-O3")))
> > > -popcountb_256 (char * __restrict dest, char* src)
> > > +popcountb_256 (unsigned char * __restrict dest, unsigned char* src)
> > >  {
> > >    for (int i = 0; i != 32; i++)
> > >      dest[i] = __builtin_popcount (src[i]);
> > > @@ -37,7 +36,7 @@ popcountb_256 (char * __restrict dest, char* src)
> > >
> > >  void
> > >  __attribute__ ((noipa, optimize("-O3")))
> > > -popcountw_256 (short* __restrict dest, short* src)
> > > +popcountw_256 (unsigned short* __restrict dest, unsigned short* src)
> > >  {
> > >    for (int i = 0; i != 16; i++)
> > >      dest[i] = __builtin_popcount (src[i]);
> > > @@ -45,7 +44,7 @@ popcountw_256 (short* __restrict dest, short* src)
> > >
> > >  void
> > >  __attribute__ ((noipa, optimize("-O3")))
> > > -popcountb_512 (char * __restrict dest, char* src)
> > > +popcountb_512 (unsigned char * __restrict dest, unsigned char* src)
> > >  {
> > >    for (int i = 0; i != 64; i++)
> > >      dest[i] = __builtin_popcount (src[i]);
> > > @@ -53,7 +52,7 @@ popcountb_512 (char * __restrict dest, char* src)
> > >
> > >  void
> > >  __attribute__ ((noipa, optimize("-O3")))
> > > -popcountw_512 (short* __restrict dest, short* src)
> > > +popcountw_512 (unsigned short* __restrict dest, unsigned short* src)
> > >  {
> > >    for (int i = 0; i != 32; i++)
> > >      dest[i] = __builtin_popcount (src[i]);
> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c 
> > > b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c
> > > index 63bb00d9b4a..dedd2e4c3d6 100644
> > > --- a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c
> > > +++ b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c
> > > @@ -1,13 +1,12 @@
> > >  /* PR target/97770 */
> > >  /* { dg-do compile } */
> > > -/* { dg-options "-O2 -mavx512vpopcntdq -mavx512vl 
> > > -mprefer-vector-width=512" } */
> > > +/* { dg-options "-O2 -march=icelake-server -mprefer-vector-width=512" } 
> > > */
> > >  /* { dg-final { scan-assembler-times "vpopcntd\[ \\t\]+\[^\\n\\r\]*xmm" 
> > > 1 } } */
> > >  /* { dg-final { scan-assembler-times "vpopcntd\[ \\t\]+\[^\\n\\r\]*ymm" 
> > > 1 } } */
> > >  /* { dg-final { scan-assembler-times "vpopcntd\[ \\t\]+\[^\\n\\r\]*zmm" 
> > > 1 } } */
> > > -/* Add xfail since current vectorizor cannot generate expected code for 
> > > DImode popcount */
> > > -/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*xmm" 
> > > 1 { xfail *-*-* } } } */
> > > -/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*ymm" 
> > > 1 { xfail *-*-* } } } */
> > > -/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*zmm" 
> > > 1 { xfail *-*-* } } } */
> > > +/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*xmm" 
> > > 1  } } */
> > > +/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*ymm" 
> > > 1  } } */
> > > +/* { dg-final { scan-assembler-times "vpopcntq\[ \\t\]+\[^\\n\\r\]*zmm" 
> > > 1  } } */
> > >  #ifndef AVX512VPOPCNTQ_H_INCLUDED
> > >  #define AVX512VPOPCNTQ_H_INCLUDED
> > >
> > > diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
> > > index 177d44ebb5e..5c80800efbb 100644
> > > --- a/gcc/tree-vect-patterns.c
> > > +++ b/gcc/tree-vect-patterns.c
> > > @@ -1292,6 +1292,115 @@ vect_recog_widen_minus_pattern (vec_info *vinfo, 
> > > stmt_vec_info last_stmt_info,
> > >                                       "vect_recog_widen_minus_pattern");
> > >  }
> > >
> > > +/* Function vect_recog_popcount_pattern
> > > +
> > > +   Try to find the following pattern:
> > > +
> > > +   UTYPE1 A;
> > > +   TYPE1 B;
> > > +   UTYPE2 temp_in;
> > > +   TYPE3 temp_out;
> > > +   temp_in = (TYPE2)A;
> > > +
> > > +   temp_out = __builtin_popcount{,l,ll} (temp_in);
> > > +   B = (TYPE1) temp_out;
> > > +
> > > > +   TYPE2 may or may not be equal to TYPE3.
> > > > +   e.g. TYPE2 is equal to TYPE3 for __builtin_popcount
> > > > +   e.g. TYPE2 is not equal to TYPE3 for __builtin_popcountll
> > > +
> > > +   Input:
> > > +
> > > +   * STMT_VINFO: The stmt from which the pattern search begins.
> > > +   here it starts with B = (TYPE1) temp_out;
> > > +
> > > +   Output:
> > > +
> > > +   * TYPE_OUT: The vector type of the output of this pattern.
> > > +
> > > +   * Return value: A new stmt that will be used to replace the sequence 
> > > of
> > > +   stmts that constitute the pattern. In this case it will be:
> > > +   B = .POPCOUNT (A);
> > > +*/
> > > +
> > > +static gimple *
> > > +vect_recog_popcount_pattern (vec_info *vinfo,
> > > +                            stmt_vec_info stmt_vinfo, tree *type_out)
> > > +{
> > > +  gassign *last_stmt = dyn_cast <gassign *> (stmt_vinfo->stmt);
> > > +  gimple *popcount_stmt, *pattern_stmt;
> > > +  tree rhs_oprnd, rhs_origin, lhs_oprnd, lhs_type, vec_type, new_var;
> > > +  auto_vec<tree> vargs;
> > > +
> > > +  /* Find B = (TYPE1) temp_out. */
> > > +  if (!last_stmt)
> > > +    return NULL;
> > > +  tree_code code = gimple_assign_rhs_code (last_stmt);
> > > +  if (!CONVERT_EXPR_CODE_P (code))
> > > +    return NULL;
> > > +
> > > +  lhs_oprnd = gimple_assign_lhs (last_stmt);
> > > +  lhs_type = TREE_TYPE (lhs_oprnd);
> > > +  if (TREE_CODE (lhs_type) != INTEGER_TYPE)
> > > +    return NULL;
> >
> > INTEGRAL_TYPE_P
> >
> Changed.
> > > +  rhs_oprnd = gimple_assign_rhs1 (last_stmt);
> > > +  if (TREE_CODE (rhs_oprnd) != SSA_NAME
> > > +      || !has_single_use (rhs_oprnd))
> > > +    return NULL;
> > > +  popcount_stmt = SSA_NAME_DEF_STMT (rhs_oprnd);
> > > +
> > > +  /* Find temp_out = __builtin_popcount{,l,ll} (temp_in);  */
> > > +  if (!is_gimple_call (popcount_stmt)
> > > +      || !gimple_call_lhs (popcount_stmt))
> >
> > Since you're arriving here via use-def chain the LHS will
> > never be NULL.
> >
Forgot to mention this part is also changed.
> > > +    return NULL;
> > > +  switch (gimple_call_combined_fn (popcount_stmt))
> > > +    {
> > > +    CASE_CFN_POPCOUNT:
> > > +      break;
> > > +    default:
> > > +      return NULL;
> > > +    }
> > > +
> >
> > for safety:
> >
> >     if (gimple_call_num_args (popcount_stmt) != 1)
> >       return NULL;
> >
> Changed.
> > > +  rhs_oprnd = gimple_call_arg (popcount_stmt, 0);
> > > +  vect_unpromoted_value unprom_diff;
> > > +  rhs_origin = vect_look_through_possible_promotion (vinfo, rhs_oprnd,
> > > +                                                   &unprom_diff);
> > > +
> > > +  if (!rhs_origin)
> > > +    return NULL;
> > > +
> > > > +  /* Input and output of .POPCOUNT should be same-precision integer.
> > > > +     Also A should be unsigned or same precision as temp_in,
> > > +     otherwise there would be sign_extend from A to temp_in.  */
> > > +  if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type)
> > > +      || !(TYPE_UNSIGNED (unprom_diff.type)
> > > +          || (TYPE_PRECISION (unprom_diff.type)
> > > +              == TYPE_PRECISION (TREE_TYPE (rhs_oprnd)))))
> >
> > Note I find a if (A || !(B || C)) hard to read, please write if (A ||
> > (!B && !C)) instead.
> >
> Changed.
> > OK otherwise.
> >
> > Thanks,
> > Richard.
> >
> > > +    return NULL;
> > > +  vargs.safe_push (unprom_diff.op);
> > > +
> > > +  vect_pattern_detected ("vec_regcog_popcount_pattern", popcount_stmt);
> > > +  vec_type = get_vectype_for_scalar_type (vinfo, lhs_type);
> > > > +  /* Do it only if the backend supports popcount<vector_mode>2.  */
> > > +  if (!direct_internal_fn_supported_p (IFN_POPCOUNT,
> > > +                                      vec_type,
> > > +                                      OPTIMIZE_FOR_SPEED))
> > > +    return NULL;
> > > +
> > > +  /* Create B = .POPCOUNT (A).  */
> > > +  new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
> > > +  pattern_stmt = gimple_build_call_internal_vec (IFN_POPCOUNT, vargs);
> > > +  gimple_call_set_lhs (pattern_stmt, new_var);
> > > +  gimple_set_location (pattern_stmt, gimple_location (last_stmt));
> > > +  *type_out = vec_type;
> > > +
> > > +  if (dump_enabled_p ())
> > > +    dump_printf_loc (MSG_NOTE, vect_location,
> > > +                    "created pattern stmt: %G", pattern_stmt);
> > > +  return pattern_stmt;
> > > +}
> > > +
> > >  /* Function vect_recog_pow_pattern
> > >
> > >     Try to find the following pattern:
> > > @@ -5283,6 +5392,7 @@ static vect_recog_func vect_vect_recog_func_ptrs[] 
> > > = {
> > >    { vect_recog_sad_pattern, "sad" },
> > >    { vect_recog_widen_sum_pattern, "widen_sum" },
> > >    { vect_recog_pow_pattern, "pow" },
> > > +  { vect_recog_popcount_pattern, "popcount" },
> > >    { vect_recog_widen_shift_pattern, "widen_shift" },
> > >    { vect_recog_rotate_pattern, "rotate" },
> > >    { vect_recog_vector_vector_shift_pattern, "vector_vector_shift" },
> > > --
> > > 2.18.1
> > >
>
> Thanks for the review, here is the patch I'm checking in.
>
> --
> BR,
> Hongtao



-- 
BR,
Hongtao

Reply via email to