Hi Jeff & Richard,

> If you can turn that example into a test, even if it's just in the
> aarch64 directory, that would be helpful
The second patch (2/2) has various tests for this, as the cost model had to
be made more accurate for it to work.

> As mentioned in the 2/2 thread, I think we should use subregs for
> the case where they're canonical.  It'd probably be worth adding a
> simplify-rtx.c helper to extract one element from a vector, e.g.:
>
>   rtx simplify_gen_vec_select (rtx op, unsigned int index);
>
> so that this is easier to do.
>
> Does making the loop above per-element mean that, for 128-bit Advanced
> SIMD, the optimisation “only” kicks in for 64-bit element sizes?
> Perhaps for other element sizes we could do “top” and “bottom” halves.
> (There's obviously no need to do that as part of this work, was just
> wondering.)

It should handle extraction of any element size, so it's able to use a
value in any arbitrary location.  CSE already handles low/high re-use
optimally.  So e.g.

#include <arm_neon.h>

extern int16x8_t bar (int16x8_t, int16x8_t);

int16x8_t foo ()
{
    int16_t s[4] = {1,2,3,4};
    int16_t d[8] = {1,2,3,4,5,6,7,8};
    int16x4_t r1 = vld1_s16 (s);
    int16x8_t r2 = vcombine_s16 (r1, r1);
    int16x8_t r3 = vld1q_s16 (d);
    return bar (r2, r3);
}

but our cost model is currently blocking it because we never costed
vec_consts.  Without the 2/2 patch we generate:

foo:
	stp	x29, x30, [sp, -48]!
	adrp	x0, .LC0
	mov	x29, sp
	ldr	q1, [x0, #:lo12:.LC0]
	adrp	x0, .LC1
	ldr	q0, [x0, #:lo12:.LC1]
	adrp	x0, .LC2
	str	q1, [sp, 32]
	ldr	d2, [x0, #:lo12:.LC2]
	str	d2, [sp, 24]
	bl	bar
	ldp	x29, x30, [sp], 48
	ret
.LC0:
	.hword	1
	.hword	2
	.hword	3
	.hword	4
	.hword	5
	.hword	6
	.hword	7
	.hword	8
.LC1:
	.hword	1
	.hword	2
	.hword	3
	.hword	4
	.hword	1
	.hword	2
	.hword	3
	.hword	4

but with the 2/2 patch:

foo:
	stp	x29, x30, [sp, -48]!
	adrp	x0, .LC0
	mov	x29, sp
	ldr	d2, [x0, #:lo12:.LC0]
	adrp	x0, .LC1
	ldr	q1, [x0, #:lo12:.LC1]
	str	d2, [sp, 24]
	dup	d0, v2.d[0]
	str	q1, [sp, 32]
	ins	v0.d[1], v2.d[0]
	bl	bar
	ldp	x29, x30, [sp], 48
	ret
.LC1:
	.hword	1
	.hword	2
	.hword	3
	.hword	4
	.hword	5
	.hword	6
	.hword	7
	.hword	8

It's not entirely optimal of course, but it's a step forward.  I think when
we fix the vld intrinsics this should then become optimal, as currently the
MEMs are causing it to not re-use those values.
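To make the per-element registration concrete, here is a hand-written
sketch (not a dump from the patch; little-endian element numbering and the
register number are assumed): for an insn like

  (set (reg:V4HI 92) (const_vector:V4HI [1 2 3 4]))

find_sets_in_insn now records, next to the main set, one "template" set
per element:

  (set (subreg:HI (reg:V4HI 92) 0) (const_int 1))
  (set (vec_select:HI (reg:V4HI 92) (parallel [(const_int 1)]))
       (const_int 2))
  ...

i.e. element 0 is described by a lowpart subreg and every other element by
a vec_select, which is what simplify_gen_vec_select returns for index 0
and nonzero indices respectively.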
> >  	    else
> >  	      sets[n_sets++].rtl = x;
> >  	}
> > @@ -4513,7 +4533,14 @@ cse_insn (rtx_insn *insn)
> >
> >    struct set *sets = (struct set *) 0;
> >
> >    if (GET_CODE (x) == SET)
> > -    sets = XALLOCA (struct set);
> > +    {
> > +      /* For CONST_VECTOR we wants to be able to CSE the vector itself along with
> > +	 elements inside the vector if the target says it's cheap.  */
> > +      if (GET_CODE (SET_SRC (x)) == CONST_VECTOR)
> > +	sets = XALLOCAVEC (struct set, const_vector_encoded_nelts (SET_SRC (x)) + 1);
> > +      else
> > +	sets = XALLOCA (struct set);
> > +    }
> >    else if (GET_CODE (x) == PARALLEL)
> >      sets = XALLOCAVEC (struct set, XVECLEN (x, 0));
>
> I think this would be easier if “sets” was first converted to an
> auto_vec, say auto_vec<struct set, 8>.  We then wouldn't need to
> predict in advance how many elements are needed.

Done.

> > @@ -4997,6 +5024,26 @@ cse_insn (rtx_insn *insn)
> >        src_related_is_const_anchor = src_related != NULL_RTX;
> >      }
> >
> > +  /* Try to re-materialize a vec_dup with an existing constant.  */
> > +  if (GET_CODE (src) == CONST_VECTOR
> > +      && const_vector_encoded_nelts (src) == 1)
> > +    {
> > +      rtx const_rtx = CONST_VECTOR_ELT (src, 0);
>
> Would be simpler as:
>
>   rtx src_elt;
>   if (const_vec_duplicate_p (src, &src_elt))
>
> I think we should also check !src_eqv_here, or perhaps:
>
>   (!src_eqv_here || CONSTANT_P (src_eqv_here))
>
> so that we don't override any existing reg notes, which could have more
> chance of succeeding.

Done.

> > +      machine_mode const_mode = GET_MODE_INNER (GET_MODE (src));
> > +      struct table_elt *related_elt
> > +	= lookup (const_rtx, HASH (const_rtx, const_mode), const_mode);
> > +      if (related_elt)
> > +	{
> > +	  for (related_elt = related_elt->first_same_value;
> > +	       related_elt; related_elt = related_elt->next_same_value)
> > +	    if (REG_P (related_elt->exp))
> > +	      {
> > +		src_eqv_here
> > +		  = gen_rtx_VEC_DUPLICATE (GET_MODE (src),
> > +					   related_elt->exp);
> > +	      }
>
> Other similar loops seem to break after the first match, instead of
> picking the last match.

Done.
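For the record, a hand-written example of what the re-materialization
achieves (again not a dump from the patch; register numbers are made up):
if an earlier insn already put the scalar constant in a register, say

  (set (reg:SI 99) (const_int 4))

then for a later

  (set (reg:V4SI 100) (const_vector:V4SI [4 4 4 4]))

the lookup finds reg 99 in the equivalence class of (const_int 4) and
records

  (vec_duplicate:V4SI (reg:SI 99))

as src_eqv_here, giving the target the chance to emit a cheap dup of the
existing register instead of materializing the constant from scratch.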
Bootstrapped and regtested on aarch64-none-linux-gnu and x86_64-pc-linux-gnu
with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* cse.c (add_to_set): New.
	(find_sets_in_insn): Register constants in sets.
	(canonicalize_insn): Use auto_vec instead.
	(cse_insn): Try materializing using vec_dup.
	* rtl.h (simplify_context::simplify_gen_vec_select,
	simplify_gen_vec_select): New.
	* simplify-rtx.c (simplify_context::simplify_gen_vec_select): New.

> Thanks,
> Richard
>
> > +	  }
> > +      }
> >
> >    if (src == src_folded)
> >      src_folded = 0;

--

diff --git a/gcc/cse.c b/gcc/cse.c
index 330c1e90ce05b8f95b58f24576ec93e10ec55d89..216a6b9151c178da2a0c3e092f4c7c66ac4b6d2c 100644
--- a/gcc/cse.c
+++ b/gcc/cse.c
@@ -44,6 +44,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "regs.h"
 #include "function-abi.h"
 #include "rtlanal.h"
+#include "expr.h"

 /* The basic idea of common subexpression elimination is to go
    through the code, keeping a record of expressions that would
@@ -4248,13 +4249,23 @@ try_back_substitute_reg (rtx set, rtx_insn *insn)
     }
 }

+
+/* Add an entry containing RTL X into SETS.  */
+static inline void
+add_to_set (vec<struct set> *sets, rtx x)
+{
+  struct set entry = {};
+  entry.rtl = x;
+  sets->safe_push (entry);
+}
+
 /* Record all the SETs in this instruction into SETS_PTR,
    and return the number of recorded sets.  */
 static int
-find_sets_in_insn (rtx_insn *insn, struct set **psets)
+find_sets_in_insn (rtx_insn *insn, vec<struct set> *psets)
 {
-  struct set *sets = *psets;
-  int n_sets = 0;
+  vec<struct set> sets = *psets;
+
   rtx x = PATTERN (insn);

   if (GET_CODE (x) == SET)
@@ -4274,8 +4285,24 @@ find_sets_in_insn (rtx_insn *insn, struct set **psets)
 	 someplace else, so it isn't worth cse'ing.  */
       else if (GET_CODE (SET_SRC (x)) == CALL)
 	;
+      else if (GET_CODE (SET_SRC (x)) == CONST_VECTOR
+	       && GET_MODE_CLASS (GET_MODE (SET_SRC (x))) != MODE_VECTOR_BOOL)
+	{
+	  /* First register the vector itself.  */
+	  add_to_set (psets, x);
+	  rtx src = SET_SRC (x);
+	  /* Go over the constants of the CONST_VECTOR in forward order, to
+	     put them in the same order in the SETS array.  */
+	  for (unsigned i = 0; i < const_vector_encoded_nelts (src) ; i++)
+	    {
+	      /* These are templates and don't actually get emitted but are
+		 used to tell CSE how to get to a particular constant.  */
+	      rtx y = simplify_gen_vec_select (SET_DEST (x), i);
+	      add_to_set (psets, gen_rtx_SET (y, CONST_VECTOR_ELT (src, i)));
+	    }
+	}
       else
-	sets[n_sets++].rtl = x;
+	add_to_set (psets, x);
     }
   else if (GET_CODE (x) == PARALLEL)
     {
@@ -4296,12 +4323,12 @@ find_sets_in_insn (rtx_insn *insn, struct set **psets)
 	    else if (GET_CODE (SET_SRC (y)) == CALL)
 	      ;
 	    else
-	      sets[n_sets++].rtl = y;
+	      add_to_set (psets, y);
 	  }
 	}
     }

-  return n_sets;
+  return sets.length ();
 }

 /* Subroutine of canonicalize_insn.  X is an ASM_OPERANDS in INSN.  */
@@ -4349,9 +4376,10 @@ canon_asm_operands (rtx x, rtx_insn *insn)
    see canon_reg.  */

 static void
-canonicalize_insn (rtx_insn *insn, struct set **psets, int n_sets)
+canonicalize_insn (rtx_insn *insn, vec<struct set> *psets)
 {
-  struct set *sets = *psets;
+  vec<struct set> sets = *psets;
+  int n_sets = sets.length ();
   rtx tem;
   rtx x = PATTERN (insn);
   int i;
@@ -4510,13 +4538,6 @@ cse_insn (rtx_insn *insn)
   int src_eqv_in_memory = 0;
   unsigned src_eqv_hash = 0;

-  struct set *sets = (struct set *) 0;
-
-  if (GET_CODE (x) == SET)
-    sets = XALLOCA (struct set);
-  else if (GET_CODE (x) == PARALLEL)
-    sets = XALLOCAVEC (struct set, XVECLEN (x, 0));
-
   this_insn = insn;

   /* Find all regs explicitly clobbered in this insn,
@@ -4525,10 +4546,11 @@ cse_insn (rtx_insn *insn)
   invalidate_from_sets_and_clobbers (insn);

   /* Record all the SETs in this instruction.  */
-  n_sets = find_sets_in_insn (insn, &sets);
+  auto_vec<struct set, 8> sets;
+  n_sets = find_sets_in_insn (insn, (vec<struct set>*)&sets);

   /* Substitute the canonical register where possible.  */
-  canonicalize_insn (insn, &sets, n_sets);
+  canonicalize_insn (insn, (vec<struct set>*)&sets);

   /* If this insn has a REG_EQUAL note, store the equivalent value in SRC_EQV,
      if different, or if the DEST is a STRICT_LOW_PART/ZERO_EXTRACT.  The
@@ -4997,6 +5019,27 @@ cse_insn (rtx_insn *insn)
 	  src_related_is_const_anchor = src_related != NULL_RTX;
 	}

+      /* Try to re-materialize a vec_dup with an existing constant.  */
+      rtx src_elt;
+      if ((!src_eqv_here || CONSTANT_P (src_eqv_here))
+	  && const_vec_duplicate_p (src, &src_elt))
+	{
+	  machine_mode const_mode = GET_MODE_INNER (GET_MODE (src));
+	  struct table_elt *related_elt
+	    = lookup (src_elt, HASH (src_elt, const_mode), const_mode);
+	  if (related_elt)
+	    {
+	      for (related_elt = related_elt->first_same_value;
+		   related_elt; related_elt = related_elt->next_same_value)
+		if (REG_P (related_elt->exp))
+		  {
+		    src_eqv_here
+		      = gen_rtx_VEC_DUPLICATE (GET_MODE (src),
+					       related_elt->exp);
+		    break;
+		  }
+	    }
+	}

       if (src == src_folded)
 	src_folded = 0;
diff --git a/gcc/rtl.h b/gcc/rtl.h
index 5473cc9f2ddf1863191a3e2b5914ae89598e53b4..6a6de1cf11d0bf5dd38f5ee37901310d653ce722 100644
--- a/gcc/rtl.h
+++ b/gcc/rtl.h
@@ -3425,6 +3425,7 @@ public:
 			      rtx, rtx, rtx);
   rtx simplify_gen_relational (rtx_code, machine_mode, machine_mode, rtx, rtx);
   rtx simplify_gen_subreg (machine_mode, rtx, machine_mode, poly_uint64);
+  rtx simplify_gen_vec_select (rtx, unsigned int);

   /* Tracks the level of MEM nesting for the value being simplified:
      0 means the value is not in a MEM, >0 means it is.  This is needed
@@ -3526,6 +3527,12 @@ simplify_gen_subreg (machine_mode outermode, rtx op, machine_mode innermode,
 			innermode, byte);
 }

+inline rtx
+simplify_gen_vec_select (rtx op, unsigned int index)
+{
+  return simplify_context ().simplify_gen_vec_select (op, index);
+}
+
 inline rtx
 lowpart_subreg (machine_mode outermode, rtx op, machine_mode innermode)
 {
diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
index ebad5cb5a79cf902d4e0e8bc7cdf0e468da573e8..5027af3a39595e1843aecb076cc1322abf034301 100644
--- a/gcc/simplify-rtx.c
+++ b/gcc/simplify-rtx.c
@@ -7468,6 +7468,24 @@ simplify_context::lowpart_subreg (machine_mode outer_mode, rtx expr,
 			      subreg_lowpart_offset (outer_mode, inner_mode));
 }

+/* Generate RTX to select element at INDEX out of vector OP.  */
+
+rtx simplify_context::simplify_gen_vec_select (rtx op, unsigned int index)
+{
+
+  if (!VECTOR_MODE_P (GET_MODE (op)))
+    return NULL_RTX;
+
+  machine_mode imode = GET_MODE_INNER (GET_MODE (op));
+
+  if (index == 0)
+    return lowpart_subreg (imode, op, GET_MODE (op));
+
+  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (index)));
+  return gen_rtx_VEC_SELECT (imode, op, tmp);
+}
+
+
 /* Simplify X, an rtx expression.
    Return the simplified expression or NULL if no simplifications