On POWER8 in little endian mode, the lxvd2x and stxvd2x instructions we use
for loading and storing vectors do not perform a byte swap as part of their
operation.  This means we have to explicitly byte swap a vector before we
store it to memory and byte swap it again after we load it from memory.
These swaps are explicit in our RTL, and we decorate our loads and stores
with swaps as well.
A store operation would then look something like:

(insn 9 8 10 (set (reg:V4SI 126 [ bD.2787 ])
        (vec_select:V4SI (reg/v:V4SI 124 [ bD.2787 ])
            (parallel [
                    (const_int 2 [0x2])
                    (const_int 3 [0x3])
                    (const_int 0 [0])
                    (const_int 1 [0x1])
                ]))) "pr69493-6.c":13 -1
     (nil))

(insn 10 9 0 (set (mem/c:V4SI (plus:DI (reg/f:DI 116 virtual-stack-vars)
                (const_int 16 [0x10])) [1 MEM[(struct  *)&D.2792 + 16B]+0 S16 A128])
        (vec_select:V4SI (reg:V4SI 126 [ bD.2787 ])
            (parallel [
                    (const_int 2 [0x2])
                    (const_int 3 [0x3])
                    (const_int 0 [0])
                    (const_int 1 [0x1])
                ]))) "pr69493-6.c":13 -1
     (nil))

There is similar code for loads.  Correctness-wise, this is all fine and dandy,
but the swaps inhibit optimization of even simple test cases.  For example,
the following test case passes in two vector arguments and returns a struct
that holds those vectors.

typedef struct
{
  __vector int vx0;
  __vector int vx1;
} vec_t;

vec_t
test_big_double (__vector int a, __vector int b)
{
  vec_t result;
  result.vx0 = a;
  result.vx1 = b;
  return result;
}

For our ELFv2 ABI, the vectors are passed in via registers vs34 and vs35
and the struct is returned in registers, also vs34 and vs35.  Ideally, this
should be one large nop and the only instruction generated should be the
blr return insn...and that is what we get on POWER8 BE when compiling with
-mabi=elfv2 (elfv1 doesn't return structs via regs).
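
For reference, here is roughly what that ideal output looks like (and what
the BE compile actually produces, modulo labels and assembler directives):

test_big_double:
        blr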

However, on POWER8 LE, we generate the horrible and useless code:

        addi 8,1,-96
        li 10,32
        xxpermdi 34,34,34,2
        xxpermdi 35,35,35,2
        li 9,48
        stxvd2x 34,8,10
        stxvd2x 35,8,9
        lxvd2x 34,8,10
        lxvd2x 35,8,9
        xxpermdi 34,34,34,2
        xxpermdi 35,35,35,2
        blr

Looking at what happens on BE (-O2 -mcpu=power8 -mabi=elfv2), we start with
the following gimple just before expand:

test_big_double (__vector signed intD.1461 aD.2786, __vector signed intD.1461 bD.2787)
{
  __vector signed intD.1461 a_2(D) = aD.2786;
  __vector signed intD.1461 b_3(D) = bD.2787;
  struct vec_tD.2785 D.2792;

;;   basic block 2, loop depth 0
;;    pred:       ENTRY
  # .MEM_4 = VDEF <.MEM_1(D)>
  MEM[(struct  *)&D.2792] = a_2(D);
  # .MEM_5 = VDEF <.MEM_4>
  MEM[(struct  *)&D.2792 + 16B] = b_3(D);
  # VUSE <.MEM_5>
  return D.2792;
;;    succ:       EXIT

}

This gets expanded to (only showing the rtl for the first vector field):

(insn 2 5 3 2 (set (reg/v:V4SI 123 [ aD.2713 ])
        (reg:V4SI 79 2 [ aD.2713 ])) "pr69493-6.c":9 1121 {*vsx_movv4si_64bit}
     (nil))
(insn 21 4 7 2 (set (reg:DI 127)
        (const_int 32 [0x20])) "pr69493-6.c":13 -1
     (nil))
(insn 7 21 22 2 (set (mem/c:V4SI (plus:DI (reg/f:DI 111 sfp)
                (reg:DI 127)) [1 MEM[(struct  *)&D.2719]+0 S16 A128])
        (reg/v:V4SI 123 [ aD.2713 ])) "pr69493-6.c":13 1121 {*vsx_movv4si_64bit}
     (nil))
(insn 23 8 9 2 (set (reg:DI 129)
        (const_int 32 [0x20])) "pr69493-6.c":13 -1
     (nil))
(insn 9 23 24 2 (set (reg:V4SI 125)
        (mem/c:V4SI (plus:DI (reg/f:DI 111 sfp)
                (reg:DI 129)) [1 D.2719+0 S16 A128])) "pr69493-6.c":13 1121 {*vsx_movv4si_64bit}
     (nil))
(insn 11 10 12 2 (set (reg:V4SI 121 [ <retval> ])
        (reg:V4SI 125)) "pr69493-6.c":13 1121 {*vsx_movv4si_64bit}
     (nil))
(insn 16 12 17 2 (set (reg:V4SI 79 2)
        (reg:V4SI 121 [ <retval> ])) "pr69493-6.c":14 1121 {*vsx_movv4si_64bit}
     (nil))
(insn 18 17 19 2 (use (reg:V4SI 79 2)) "pr69493-6.c":14 -1
     (nil))

CSE1 comes along and replaces the MEM in insn 9 with pseudo 123 since they
are equivalent.  This makes the store in insn 7 dead and DSE deletes it later.
That leaves only simple reg copies, which all get cleaned up, so we end up
with a simple return.  CSE sees the equivalence between pseudo 123 and the
MEM via the assignment in insn 7.  However, on LE, we get the following
expanded rtl (again, only showing the rtl for the first vector field):

(insn 2 5 3 2 (set (reg/v:V4SI 123 [ aD.2786 ])
        (reg:V4SI 79 2 [ aD.2786 ])) "pr69493-6.c":9 1047 {*vsx_movv4si_64bit}
     (nil))

(insn 7 4 25 2 (set (reg:V4SI 125 [ aD.2786 ])
        (vec_select:V4SI (reg/v:V4SI 123 [ aD.2786 ])
            (parallel [
                    (const_int 2 [0x2])
                    (const_int 3 [0x3])
                    (const_int 0 [0])
                    (const_int 1 [0x1])
                ]))) "pr69493-6.c":13 1218 {*vsx_xxpermdi4_le_v4si}
     (nil))
(insn 25 7 8 2 (set (reg:DI 131)
        (const_int 32 [0x20])) "pr69493-6.c":13 -1
     (nil))
(insn 8 25 9 2 (set (mem/c:V4SI (plus:DI (reg/f:DI 111 sfp)
                (reg:DI 131)) [1 MEM[(struct  *)&D.2792]+0 S16 A128])
        (vec_select:V4SI (reg:V4SI 125 [ aD.2786 ])
            (parallel [
                    (const_int 2 [0x2])
                    (const_int 3 [0x3])
                    (const_int 0 [0])
                    (const_int 1 [0x1])
                ]))) "pr69493-6.c":13 1230 {*vsx_stxvd2x4_le_v4si}
     (nil))


(insn 27 10 11 2 (set (reg:DI 133)
        (const_int 32 [0x20])) "pr69493-6.c":13 -1
     (nil))
(insn 11 27 12 2 (set (reg:V4SI 128)
        (vec_select:V4SI (mem/c:V4SI (plus:DI (reg/f:DI 111 sfp)
                    (reg:DI 133)) [1 D.2792+0 S16 A128])
            (parallel [
                    (const_int 2 [0x2])
                    (const_int 3 [0x3])
                    (const_int 0 [0])
                    (const_int 1 [0x1])
                ]))) "pr69493-6.c":13 1224 {*vsx_lxvd2x4_le_v4si}
     (nil))
(insn 12 11 28 2 (set (reg:V4SI 127)
        (vec_select:V4SI (reg:V4SI 128)
            (parallel [
                    (const_int 2 [0x2])
                    (const_int 3 [0x3])
                    (const_int 0 [0])
                    (const_int 1 [0x1])
                ]))) "pr69493-6.c":13 1218 {*vsx_xxpermdi4_le_v4si}
     (nil))
...

In this case, pseudo 125 is equivalent to a byte-swapped pseudo 123, pseudo
123 is equivalent to a byte-swapped MEM, and MEM is equivalent to a
byte-swapped pseudo 125.  Or, in pseudo code, we have the following equivalences:

equiv buckets after insn 7:
    *) 125 equiv SWAP(123),
equiv buckets after insn 8:
    *) 125 equiv SWAP(123),
    *) MEM equiv SWAP(125)

When we scan insn 11, we do not see MEM being equivalent to pseudo 123,
so we don't do the replacement.

However, if we look closer, MEM is equivalent to SWAP(125), which is the
same as SWAP(SWAP(123)), which is just 123, since SWAP is an involutory
operation!  CSE doesn't see this, because we don't have the right
equivalences recorded in the equivalence table.
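
Spelled out in the same notation as the equivalence buckets above, the
missing deduction is:

    MEM equiv SWAP(125)
        equiv SWAP(SWAP(123))    [substituting 125 equiv SWAP(123) from insn 7]
        equiv 123                [since SWAP(SWAP(x)) == x]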

My idea on how to "fix" this (which I'd like some comments on) is, when we
insert an equivalence between A and B into the table, to check whether B is
an expression with an involutory operation OP(INNER_B) (where OP is a byte
swap in our case) and, if so, to also insert an equivalence between INNER_B
and OP(A).  If we look at how that handles the test case above, we get:

equiv buckets after insn 7:
    *) 125 equiv SWAP(123),
    *) 123 equiv SWAP(125)

Now, when we scan insn 8, we add the normal equivalence MEM equiv SWAP(125),
and we see that SWAP(125) is already equivalent to 123, so we add MEM to
that equivalence bucket.  We then add the involutory equivalence
125 equiv SWAP(MEM), which also already exists, leading to the following
equivalence buckets after insn 8:

equiv buckets after insn 8:
    *) 125 equiv SWAP(123) equiv SWAP(MEM)
    *) 123 equiv SWAP(125) equiv MEM

Now, when we scan insn 11, we see that MEM is equivalent to pseudo 123 and
we make the replacement, which lets us optimize the function just like the
BE case, resulting in just a "blr" insn being generated.
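
In C-like pseudo code, the extra bookkeeping amounts to roughly the sketch
below (record_equivalence, insert_equiv, lookup_swap and gen_swap are purely
illustrative names; the real implementation is the change to insert () in
the cse.c hunk attached below):

/* Sketch only: when recording A equiv B, also record the involutory
   consequence.  insert_equiv (), lookup_swap () and gen_swap () are
   hypothetical helpers used just to illustrate the rule.  */
static void
record_equivalence (rtx a, rtx b)
{
  insert_equiv (a, b);          /* The normal A equiv B entry.  */

  /* If B has the form SWAP(INNER_B), where SWAP is involutory, then
     A equiv SWAP(INNER_B) also implies INNER_B equiv SWAP(A), so record
     that equivalence as well.  */
  rtx inner_b = lookup_swap (b);
  if (inner_b != NULL_RTX)
    insert_equiv (inner_b, gen_swap (a));
}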

So it seems like I'm done! ...unfortunately, no. :-(  If I change the vectors
from V4SI to V2DF, as in the test case below:

typedef struct
{
  __vector double vx0;
  __vector double vx1;
} vec_t;

vec_t
test_big_double (__vector double a, __vector double b)
{
  vec_t result;
  result.vx0 = a;
  result.vx1 = b;
  return result;
}

...then I again have problems.  In this case, it's due to exp_equiv_p() not
noticing when an expression is equivalent to itself, so one of our new
involutory equivalences ends up in a new equivalence bucket rather than in
the preexisting one, and we never notice the equivalence we need to make the
replacement.  I debugged this down to what I think is a bug in exp_equiv_p().
It uses the test:

  if (REG_IN_TABLE (i) != REG_TICK (i))
    return 0;

to try to determine whether a register/pseudo has been set after we've
already processed an earlier reference to that register/pseudo.  If so, we
reject the match by returning 0.  It seems to me that we should only reject
these equivalences when we've actually seen an earlier register/pseudo
reference, meaning when REG_IN_TABLE (i) != -1.  Changing the above test to:

  if (REG_IN_TABLE (i) != -1 && REG_IN_TABLE (i) != REG_TICK (i))
    return 0;

...fixes that problem and again, we can optimize this new test case to just
a "blr" insn.

There are still some cases I cannot optimize yet.  Take the 2nd test case
above, but instead of passing in vectors, pass in a struct and assign its
fields one by one, rather than doing a struct assignment, like so:

typedef struct
{
  __vector double vx0;
  __vector double vx1;
} vec_t;

vec_t
test_big_double (vec_t a)
{
  vec_t result;
  result.vx0 = a.vx0;
  result.vx1 = a.vx1;
  return result;
}

Then I get the same bad code as with the original test case.  Again, if I
change the type from double back to int, I get optimal code.

Question for people: am I attacking this problem the correct way?  Do people
see a problem with me adding extra equivalences to the equivalence table?
If so, do you have a different suggestion on how to attack this problem?

I have attached the patch I am using below.  The "important" changes are
to cse.c.  The changes to the rs6000 files are basically just a way of telling
cse.c that the vec_select it's seeing is an involutory byte swap.  I did
have to make one change to rs6000_rtx_costs() to modify the cost of the
byte swap, because a byte swap of even a reg was many times more expensive
than a simple bare mem operand.

Thoughts?

Peter


Index: gcc/cse.c
===================================================================
--- gcc/cse.c   (revision 254891)
+++ gcc/cse.c   (working copy)
@@ -647,6 +647,30 @@ dump_class (struct table_elt *classp)
     }
 }
 
+DEBUG_FUNCTION void
+dump_all_classes (void)
+{
+  long i;
+  struct table_elt *p;
+  bool dumped_class_p = false;
+
+  for (i = FIRST_PSEUDO_REGISTER; i < max_reg_num (); i++)
+    if (REG_QTY (i) >= 0)
+      fprintf (stderr, "reg %ld: REG_IN_TABLE = %d, REG_TICK = %d, REG_QTY = 
%d\n",
+              i, REG_IN_TABLE (i), REG_TICK (i), REG_QTY (i) );
+  fprintf (stderr, "\n");
+
+  for (i = 0; i < HASH_SIZE; i++)
+    for (p = table[i]; p; p = p->next_same_hash)
+      {
+       fprintf (stderr, "---------------------------- [hash=%lu]:\n", i);
+       dump_class (p);
+       dumped_class_p = true;
+      }
+  if (dumped_class_p)
+    fprintf (stderr, "----------------------------\n");
+}
+
 /* Return an estimate of the cost of the registers used in an rtx.
    This is mostly the number of different REG expressions in the rtx;
    however for some exceptions like fixed registers we use a cost of
@@ -1448,6 +1472,16 @@ remove_pseudo_from_table (rtx x, unsigne
     remove_from_table (elt, hash);
 }
 
+/* Check whether rtx X corresponds to an involutory operation.
+   If X is involutory, then return its inner operand minus the
+   operation.  Otherwise, return NULL_RTX.  */
+
+static rtx
+check_involutory_operation (rtx x)
+{
+  return targetm.involutory_operation (x);
+}
+
 /* Look up X in the hash table and return its table element,
    or 0 if X is not in the table.
 
@@ -1714,8 +1748,33 @@ static struct table_elt *
 insert (rtx x, struct table_elt *classp, unsigned int hash,
        machine_mode mode)
 {
-  return insert_with_costs (x, classp, hash, mode,
-                           COST (x, mode), approx_reg_cost (x));
+  struct table_elt *ret = insert_with_costs (x, classp, hash, mode,
+                                            COST (x, mode),
+                                            approx_reg_cost (x));
+  if (!classp)
+    return ret;
+
+  /* We created an equivalence between X and CLASSP->EXP.  If CLASSP->EXP is
+     an expression (OP: INNER_EXP), where OP is an involutory operation
+     (ie, an operation that is its own inverse), then also create an equivalence
+     between INNER_EXP and (OP: X).  This allows us to catch equivalences of
+     the form: If (A) equiv (OP: B) and (B) equiv (OP: C) then (A) equiv (C).
+     See PR69493 for examples where this helps.  */
+  rtx opnd = check_involutory_operation (classp->exp);
+  if (opnd != NULL_RTX)
+    {
+      machine_mode opnd_mode = GET_MODE (opnd);
+      unsigned int opnd_hash = HASH (opnd, opnd_mode);
+      struct table_elt *opnd_classp = lookup (opnd, opnd_hash, opnd_mode);
+      rtx new_exp = copy_rtx (classp->exp);
+      replace_rtx (new_exp, opnd, x, true);
+      machine_mode new_mode = GET_MODE (new_exp);
+      unsigned int new_hash = HASH (new_exp, new_mode);
+      insert_with_costs (new_exp, opnd_classp, new_hash, new_mode,
+                        COST (new_exp, new_mode), approx_reg_cost (new_exp));
+    }
+
+  return ret;
 }
 
 
@@ -2640,7 +2699,7 @@ exp_equiv_p (const_rtx x, const_rtx y, i
            return 1;
 
          for (i = regno; i < endregno; i++)
-           if (REG_IN_TABLE (i) != REG_TICK (i))
+           if (REG_IN_TABLE (i) != -1 && REG_IN_TABLE (i) != REG_TICK (i))
              return 0;
 
          return 1;
Index: gcc/doc/tm.texi
===================================================================
--- gcc/doc/tm.texi     (revision 254891)
+++ gcc/doc/tm.texi     (working copy)
@@ -11831,6 +11831,13 @@ separated by spaces, which the caller wi
 
 @end deftypefn
 
+@deftypefn {Target Hook} rtx TARGET_INVOLUTORY_OPERATION (const_rtx @var{op})
+
+If @var{op} is an involutory operation (ie, an operation that is its
+own inverse), then return the operand that is being operated on.
+Otherwise, return NULL_RTX.
+@end deftypefn
+
 @defmac TARGET_SUPPORTS_WIDE_INT
 
 On older ports, large integers are stored in @code{CONST_DOUBLE} rtl
Index: gcc/doc/tm.texi.in
===================================================================
--- gcc/doc/tm.texi.in  (revision 254891)
+++ gcc/doc/tm.texi.in  (working copy)
@@ -8021,6 +8021,8 @@ and the associated definitions of those
 
 @hook TARGET_OFFLOAD_OPTIONS
 
+@hook TARGET_INVOLUTORY_OPERATION
+
 @defmac TARGET_SUPPORTS_WIDE_INT
 
 On older ports, large integers are stored in @code{CONST_DOUBLE} rtl
Index: gcc/target.def
===================================================================
--- gcc/target.def      (revision 254891)
+++ gcc/target.def      (working copy)
@@ -4209,6 +4209,16 @@ loops containing function calls or branc
  const char *, (const rtx_insn *insn),
  default_invalid_within_doloop)
 
+/* Returns the operand of an involutory operation.  NULL_RTX otherwise.  */
+DEFHOOK
+(involutory_operation,
+ "\n\
+If @var{op} is an involutory operation (ie, an operation that is its\n\
+own inverse), then return the operand that is being operated on.\n\
+Otherwise, return NULL_RTX.",
+ rtx, (const_rtx op),
+ default_involutory_operation)
+
 /* Returns true for a legitimate combined insn.  */
 DEFHOOK
 (legitimate_combined_insn,
Index: gcc/targhooks.c
===================================================================
--- gcc/targhooks.c     (revision 254891)
+++ gcc/targhooks.c     (working copy)
@@ -657,6 +657,17 @@ default_invalid_within_doloop (const rtx
   return NULL;
 }
 
+/* Default implementation of TARGET_INVOLUTORY_OPERATION.
+   If OP is an involutory operation (ie, an operation that is its own
+   inverse), then return the operand that is being operated on.
+   Otherwise, return NULL_RTX.  */
+
+rtx
+default_involutory_operation (const_rtx op)
+{
+  return NULL_RTX;
+}
+
 /* Mapping of builtin functions to vectorized variants.  */
 
 tree
Index: gcc/targhooks.h
===================================================================
--- gcc/targhooks.h     (revision 254891)
+++ gcc/targhooks.h     (working copy)
@@ -86,6 +86,8 @@ extern bool default_has_ifunc_p (void);
 
 extern const char * default_invalid_within_doloop (const rtx_insn *);
 
+extern rtx default_involutory_operation (const_rtx x);
+
 extern tree default_builtin_vectorized_function (unsigned int, tree, tree);
 extern tree default_builtin_md_vectorized_function (tree, tree, tree);
 
Index: gcc/config/rs6000/rs6000-p8swap.c
===================================================================
--- gcc/config/rs6000/rs6000-p8swap.c   (revision 254891)
+++ gcc/config/rs6000/rs6000-p8swap.c   (working copy)
@@ -293,36 +293,44 @@ insn_is_store_p (rtx insn)
   return 0;
 }
 
-/* Return 1 iff INSN swaps doublewords.  This may be a reg-reg swap,
-   a permuting load, or a permuting store.  */
-static unsigned int
-insn_is_swap_p (rtx insn)
+/* If OP is a swapped operand, return the actual operand that is swapped.
+   Otherwise, return NULL_RTX.  */
+rtx
+rs6000_swap_opnd (const_rtx op)
 {
-  rtx body = PATTERN (insn);
-  if (GET_CODE (body) != SET)
-    return 0;
-  rtx rhs = SET_SRC (body);
-  if (GET_CODE (rhs) != VEC_SELECT)
-    return 0;
-  rtx parallel = XEXP (rhs, 1);
+
+  if (GET_CODE (op) != VEC_SELECT)
+    return NULL_RTX;
+  rtx parallel = XEXP (op, 1);
   if (GET_CODE (parallel) != PARALLEL)
-    return 0;
+    return NULL_RTX;
   unsigned int len = XVECLEN (parallel, 0);
   if (len != 2 && len != 4 && len != 8 && len != 16)
-    return 0;
+    return NULL_RTX;
   for (unsigned int i = 0; i < len / 2; ++i)
     {
       rtx op = XVECEXP (parallel, 0, i);
       if (GET_CODE (op) != CONST_INT || INTVAL (op) != len / 2 + i)
-       return 0;
+       return NULL_RTX;
     }
   for (unsigned int i = len / 2; i < len; ++i)
     {
       rtx op = XVECEXP (parallel, 0, i);
       if (GET_CODE (op) != CONST_INT || INTVAL (op) != i - len / 2)
-       return 0;
+       return NULL_RTX;
     }
-  return 1;
+  return XEXP (op, 0);
+}
+
+/* Return 1 iff INSN swaps doublewords.  This may be a reg-reg swap,
+   a permuting load, or a permuting store.  */
+static bool
+insn_is_swap_p (rtx insn)
+{
+  rtx body = PATTERN (insn);
+  if (GET_CODE (body) != SET)
+    return false;
+  return rs6000_swap_opnd (SET_SRC (body)) != NULL_RTX;
 }
 
 /* Return TRUE if insn is a swap fed by a load from the constant pool.  */
Index: gcc/config/rs6000/rs6000-protos.h
===================================================================
--- gcc/config/rs6000/rs6000-protos.h   (revision 254891)
+++ gcc/config/rs6000/rs6000-protos.h   (working copy)
@@ -169,6 +169,7 @@ extern rtx rs6000_address_for_altivec (r
 extern rtx rs6000_allocate_stack_temp (machine_mode, bool, bool);
 extern int rs6000_loop_align (rtx);
 extern void rs6000_split_logical (rtx [], enum rtx_code, bool, bool, bool);
+extern rtx rs6000_swap_opnd (const_rtx);
 #endif /* RTX_CODE */
 
 #ifdef TREE_CODE
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c  (revision 254891)
+++ gcc/config/rs6000/rs6000.c  (working copy)
@@ -1975,6 +1975,9 @@ static const struct attribute_spec rs600
 
 #undef TARGET_STARTING_FRAME_OFFSET
 #define TARGET_STARTING_FRAME_OFFSET rs6000_starting_frame_offset
+
+#undef TARGET_INVOLUTORY_OPERATION
+#define TARGET_INVOLUTORY_OPERATION rs6000_swap_opnd
 
 
 /* Processor table.  */
@@ -34962,6 +34965,25 @@ rs6000_rtx_costs (rtx x, machine_mode mo
        }
       break;
 
+    case VEC_SELECT:
+{
+      rtx opnd = rs6000_swap_opnd (x);
+      if (opnd != NULL_RTX)
+       {
+           if (REG_P (opnd))
+             {
+               *total = 0;
+               return true;
+             }
+           else if (MEM_P (opnd))
+             {
+               *total = rtx_cost (opnd, mode, VEC_SELECT, 0, speed);
+               return true;
+             }
+       }
+}
+      break;
+
     default:
       break;
     }
