Hi Segher,

Thanks for the review!

>>      * config/rs6000/rs6000-p8swap.c (insn_rtx_pair_t): New type.
> 
> Please don't do that.  The "first" and "second" are completely
> meaningless.  Also,  keeping it separate arrays can very well result in
> better machine code, and certainly makes easier to read source code.

OK, I will use separate arrays instead.  Here the first element is the
AND rtx_insn while the second is its fully-expanded rtx; I previously
thought it was better to bundle them together, and make_pair was an
easy way to do that.

> 
>> +static bool
>> +find_alignment_op (rtx_insn *insn, rtx base_reg,
>> +               vec<insn_rtx_pair_t> *and_insn_vec)
> 
> Don't name vecs "_vec" (just keep it "and_insn" here, or sometimes
> and_insns is clearer).

Done, also for those omitted below.  Thanks!

> 
>> -  rtx and_operation = 0;
>> +  rtx and_operation = NULL_RTX;
> 
> Don't change code randomly (to something arguably worse, even).

Done.  I may have overthought it; I assumed NULL_RTX might be
preferred since it could potentially be redefined as nullptr in the
current C++11 context.

Bootstrapped/regtested on powerpc64le-linux-gnu P8 again.

Does the attached version look better?

BR,
Kewen
-----
gcc/ChangeLog:

        * config/rs6000/rs6000-p8swap.c (find_alignment_op): Adjust to
        support multiple definitions which are all AND operations with
        the mask -16B.
        (recombine_lvx_pattern): Adjust to handle multiple AND operations
        from find_alignment_op.
        (recombine_stvx_pattern): Likewise.

gcc/testsuite/ChangeLog:

        * gcc.target/powerpc/pr97019.c: New test.
diff --git a/gcc/config/rs6000/rs6000-p8swap.c 
b/gcc/config/rs6000/rs6000-p8swap.c
index 3d5dc7d8aae..be863aa479e 100644
--- a/gcc/config/rs6000/rs6000-p8swap.c
+++ b/gcc/config/rs6000/rs6000-p8swap.c
@@ -2095,11 +2095,15 @@ alignment_mask (rtx_insn *insn)
   return alignment_with_canonical_addr (SET_SRC (body));
 }
 
-/* Given INSN that's a load or store based at BASE_REG, look for a
-   feeding computation that aligns its address on a 16-byte boundary.
-   Return the rtx and its containing AND_INSN.  */
-static rtx
-find_alignment_op (rtx_insn *insn, rtx base_reg, rtx_insn **and_insn)
+/* Given INSN that's a load or store based at BASE_REG, check if
+   all of its feeding computations align its address on a 16-byte
+   boundary.  If so, return true and add all definition insns into
+   AND_INSNS and their corresponding fully-expanded rtxes for the
+   masking operations into AND_OPS.  */
+
+static bool
+find_alignment_op (rtx_insn *insn, rtx base_reg, vec<rtx_insn *> *and_insns,
+                  vec<rtx> *and_ops)
 {
   df_ref base_use;
   struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
@@ -2111,19 +2115,28 @@ find_alignment_op (rtx_insn *insn, rtx base_reg, 
rtx_insn **and_insn)
        continue;
 
       struct df_link *base_def_link = DF_REF_CHAIN (base_use);
-      if (!base_def_link || base_def_link->next)
-       break;
+      if (!base_def_link)
+       return false;
 
-      /* With stack-protector code enabled, and possibly in other
-        circumstances, there may not be an associated insn for 
-        the def.  */
-      if (DF_REF_IS_ARTIFICIAL (base_def_link->ref))
-       break;
+      while (base_def_link)
+       {
+         /* With stack-protector code enabled, and possibly in other
+            circumstances, there may not be an associated insn for
+            the def.  */
+         if (DF_REF_IS_ARTIFICIAL (base_def_link->ref))
+           return false;
 
-      *and_insn = DF_REF_INSN (base_def_link->ref);
-      and_operation = alignment_mask (*and_insn);
-      if (and_operation != 0)
-       break;
+         rtx_insn *and_insn = DF_REF_INSN (base_def_link->ref);
+         and_operation = alignment_mask (and_insn);
+
+         /* Stop if we find any one which doesn't align.  */
+         if (!and_operation)
+           return false;
+
+         and_insns->safe_push (and_insn);
+         and_ops->safe_push (and_operation);
+         base_def_link = base_def_link->next;
+       }
     }
 
   return and_operation;
@@ -2143,11 +2156,14 @@ recombine_lvx_pattern (rtx_insn *insn, del_info 
*to_delete)
   rtx mem = XEXP (SET_SRC (body), 0);
   rtx base_reg = XEXP (mem, 0);
 
-  rtx_insn *and_insn;
-  rtx and_operation = find_alignment_op (insn, base_reg, &and_insn);
+  auto_vec<rtx_insn *> and_insns;
+  auto_vec<rtx> and_ops;
+  bool is_any_def_and
+    = find_alignment_op (insn, base_reg, &and_insns, &and_ops);
 
-  if (and_operation != 0)
+  if (is_any_def_and)
     {
+      gcc_assert (and_insns.length () == and_ops.length ());
       df_ref def;
       struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
       FOR_EACH_INSN_INFO_DEF (def, insn_info)
@@ -2168,25 +2184,35 @@ recombine_lvx_pattern (rtx_insn *insn, del_info 
*to_delete)
          to_delete[INSN_UID (swap_insn)].replace = true;
          to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn;
 
-         /* However, first we must be sure that we make the
-            base register from the AND operation available
-            in case the register has been overwritten.  Copy
-            the base register to a new pseudo and use that
-            as the base register of the AND operation in
-            the new LVX instruction.  */
-         rtx and_base = XEXP (and_operation, 0);
-         rtx new_reg = gen_reg_rtx (GET_MODE (and_base));
-         rtx copy = gen_rtx_SET (new_reg, and_base);
-         rtx_insn *new_insn = emit_insn_after (copy, and_insn);
-         set_block_for_insn (new_insn, BLOCK_FOR_INSN (and_insn));
-         df_insn_rescan (new_insn);
-
-         XEXP (mem, 0) = gen_rtx_AND (GET_MODE (and_base), new_reg,
-                                      XEXP (and_operation, 1));
+         rtx new_reg = 0;
+         rtx and_mask = 0;
+         for (unsigned i = 0; i < and_insns.length (); ++i)
+           {
+             /* However, first we must be sure that we make the
+                base register from the AND operation available
+                in case the register has been overwritten.  Copy
+                the base register to a new pseudo and use that
+                as the base register of the AND operation in
+                the new LVX instruction.  */
+             rtx_insn *and_insn = and_insns[i];
+             rtx and_op = and_ops[i];
+             rtx and_base = XEXP (and_op, 0);
+             if (!new_reg)
+               {
+                 new_reg = gen_reg_rtx (GET_MODE (and_base));
+                 and_mask = XEXP (and_op, 1);
+               }
+             rtx copy = gen_rtx_SET (new_reg, and_base);
+             rtx_insn *new_insn = emit_insn_after (copy, and_insn);
+             set_block_for_insn (new_insn, BLOCK_FOR_INSN (and_insn));
+             df_insn_rescan (new_insn);
+           }
+
+         XEXP (mem, 0) = gen_rtx_AND (GET_MODE (new_reg), new_reg, and_mask);
          SET_SRC (body) = mem;
          INSN_CODE (insn) = -1; /* Force re-recognition.  */
          df_insn_rescan (insn);
-                 
+
          if (dump_file)
            fprintf (dump_file, "lvx opportunity found at %d\n",
                     INSN_UID (insn));
@@ -2205,11 +2231,14 @@ recombine_stvx_pattern (rtx_insn *insn, del_info 
*to_delete)
   rtx mem = SET_DEST (body);
   rtx base_reg = XEXP (mem, 0);
 
-  rtx_insn *and_insn;
-  rtx and_operation = find_alignment_op (insn, base_reg, &and_insn);
+  auto_vec<rtx_insn *> and_insns;
+  auto_vec<rtx> and_ops;
+  bool is_any_def_and
+    = find_alignment_op (insn, base_reg, &and_insns, &and_ops);
 
-  if (and_operation != 0)
+  if (is_any_def_and)
     {
+      gcc_assert (and_insns.length () == and_ops.length ());
       rtx src_reg = XEXP (SET_SRC (body), 0);
       df_ref src_use;
       struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
@@ -2234,25 +2263,35 @@ recombine_stvx_pattern (rtx_insn *insn, del_info 
*to_delete)
          to_delete[INSN_UID (swap_insn)].replace = true;
          to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn;
 
-         /* However, first we must be sure that we make the
-            base register from the AND operation available
-            in case the register has been overwritten.  Copy
-            the base register to a new pseudo and use that
-            as the base register of the AND operation in
-            the new STVX instruction.  */
-         rtx and_base = XEXP (and_operation, 0);
-         rtx new_reg = gen_reg_rtx (GET_MODE (and_base));
-         rtx copy = gen_rtx_SET (new_reg, and_base);
-         rtx_insn *new_insn = emit_insn_after (copy, and_insn);
-         set_block_for_insn (new_insn, BLOCK_FOR_INSN (and_insn));
-         df_insn_rescan (new_insn);
-
-         XEXP (mem, 0) = gen_rtx_AND (GET_MODE (and_base), new_reg,
-                                      XEXP (and_operation, 1));
+         rtx new_reg = 0;
+         rtx and_mask = 0;
+         for (unsigned i = 0; i < and_insns.length (); ++i)
+           {
+             /* However, first we must be sure that we make the
+                base register from the AND operation available
+                in case the register has been overwritten.  Copy
+                the base register to a new pseudo and use that
+                as the base register of the AND operation in
+                the new STVX instruction.  */
+             rtx_insn *and_insn = and_insns[i];
+             rtx and_op = and_ops[i];
+             rtx and_base = XEXP (and_op, 0);
+             if (!new_reg)
+               {
+                 new_reg = gen_reg_rtx (GET_MODE (and_base));
+                 and_mask = XEXP (and_op, 1);
+               }
+             rtx copy = gen_rtx_SET (new_reg, and_base);
+             rtx_insn *new_insn = emit_insn_after (copy, and_insn);
+             set_block_for_insn (new_insn, BLOCK_FOR_INSN (and_insn));
+             df_insn_rescan (new_insn);
+           }
+
+         XEXP (mem, 0) = gen_rtx_AND (GET_MODE (new_reg), new_reg, and_mask);
          SET_SRC (body) = src_reg;
          INSN_CODE (insn) = -1; /* Force re-recognition.  */
          df_insn_rescan (insn);
-                 
+
          if (dump_file)
            fprintf (dump_file, "stvx opportunity found at %d\n",
                     INSN_UID (insn));
diff --git a/gcc/testsuite/gcc.target/powerpc/pr97019.c 
b/gcc/testsuite/gcc.target/powerpc/pr97019.c
new file mode 100644
index 00000000000..cb4cba4a284
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr97019.c
@@ -0,0 +1,82 @@
+/* This issue can only exist on little-endian P8 targets, since
+   the built-in functions vec_ld/vec_st will use lxvd2x/stxvd2x
+   (P8 big-endian) or lxv/stxv (P9 and later).  */
+/* { dg-do compile { target { powerpc_p8vector_ok && le } } } */
+/* { dg-options "-O2 -mdejagnu-cpu=power8" } */
+
+/* Test there are no useless instructions "rldicr x,y,0,59"
+   to align the addresses for lvx/stvx.  */
+
+extern int a, b, c;
+extern vector unsigned long long ev5, ev6, ev7, ev8;
+extern int dummy (vector unsigned long long);
+
+int test_vec_ld(unsigned char *pe) {
+
+  vector unsigned long long v1, v2, v3, v4, v9;
+  vector unsigned long long v5 = ev5;
+  vector unsigned long long v6 = ev6;
+  vector unsigned long long v7 = ev7;
+  vector unsigned long long v8 = ev8;
+
+  unsigned char *e = pe;
+
+  do {
+    if (a) {
+      v1 = __builtin_vec_ld(16, (unsigned long long *)e);
+      v2 = __builtin_vec_ld(32, (unsigned long long *)e);
+      v3 = __builtin_vec_ld(48, (unsigned long long *)e);
+      e = e + 8;
+      for (int i = 0; i < a; i++) {
+        v4 = v5;
+        v5 = __builtin_crypto_vpmsumd(v1, v6);
+        v6 = __builtin_crypto_vpmsumd(v2, v7);
+        v7 = __builtin_crypto_vpmsumd(v3, v8);
+        e = e + 8;
+      }
+    }
+    v5 = __builtin_vec_ld(16, (unsigned long long *)e);
+    v6 = __builtin_vec_ld(32, (unsigned long long *)e);
+    v7 = __builtin_vec_ld(48, (unsigned long long *)e);
+    if (c)
+      b = 1;
+  } while (b);
+
+  return dummy(v4);
+}
+
+int test_vec_st(unsigned char *pe) {
+
+  vector unsigned long long v1, v2, v3, v4;
+  vector unsigned long long v5 = ev5;
+  vector unsigned long long v6 = ev6;
+  vector unsigned long long v7 = ev7;
+  vector unsigned long long v8 = ev8;
+
+  unsigned char *e = pe;
+
+  do {
+    if (a) {
+      __builtin_vec_st(v1, 16, (unsigned long long *)e);
+      __builtin_vec_st(v2, 32, (unsigned long long *)e);
+      __builtin_vec_st(v3, 48, (unsigned long long *)e);
+      e = e + 8;
+      for (int i = 0; i < a; i++) {
+        v4 = v5;
+        v5 = __builtin_crypto_vpmsumd(v1, v6);
+        v6 = __builtin_crypto_vpmsumd(v2, v7);
+        v7 = __builtin_crypto_vpmsumd(v3, v8);
+        e = e + 8;
+      }
+    }
+    __builtin_vec_st(v5, 16, (unsigned long long *)e);
+    __builtin_vec_st(v6, 32, (unsigned long long *)e);
+    __builtin_vec_st(v7, 48, (unsigned long long *)e);
+    if (c)
+      b = 1;
+  } while (b);
+
+  return dummy(v4);
+}
+
+/* { dg-final { scan-assembler-not "rldicr\[ \t\]+\[0-9\]+,\[0-9\]+,0,59" } } 
*/

Reply via email to