Hi All,

The following example:

#define N 512
#define START 2
#define END 505

int x[N] __attribute__((aligned(32)));

int __attribute__((noipa))
foo (void)
{
  for (signed int i = START; i < END; ++i)
    {
      if (x[i] == 0)
        return i;
    }
  return -1;
}

generates incorrect code with fixed-length SVE, because with early break
vectorization we need to know which value to start the scalar loop with if we
take an early exit.

Historically this meant that we took the first element of every induction
variable.  This is because there was an assumption in place that, even with
masked loops, the masks come from a whilel* instruction.  As such we reduce
using a BIT_FIELD_REF <, 0>.

When peeling for alignment (PFA) was added this assumption was correct for
non-masked loops; however, we assumed that PFA for VLA wouldn't work for now
and disabled it using the alignment requirement checks.  We also expected VLS
to perform PFA using scalar loops.

However, as this PR shows, for VLS the vectorizer can, and in some
circumstances does, choose to peel using masks by masking the first iteration
of the loop with an additional alignment mask.  When this is done, the first
elements of the predicate can be inactive.  In this example element 1 is
inactive based on the calculated misalignment, hence the -1 value in the first
vector IV element.  When we reduce using BIT_FIELD_REF we then get the wrong
value.

This patch fixes it by creating a new scalar PHI that keeps track of whether
we are in the first iteration of the loop (with the additional masking) or
whether we have already taken a loop iteration.

The generated sequence:

pre-header:
bb1:
  i_1 = <number of leading inactive elements>

header:
bb2:
  i_2 = PHI <i_1(bb1), 0(latch)>
  …

early-exit:
bb3:
  i_3 = iv_step * i_2 + PHI <vector-iv>

This eliminates the need for an expensive mask-based reduction.
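To make the failure mode concrete, here is a standalone sketch of the
adjustment (illustrative only, not part of the patch; the lane count, IV step
and number of leading inactive lanes are assumed values, not taken from the
PR):

/* Illustrative sketch: 4 lanes, IV step 1 and 2 leading inactive lanes
   are assumptions for the example.  */
#include <stdio.h>

#define LANES 4

int
main (void)
{
  int iv_step = 1;
  int start = 2;   /* First element the scalar loop would touch.  */
  int skip = 2;    /* Leading inactive lanes from the alignment mask.  */

  /* The vector IV of the first (masked) iteration starts at the aligned
     position, i.e. rewound by `skip' elements.  */
  int vec_iv[LANES];
  for (int lane = 0; lane < LANES; lane++)
    vec_iv[lane] = start - skip + lane;

  /* Old reduction: BIT_FIELD_REF <vec_iv, 0> extracts lane 0 even though
     that lane is inactive, giving the wrong scalar start value.  */
  int old_value = vec_iv[0];

  /* New scheme: i_2 is `skip' when the early exit is taken in the first
     iteration and 0 afterwards, so the adjustment recovers the first
     *active* element without a mask-based reduction.  */
  int i_2 = skip;
  int new_value = iv_step * i_2 + vec_iv[0];

  printf ("old=%d new=%d\n", old_value, new_value);   /* old=0 new=2 */
  return 0;
}

In every later iteration i_2 is 0, so the adjustment degenerates to the plain
first-lane extraction and only the first (masked) iteration pays for the
correction.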
#define N 512 #define START 2 #define END 505 int x[N] __attribute__((aligned(32))); int __attribute__((noipa)) foo (void) { for (signed int i = START; i < END; ++i) { if (x[i] == 0) return i; } return -1; } generates incorrect code with fixed length SVE because for early break we need to know which value to start the scalar loop with if we take an early exit. Historically this means that we take the first element of every induction. this is because there's an assumption in place, that even with masked loops the masks come from a whilel* instruction. As such we reduce using a BIT_FIELD_REF <, 0>. When PFA was added this assumption was correct for non-masked loop, however we assumed that PFA for VLA wouldn't work for now, and disabled it using the alignment requirement checks. We also expected VLS to PFA using scalar loops. However as this PR shows, for VLS the vectorizer can, and does in some circumstances choose to peel using masks by masking the first iteration of the loop with an additional alignment mask. When this is done, the first elements of the predicate can be inactive. In this example element 1 is inactive based on the calculated misalignment. hence the -1 value in the first vector IV element. When we reduce using BIT_FIELD_REF we get the wrong value. This patch updates it by creating a new scalar PHI that keeps track of whether we are the first iteration of the loop (with the additional masking) or whether we have taken a loop iteration already. The generated sequence: pre-header: bb1: i_1 = <number of leading inactive elements> header: bb2: i_2 = PHI <i_1(bb1), 0(latch)> … early-exit: bb3: i_3 = iv_step * i_2 + PHI<vector-iv> Which eliminates the need to do an expensive mask based reduction. This fixes gromacs with one OpenMP thread. But with > 1 there is still an issue. Bootstrapped Regtested on aarch64-none-linux-gnu, arm-none-linux-gnueabihf, x86_64-pc-linux-gnu -m32, -m64 and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: PR tree-optimization/119351 * tree-vect-loop-manip.cc (vect_can_peel_nonlinear_iv_p): Reject PFA with masking with non-linear IVs. * tree-vect-loop.cc (vectorizable_induction): Support PFA for masking. gcc/testsuite/ChangeLog: PR tree-optimization/119351 * gcc.target/aarch64/sve/peel_ind_10.c: New test. * gcc.target/aarch64/sve/peel_ind_10_run.c: New test. * gcc.target/aarch64/sve/peel_ind_5.c: New test. * gcc.target/aarch64/sve/peel_ind_5_run.c: New test. * gcc.target/aarch64/sve/peel_ind_6.c: New test. * gcc.target/aarch64/sve/peel_ind_6_run.c: New test. * gcc.target/aarch64/sve/peel_ind_7.c: New test. * gcc.target/aarch64/sve/peel_ind_7_run.c: New test. * gcc.target/aarch64/sve/peel_ind_8.c: New test. * gcc.target/aarch64/sve/peel_ind_8_run.c: New test. * gcc.target/aarch64/sve/peel_ind_9.c: New test. * gcc.target/aarch64/sve/peel_ind_9_run.c: New test. --- diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10.c new file mode 100644 index 0000000000000000000000000000000000000000..b7a7bc5cb0cfdfdb74adb120c54ba15019832cf1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10.c @@ -0,0 +1,24 @@ +/* Fix for PR119351 alignment peeling with vectors and VLS. 
*/ +/* { dg-do compile } */ +/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */ + +#define N 512 +#define START 0 +#define END 505 + +int x[N] __attribute__((aligned(32))); + +int __attribute__((noipa)) +foo (int start) +{ + for (unsigned int i = start; i < END; ++i) + { + if (x[i] == 0) + return i; + } + return -1; +} + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */ +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10_run.c new file mode 100644 index 0000000000000000000000000000000000000000..6169aebcc40cc1553f30c1af61ccec91b51cdb42 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10_run.c @@ -0,0 +1,17 @@ +/* Fix for PR119351 alignment peeling with vectors and VLS. */ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */ +/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */ +/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */ + +#include "peel_ind_10.c" + +int __attribute__ ((optimize (1))) +main (void) +{ + int res = foo (START); + asm volatile (""); + if (res != START) + __builtin_abort (); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5.c new file mode 100644 index 0000000000000000000000000000000000000000..a03bb1dec21ef75aa0cbfb22c8bb02b99644239e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5.c @@ -0,0 +1,24 @@ +/* Fix for PR119351 alignment peeling with vectors and VLS. */ +/* { dg-do compile } */ +/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */ + +#define N 512 +#define START 2 +#define END 505 + +int x[N] __attribute__((aligned(32))); + +int __attribute__((noipa)) +foo (void) +{ + for (signed int i = START; i < END; ++i) + { + if (x[i] == 0) + return i; + } + return -1; +} + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */ +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5_run.c new file mode 100644 index 0000000000000000000000000000000000000000..f26befeab7e53561f84b037aec857b44cf018456 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5_run.c @@ -0,0 +1,17 @@ +/* Fix for PR119351 alignment peeling with vectors and VLS. 
*/ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */ +/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */ +/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */ + +#include "peel_ind_5.c" + +int __attribute__ ((optimize (1))) +main (void) +{ + int res = foo (); + asm volatile (""); + if (res != START) + __builtin_abort (); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6.c new file mode 100644 index 0000000000000000000000000000000000000000..9bfd1a65c4feb0c140d4abf98508fc8af08042ba --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6.c @@ -0,0 +1,24 @@ +/* Fix for PR119351 alignment peeling with vectors and VLS. */ +/* { dg-do compile } */ +/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */ + +#define N 512 +#define START 1 +#define END 505 + +int x[N] __attribute__((aligned(32))); + +int __attribute__((noipa)) +foo (int start) +{ + for (unsigned int i = start; i < END; ++i) + { + if (x[i] == 0) + return i; + } + return -1; +} + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */ +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6_run.c new file mode 100644 index 0000000000000000000000000000000000000000..4fdf3e4e7cac70dc48bad487db37e1e5838b87ab --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6_run.c @@ -0,0 +1,17 @@ +/* Fix for PR119351 alignment peeling with vectors and VLS. */ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */ +/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */ +/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */ + +#include "peel_ind_6.c" + +int __attribute__ ((optimize (1))) +main (void) +{ + int res = foo (START); + asm volatile (""); + if (res != START) + __builtin_abort (); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7.c new file mode 100644 index 0000000000000000000000000000000000000000..0182e131a173b7b05e88c3393ba854b2da25c6b2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7.c @@ -0,0 +1,24 @@ +/* Fix for PR119351 alignment peeling with vectors and VLS. 
*/ +/* { dg-do compile } */ +/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */ + +#define N 512 +#define START 1 +#define END 505 + +int x[N] __attribute__((aligned(32))); + +int __attribute__((noipa)) +foo (void) +{ + for (unsigned int i = START; i < END; ++i) + { + if (x[i] == 0) + return i; + } + return -1; +} + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */ +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7_run.c new file mode 100644 index 0000000000000000000000000000000000000000..05608dd85f13912f8555ac3f39284f6894875998 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7_run.c @@ -0,0 +1,17 @@ +/* Fix for PR119351 alignment peeling with vectors and VLS. */ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */ +/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */ +/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */ + +#include "peel_ind_7.c" + +int __attribute__ ((optimize (1))) +main (void) +{ + int res = foo (); + asm volatile (""); + if (res != START) + __builtin_abort (); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8.c new file mode 100644 index 0000000000000000000000000000000000000000..043348b55d0e8e5e5a5c461b4a4f22b45dfba8e8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8.c @@ -0,0 +1,24 @@ +/* Fix for PR119351 alignment peeling with vectors and VLS. */ +/* { dg-do compile } */ +/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */ + +#define N 512 +#define START 1 +#define END 505 + +int x[N] __attribute__((aligned(32))); + +int __attribute__((noipa)) +foo (void) +{ + for (unsigned int i = START; i < END; i*=2) + { + if (x[i] == 0) + return i; + } + return -1; +} + +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ +/* { dg-final { scan-tree-dump-not "pfa_iv_offset" "vect" } } */ +/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8_run.c new file mode 100644 index 0000000000000000000000000000000000000000..aa8612248bffdc9f4367b8f6699d395ab2726dec --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8_run.c @@ -0,0 +1,17 @@ +/* Fix for PR119351 alignment peeling with vectors and VLS. 
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
+/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
+
+#include "peel_ind_8.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  int res = foo ();
+  asm volatile ("");
+  if (res != START)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9.c
new file mode 100644
index 0000000000000000000000000000000000000000..cc904e88170f072e1d3c6be86643d99a7cd5cb12
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9.c
@@ -0,0 +1,25 @@
+/* Fix for PR119351 alignment peeling with vectors and VLS.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define N 512
+#define START 1
+#define END 505
+
+int x[N] __attribute__((aligned(32)));
+
+int __attribute__((noipa))
+foo (void)
+{
+  for (int *p = x + START; p < x + END; p++)
+    {
+      if (*p == 0)
+        return START;
+    }
+  return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* Peels using a scalar loop.  */
+/* { dg-final { scan-tree-dump-not "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9_run.c
new file mode 100644
index 0000000000000000000000000000000000000000..767f8bd284ca7c3b9f595c5428c20175ed176d96
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9_run.c
@@ -0,0 +1,17 @@
+/* Fix for PR119351 alignment peeling with vectors and VLS.  */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
+/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
+
+#include "peel_ind_9.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  int res = foo ();
+  asm volatile ("");
+  if (res != START)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 56a4e9a8b63f3cae0bf596bf5d22893887dc80e8..0722679d6e66e5dd5af4ec1ce591f7c38b76d07f 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -2195,6 +2195,22 @@ vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
       return false;
     }
 
+  /* With early break vectorization we don't know whether the accesses will
+     stay inside the loop or not.  TODO: The early break adjustment code can
+     be implemented the same way for vectorizable_linear_induction.  However
+     we can't test this today so reject it.  */
+  if (niters_skip != NULL_TREE
+      && vect_use_loop_mask_for_alignment_p (loop_vinfo)
+      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+      && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                         "Peeling for alignment using masking is not supported"
+                         " for nonlinear induction when using early breaks.\n");
+      return false;
+    }
+
   return true;
 }
 
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 9413dcef702597ab27165e676546b190e2bd36ba..6dcdee19bb250993d8cc6b0057d2fa46245d04d9 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10678,6 +10678,104 @@ vectorizable_induction (loop_vec_info loop_vinfo,
                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
           peel_mul = gimple_build_vector_from_val (&init_stmts,
                                                    step_vectype, peel_mul);
+
+          /* If early break then we have to create a new PHI which we can use
+             as an offset to adjust the induction reduction in early exits.  */
+          if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+            {
+              auto skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+              tree ty_skip_niters = TREE_TYPE (skip_niters);
+              tree break_lhs_phi = NULL_TREE;
+              break_lhs_phi = vect_get_new_vect_var (ty_skip_niters,
+                                                     vect_scalar_var,
+                                                     "pfa_iv_offset");
+              gphi *nphi = create_phi_node (break_lhs_phi, bb);
+              add_phi_arg (nphi, skip_niters, pe, UNKNOWN_LOCATION);
+              add_phi_arg (nphi, build_zero_cst (ty_skip_niters),
+                           loop_latch_edge (iv_loop), UNKNOWN_LOCATION);
+
+              /* Rewrite all the early exit usages.  */
+              tree phi_lhs = PHI_RESULT (phi);
+              imm_use_iterator iter;
+              use_operand_p use_p;
+              gimple *use_stmt;
+
+              FOR_EACH_IMM_USE_FAST (use_p, iter, phi_lhs)
+                {
+                  use_stmt = USE_STMT (use_p);
+                  if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt))
+                      && is_a <gphi *> (use_stmt))
+                    {
+                      auto gsi = gsi_last_bb (use_stmt->bb);
+                      for (auto exit : get_loop_exit_edges (iv_loop))
+                        if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo)
+                            && bb == exit->src)
+                          {
+                            /* Now create the PHI for the outside loop usage
+                               to retrieve the value for the offset counter.  */
+                            tree rphi_lhs = make_ssa_name (ty_skip_niters);
+                            gphi *rphi
+                              = create_phi_node (rphi_lhs, use_stmt->bb);
+                            for (unsigned i = 0;
+                                 i < gimple_phi_num_args (rphi); i++)
+                              SET_PHI_ARG_DEF (rphi, i, PHI_RESULT (nphi));
+
+                            tree tmp = make_ssa_name (TREE_TYPE (phi_lhs));
+                            tree stmt_lhs = PHI_RESULT (use_stmt);
+                            imm_use_iterator iter2;
+                            gimple *use_stmt2;
+                            use_operand_p use2_p;
+
+                            /* Now rewrite all the usages first.  */
+                            FOR_EACH_IMM_USE_STMT (use_stmt2, iter2, stmt_lhs)
+                              FOR_EACH_IMM_USE_ON_STMT (use2_p, iter2)
+                                SET_USE (use2_p, tmp);
+
+                            /* And then generate the adjustment, to stop the
+                               update code from updating this new usage.  The
+                               multiplication is to get the original IV and the
+                               downwards counting IV correct.  */
+                            gimple_seq iv_stmts = NULL;
+                            tree rphi_step
+                              = gimple_convert (&iv_stmts, ty_skip_niters,
+                                                step_expr);
+                            tree tmp2
+                              = gimple_build (&iv_stmts, MULT_EXPR,
+                                              ty_skip_niters, rphi_step,
+                                              PHI_RESULT (rphi));
+
+                            if (POINTER_TYPE_P (TREE_TYPE (stmt_lhs)))
+                              tmp2
+                                = gimple_build (&iv_stmts, POINTER_PLUS_EXPR,
+                                                TREE_TYPE (stmt_lhs), stmt_lhs,
+                                                tmp2);
+                            else
+                              {
+                                tmp2
+                                  = gimple_convert (&iv_stmts,
+                                                    TREE_TYPE (stmt_lhs),
+                                                    tmp2);
+                                tmp2
+                                  = gimple_build (&iv_stmts, PLUS_EXPR,
+                                                  TREE_TYPE (stmt_lhs), stmt_lhs,
+                                                  tmp2);
+                              }
+
+                            gsi_insert_seq_before (&gsi, iv_stmts,
+                                                   GSI_SAME_STMT);
+                            gimple *cvt_stmt
+                              = gimple_build_assign (tmp, VIEW_CONVERT_EXPR,
+                                                     build1 (VIEW_CONVERT_EXPR,
+                                                             TREE_TYPE (phi_lhs),
+                                                             tmp2));
+                            gsi_insert_before (&gsi, cvt_stmt, GSI_SAME_STMT);
+                          }
+                      /* All early exits point to the same common block, so we
+                         only have to find the first one.  */
+                      break;
+                    }
+                }
+            }
         }
       tree step_mul = NULL_TREE;
       unsigned ivn;
--