Hi All,

Consider low overhead loops like:

void
foo (char *restrict a, int *restrict b, int *restrict c, int n)
{
  for (int i = 0; i < 9; i++)
    {
      int res = c[i];
      int t = b[i];
      if (a[i] != 0)
        res = t;
      c[i] = res;
    }
}

For such loops we use latency-only costing since the loop bound is known and
small.

The current costing however does not consider the case where niters < VF.

So when comparing the scalar vs vector costs it doesn't take into account that
the scalar loop only ever executes niters iterations, not VF.  This makes it
overestimate the cost of the scalar loop and we incorrectly vectorize.

This patch takes the minimum of the VF and niters in such cases.
Before the patch we generate:

 note:  Original vector body cost = 46
 note:  Vector loop iterates at most 1 times
 note:  Scalar issue estimate:
 note:    load operations = 2
 note:    store operations = 1
 note:    general operations = 1
 note:    reduction latency = 0
 note:    estimated min cycles per iteration = 1.000000
 note:    estimated cycles per vector iteration (for VF 32) = 32.000000
 note:  SVE issue estimate:
 note:    load operations = 5
 note:    store operations = 4
 note:    general operations = 11
 note:    predicate operations = 12
 note:    reduction latency = 0
 note:    estimated min cycles per iteration without predication = 5.500000
 note:    estimated min cycles per iteration for predication = 12.000000
 note:    estimated min cycles per iteration = 12.000000
 note:  Low iteration count, so using pure latency costs
 note:  Cost model analysis:

vs after:

 note:  Original vector body cost = 46
 note:  Scalar loop iterates at most 9 times.  Capping VF from 32 to 9
 note:  Vector loop iterates at most 1 times
 note:  Scalar issue estimate:
 note:    load operations = 2
 note:    store operations = 1
 note:    general operations = 1
 note:    reduction latency = 0
 note:    estimated min cycles per iteration = 1.000000
 note:    estimated cycles per vector iteration (for VF 9) = 9.000000
 note:  SVE issue estimate:
 note:    load operations = 5
 note:    store operations = 4
 note:    general operations = 11
 note:    predicate operations = 12
 note:    reduction latency = 0
 note:    estimated min cycles per iteration without predication = 5.500000
 note:    estimated min cycles per iteration for predication = 12.000000
 note:    estimated min cycles per iteration = 12.000000
 note:  Increasing body cost to 1472 because the scalar code could issue within 
the limit imposed by predicate operations
 note:  Low iteration count, so using pure latency costs
 note:  Cost model analysis:

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues, and fixes the
upcoming vectorization regression on exchange2 in SPECCPU 2017.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * config/aarch64/aarch64.cc (adjust_body_cost): Cap the estimated VF
        at the known iteration count when comparing scalar and vector costs.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/sve/asrdiv_4.c: Update bounds.
        * gcc.target/aarch64/sve/cond_asrd_2.c: Likewise.
        * gcc.target/aarch64/sve/cond_uxt_6.c: Likewise.
        * gcc.target/aarch64/sve/cond_uxt_7.c: Likewise.
        * gcc.target/aarch64/sve/cond_uxt_8.c: Likewise.
        * gcc.target/aarch64/sve/miniloop_1.c: Likewise.
        * gcc.target/aarch64/sve/spill_6.c: Likewise.
        * gcc.target/aarch64/sve/sve_iters_low_1.c: New test.
        * gcc.target/aarch64/sve/sve_iters_low_2.c: New test.

---
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
6ccf08d1cc0a1aecfc72f95b105ace2c00b1a51d..afb58fd88795a26064c8c74f337324e3ecebc389
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -17565,6 +17565,20 @@ adjust_body_cost (loop_vec_info loop_vinfo,
     dump_printf_loc (MSG_NOTE, vect_location,
                     "Original vector body cost = %d\n", body_cost);
 
+  /* If the iteration count is known and low we use latency only calculation,
+     however if the iteration count is lower than VF then the estimate for the
+     scalar loops will be too high.  Cap it at NITERS.  */
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+    {
+      auto niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
+      if (niters < estimated_vf && dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "Scalar loop iterates at most %wd times.  Capping VF "
+                        "from %d to %wd\n", niters, estimated_vf, niters);
+
+      estimated_vf = MIN (estimated_vf, niters);
+    }
+
   fractional_cost scalar_cycles_per_iter
     = scalar_ops.min_cycles_per_iter () * estimated_vf;
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c 
b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
index 
6684fe1c12441399faac941bdca79d90de1eb611..10a96a894afd690cf9eb521ae5195f6669e3e2aa
 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
@@ -15,12 +15,12 @@
   }
 
 #define TEST_ALL(T) \
-  T (int16_t, int8_t, 7) \
-  T (int32_t, int8_t, 3) \
-  T (int32_t, int16_t, 3) \
-  T (int64_t, int8_t, 5) \
-  T (int64_t, int16_t, 5) \
-  T (int64_t, int32_t, 5)
+  T (int16_t, int8_t, 70) \
+  T (int32_t, int8_t, 30) \
+  T (int32_t, int16_t, 30) \
+  T (int64_t, int8_t, 50) \
+  T (int64_t, int16_t, 50) \
+  T (int64_t, int32_t, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c 
b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
index 
e4040ee3520c8dd50282e4aeaf4930c7f66c929c..db1721efbc7bd68f0954f1c76eb15ed75dc91cfb
 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
@@ -14,12 +14,12 @@
   }
 
 #define TEST_ALL(T) \
-  T (int16_t, int8_t, 7) \
-  T (int32_t, int8_t, 3) \
-  T (int32_t, int16_t, 3) \
-  T (int64_t, int8_t, 5) \
-  T (int64_t, int16_t, 5) \
-  T (int64_t, int32_t, 5)
+  T (int16_t, int8_t, 70) \
+  T (int32_t, int8_t, 30) \
+  T (int32_t, int16_t, 30) \
+  T (int64_t, int8_t, 50) \
+  T (int64_t, int16_t, 50) \
+  T (int64_t, int32_t, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c 
b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c
index 
e47276a3a352bd61d2c508167344a2e60e8bc84c..b8b3e862d0a1968ddf9f51278e6b3813e16a7665
 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c
@@ -14,11 +14,11 @@
   }
 
 #define TEST_ALL(T)                    \
-  T (int32_t, uint16_t, 0xff, 3)       \
+  T (int32_t, uint16_t, 0xff, 30)      \
                                        \
-  T (int64_t, uint16_t, 0xff, 5)       \
-  T (int64_t, uint32_t, 0xff, 5)       \
-  T (int64_t, uint32_t, 0xffff, 5)
+  T (int64_t, uint16_t, 0xff, 50)      \
+  T (int64_t, uint32_t, 0xff, 50)      \
+  T (int64_t, uint32_t, 0xffff, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c 
b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c
index 
f49915c4ac1430b0260589156d98b0793c78999f..2d02fb70f33fcf9a76c73b03c23f5a2d33e01b92
 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c
@@ -14,11 +14,11 @@
   }
 
 #define TEST_ALL(T)                    \
-  T (int32_t, uint16_t, 0xff, 3)       \
+  T (int32_t, uint16_t, 0xff, 30)      \
                                        \
-  T (int64_t, uint16_t, 0xff, 5)       \
-  T (int64_t, uint32_t, 0xff, 5)       \
-  T (int64_t, uint32_t, 0xffff, 5)
+  T (int64_t, uint16_t, 0xff, 50)      \
+  T (int64_t, uint32_t, 0xff, 50)      \
+  T (int64_t, uint32_t, 0xffff, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c 
b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c
index 
42eb4b2661b3f152763ac2c8382877e5116dedd4..8fe2455687b5a310179c26c2c6d7c70b33101612
 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c
@@ -14,11 +14,11 @@
   }
 
 #define TEST_ALL(T)                    \
-  T (int32_t, uint16_t, 0xff, 3)       \
+  T (int32_t, uint16_t, 0xff, 30)      \
                                        \
-  T (int64_t, uint16_t, 0xff, 5)       \
-  T (int64_t, uint32_t, 0xff, 5)       \
-  T (int64_t, uint32_t, 0xffff, 5)
+  T (int64_t, uint16_t, 0xff, 50)      \
+  T (int64_t, uint32_t, 0xff, 50)      \
+  T (int64_t, uint32_t, 0xffff, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c
index 
09eb4146816cc51af5829f15b6c287aca086c382..cd1fd2b8a078a3a6aa9741aea0d10a7a420e137c
 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c
@@ -6,7 +6,7 @@ void loop (int * __restrict__ a, int * __restrict__ b, int * 
__restrict__ c,
           int * __restrict__ g, int * __restrict__ h)
 {
   int i = 0;
-  for (i = 0; i < 3; i++)
+  for (i = 0; i < 30; i++)
     {
       a[i] += i;
       b[i] += i;
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c 
b/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c
index 
ae9c338f5696a9f743696b58f3d8e1dd991de501..2ff969ced00955403eaa1aea7223e209fdb6f139
 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c
@@ -11,20 +11,20 @@ void consumer (void *);
   {                                                                    \
     if (which)                                                         \
       {                                                                        
\
-       for (int i = 0; i < 7; ++i)                                     \
+       for (int i = 0; i < 70; ++i)                                    \
          x1[i] += VAL;                                                 \
        consumer (x1);                                                  \
-       for (int i = 0; i < 7; ++i)                                     \
+       for (int i = 0; i < 70; ++i)                                    \
          x2[i] -= VAL;                                                 \
        consumer (x2);                                                  \
       }                                                                        
\
     else                                                               \
       {                                                                        
\
-       for (int i = 0; i < 7; ++i)                                     \
+       for (int i = 0; i < 70; ++i)                                    \
          x3[i] &= VAL;                                                 \
        consumer (x3);                                                  \
       }                                                                        
\
-    for (int i = 0; i < 7; ++i)                                                
\
+    for (int i = 0; i < 70; ++i)                                       \
       x4[i] |= VAL;                                                    \
     consumer (x4);                                                     \
   }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c
new file mode 100644
index 
0000000000000000000000000000000000000000..952a4b1cd580c4eaed4c7c470cd1efba8e5e931d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv9-a -Ofast -fdump-tree-vect-details" } 
*/
+
+void
+foo (char *restrict a, int *restrict b, int *restrict c, int n)
+{
+  for (int i = 0; i < 9; i++)
+    {
+      int res = c[i];
+      int t = b[i];
+      if (a[i] != 0)
+        res = t;
+      c[i] = res;
+    }
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c 
b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c
new file mode 100644
index 
0000000000000000000000000000000000000000..02d10de2a6215d38728a84123d038c2db605b5a0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv9-a -Ofast -fdump-tree-vect-details" } 
*/
+
+void
+foo (char *restrict a, int *restrict b, int *restrict c, int n, int stride)
+{
+  if (stride <= 1)
+    return;
+
+  for (int i = 0; i < 9; i++)
+    {
+      int res = c[i];
+      int t = b[i*stride];
+      if (a[i] != 0)
+        res = t;
+      c[i] = res;
+    }
+}
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */




-- 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 6ccf08d1cc0a1aecfc72f95b105ace2c00b1a51d..afb58fd88795a26064c8c74f337324e3ecebc389 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -17565,6 +17565,20 @@ adjust_body_cost (loop_vec_info loop_vinfo,
     dump_printf_loc (MSG_NOTE, vect_location,
 		     "Original vector body cost = %d\n", body_cost);
 
+  /* If the iteration count is known and low we use latency only calculation,
+     however if the iteration count is lower than VF then the estimate for the
+     scalar loops will be too high.  Cap it at NITERS.  */
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+    {
+      auto niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
+      if (niters < estimated_vf && dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "Scalar loop iterates at most %wd times.  Capping VF "
+			 "from %d to %wd\n", niters, estimated_vf, niters);
+
+      estimated_vf = MIN (estimated_vf, niters);
+    }
+
   fractional_cost scalar_cycles_per_iter
     = scalar_ops.min_cycles_per_iter () * estimated_vf;
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
index 6684fe1c12441399faac941bdca79d90de1eb611..10a96a894afd690cf9eb521ae5195f6669e3e2aa 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
@@ -15,12 +15,12 @@
   }
 
 #define TEST_ALL(T) \
-  T (int16_t, int8_t, 7) \
-  T (int32_t, int8_t, 3) \
-  T (int32_t, int16_t, 3) \
-  T (int64_t, int8_t, 5) \
-  T (int64_t, int16_t, 5) \
-  T (int64_t, int32_t, 5)
+  T (int16_t, int8_t, 70) \
+  T (int32_t, int8_t, 30) \
+  T (int32_t, int16_t, 30) \
+  T (int64_t, int8_t, 50) \
+  T (int64_t, int16_t, 50) \
+  T (int64_t, int32_t, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
index e4040ee3520c8dd50282e4aeaf4930c7f66c929c..db1721efbc7bd68f0954f1c76eb15ed75dc91cfb 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
@@ -14,12 +14,12 @@
   }
 
 #define TEST_ALL(T) \
-  T (int16_t, int8_t, 7) \
-  T (int32_t, int8_t, 3) \
-  T (int32_t, int16_t, 3) \
-  T (int64_t, int8_t, 5) \
-  T (int64_t, int16_t, 5) \
-  T (int64_t, int32_t, 5)
+  T (int16_t, int8_t, 70) \
+  T (int32_t, int8_t, 30) \
+  T (int32_t, int16_t, 30) \
+  T (int64_t, int8_t, 50) \
+  T (int64_t, int16_t, 50) \
+  T (int64_t, int32_t, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c
index e47276a3a352bd61d2c508167344a2e60e8bc84c..b8b3e862d0a1968ddf9f51278e6b3813e16a7665 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c
@@ -14,11 +14,11 @@
   }
 
 #define TEST_ALL(T)			\
-  T (int32_t, uint16_t, 0xff, 3)	\
+  T (int32_t, uint16_t, 0xff, 30)	\
 					\
-  T (int64_t, uint16_t, 0xff, 5)	\
-  T (int64_t, uint32_t, 0xff, 5)	\
-  T (int64_t, uint32_t, 0xffff, 5)
+  T (int64_t, uint16_t, 0xff, 50)	\
+  T (int64_t, uint32_t, 0xff, 50)	\
+  T (int64_t, uint32_t, 0xffff, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c
index f49915c4ac1430b0260589156d98b0793c78999f..2d02fb70f33fcf9a76c73b03c23f5a2d33e01b92 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c
@@ -14,11 +14,11 @@
   }
 
 #define TEST_ALL(T)			\
-  T (int32_t, uint16_t, 0xff, 3)	\
+  T (int32_t, uint16_t, 0xff, 30)	\
 					\
-  T (int64_t, uint16_t, 0xff, 5)	\
-  T (int64_t, uint32_t, 0xff, 5)	\
-  T (int64_t, uint32_t, 0xffff, 5)
+  T (int64_t, uint16_t, 0xff, 50)	\
+  T (int64_t, uint32_t, 0xff, 50)	\
+  T (int64_t, uint32_t, 0xffff, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c
index 42eb4b2661b3f152763ac2c8382877e5116dedd4..8fe2455687b5a310179c26c2c6d7c70b33101612 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c
@@ -14,11 +14,11 @@
   }
 
 #define TEST_ALL(T)			\
-  T (int32_t, uint16_t, 0xff, 3)	\
+  T (int32_t, uint16_t, 0xff, 30)	\
 					\
-  T (int64_t, uint16_t, 0xff, 5)	\
-  T (int64_t, uint32_t, 0xff, 5)	\
-  T (int64_t, uint32_t, 0xffff, 5)
+  T (int64_t, uint16_t, 0xff, 50)	\
+  T (int64_t, uint32_t, 0xff, 50)	\
+  T (int64_t, uint32_t, 0xffff, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c b/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c
index 09eb4146816cc51af5829f15b6c287aca086c382..cd1fd2b8a078a3a6aa9741aea0d10a7a420e137c 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c
@@ -6,7 +6,7 @@ void loop (int * __restrict__ a, int * __restrict__ b, int * __restrict__ c,
 	   int * __restrict__ g, int * __restrict__ h)
 {
   int i = 0;
-  for (i = 0; i < 3; i++)
+  for (i = 0; i < 30; i++)
     {
       a[i] += i;
       b[i] += i;
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c b/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c
index ae9c338f5696a9f743696b58f3d8e1dd991de501..2ff969ced00955403eaa1aea7223e209fdb6f139 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c
@@ -11,20 +11,20 @@ void consumer (void *);
   {									\
     if (which)								\
       {									\
-	for (int i = 0; i < 7; ++i)					\
+	for (int i = 0; i < 70; ++i)					\
 	  x1[i] += VAL;							\
 	consumer (x1);							\
-	for (int i = 0; i < 7; ++i)					\
+	for (int i = 0; i < 70; ++i)					\
 	  x2[i] -= VAL;							\
 	consumer (x2);							\
       }									\
     else								\
       {									\
-	for (int i = 0; i < 7; ++i)					\
+	for (int i = 0; i < 70; ++i)					\
 	  x3[i] &= VAL;							\
 	consumer (x3);							\
       }									\
-    for (int i = 0; i < 7; ++i)						\
+    for (int i = 0; i < 70; ++i)					\
       x4[i] |= VAL;							\
     consumer (x4);							\
   }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..952a4b1cd580c4eaed4c7c470cd1efba8e5e931d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv9-a -Ofast -fdump-tree-vect-details" } */
+
+void
+foo (char *restrict a, int *restrict b, int *restrict c, int n)
+{
+  for (int i = 0; i < 9; i++)
+    {
+      int res = c[i];
+      int t = b[i];
+      if (a[i] != 0)
+        res = t;
+      c[i] = res;
+    }
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..02d10de2a6215d38728a84123d038c2db605b5a0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv9-a -Ofast -fdump-tree-vect-details" } */
+
+void
+foo (char *restrict a, int *restrict b, int *restrict c, int n, int stride)
+{
+  if (stride <= 1)
+    return;
+
+  for (int i = 0; i < 9; i++)
+    {
+      int res = c[i];
+      int t = b[i*stride];
+      if (a[i] != 0)
+        res = t;
+      c[i] = res;
+    }
+}
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */



Reply via email to