On 26/04/2022 15:43, Richard Sandiford wrote:
"Andre Vieira (lists)" <andre.simoesdiasvie...@arm.com> writes:
Hi,
This patch disables epilogue vectorization when we are peeling for
alignment in the prologue and we can't guarantee the main vectorized
loop is entered. This is to prevent executing vectorized code with an
unaligned access if the target has indicated it wants to peel for
alignment. We take this conservative approach as we currently do not
distinguish between peeling for alignment for correctness or for
performance.
A better codegen would be to make it skip to the scalar epilogue in case
the main loop isn't entered when alignment peeling is required. However,
that would require a more aggressive change to the codebase which we
chose to avoid at this point of development. We can revisit this option
during stage 1 if we choose to.
Bootstrapped on aarch64-none-linux and regression tested on
aarch64-none-elf.
gcc/ChangeLog:
PR tree-optimization/105219
* tree-vect-loop.cc (vect_epilogue_when_peeling_for_alignment): New
function.
(vect_analyze_loop): Use vect_epilogue_when_peeling_for_alignment
to determine
whether to vectorize epilogue.
* testsuite/gcc.target/aarch64/pr105219.c: New.
* testsuite/gcc.target/aarch64/pr105219-2.c: New.
* testsuite/gcc.target/aarch64/pr105219-3.c: New.
diff --git a/gcc/testsuite/gcc.target/aarch64/pr105219-2.c
b/gcc/testsuite/gcc.target/aarch64/pr105219-2.c
new file mode 100644
index
0000000000000000000000000000000000000000..c97d1dc100181b77af0766e08407e1e352f604fe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr105219-2.c
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -march=armv8.2-a -mtune=thunderx -fno-vect-cost-model" }
*/
+/* { dg-skip-if "incompatible options" { *-*-* } { "-march=*" } {
"-march=armv8.2-a" } } */
+/* { dg-skip-if "incompatible options" { *-*-* } { "-mtune=*" } {
"-mtune=thunderx" } } */
+/* { dg-skip-if "incompatible options" { *-*-* } { "-mcpu=*" } } */
I think this should be in gcc.dg/vect, with the options forced
for { target aarch64 }.
Are the skips necessary? It looks like the test should work correctly
for all options/targets.
The -mtune and -march I guess aren't necessary, but if I drop the -mcpu
skip-if I have to drop the -march option from dg-options as the use
provided -mcpu might conflict with the -march and the test will fail.
+/* PR 105219. */
+int data[128];
+
+void __attribute((noipa))
+foo (int *data, int n)
+{
+ for (int i = 0; i < n; ++i)
+ data[i] = i;
+}
+
+int main()
+{
+ for (int start = 0; start < 16; ++start)
+ for (int n = 1; n < 3*16; ++n)
+ {
+ __builtin_memset (data, 0, sizeof (data));
+ foo (&data[start], n);
+ for (int j = 0; j < n; ++j)
+ if (data[start + j] != j)
+ __builtin_abort ();
+ }
+ return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/pr105219-3.c
b/gcc/testsuite/gcc.target/aarch64/pr105219-3.c
new file mode 100644
index
0000000000000000000000000000000000000000..444352fc051b787369f6f1be6236d1ff0fc2d392
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr105219-3.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-skip-if "incompatible options" { *-*-* } { "-march=*" } {
"-march=armv8.2-a" } } */
+/* { dg-skip-if "incompatible options" { *-*-* } { "-mtune=*" } {
"-mtune=thunderx" } } */
+/* { dg-skip-if "incompatible options" { *-*-* } { "-mcpu=*" } } */
+/* { dg-options "-O3 -march=armv8.2-a -mtune=thunderx -fno-vect-cost-model
-fdump-tree-vect-all" } */
+/* PR 105219. */
+int data[128];
+
+void foo (void)
+{
+ for (int i = 0; i < 9; ++i)
+ data[i + 1] = i;
+}
+
+/* { dg-final { scan-tree-dump "EPILOGUE VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr105219.c
b/gcc/testsuite/gcc.target/aarch64/pr105219.c
new file mode 100644
index
0000000000000000000000000000000000000000..bbdefb549f6a4e803852f69d20ce1ef9152a526c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr105219.c
@@ -0,0 +1,28 @@
+/* { dg-do run { target aarch64_sve128_hw } } */
+/* { dg-skip-if "incompatible options" { *-*-* } { "-march=*" } {
"-march=armv8.2-a+sve" } } */
+/* { dg-skip-if "incompatible options" { *-*-* } { "-mtune=*" } {
"-mtune=thunderx" } } */
+/* { dg-skip-if "incompatible options" { *-*-* } { "-mcpu=*" } } */
+/* { dg-skip-if "incompatible options" { *-*-* } { "-msve-vector-bits=*" } {
"-msve-vector-bits=128" } } */
+/* { dg-options "-O3 -march=armv8.2-a+sve -msve-vector-bits=128
-mtune=thunderx" } */
Same here.
Here the reason is even stronger as if the user provides a different
-msve-vector-bits the test will fail at run-time too (given we are
requesting 128bit hardware).
Also these were the conditions required for this test to fail, I could
leave out this altogether ofc and only keep richi's test.
+/* PR 105219. */
+int a;
+char b[60];
+short c[18];
+short d[4][19];
+long long f;
+void e(int g, int h, short k[][19]) {
+ for (signed i = 0; i < 3; i += 2)
+ for (signed j = 1; j < h + 14; j++) {
+ b[i * 14 + j] = 1;
+ c[i + j] = k[2][j];
+ a = g ? k[i][j] : 0;
+ }
+}
+int main() {
+ e(9, 1, d);
+ for (long l = 0; l < 6; ++l)
+ for (long m = 0; m < 4; ++m)
+ f ^= b[l + m * 4];
+ if (f)
+ __builtin_abort ();
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index
d7bc34636bd52b2f67cdecd3dc16fcff684dba07..a23e6181dec8126bcb691ea9474095bf65483863
100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -2942,6 +2942,38 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared
*shared,
return opt_loop_vec_info::success (loop_vinfo);
}
+/* Function vect_epilogue_when_peeling_for_alignment
+
+ PR 105219: If we are peeling for alignment in the prologue then we do not
+ vectorize the epilogue unless we are certain we will enter the main
+ vectorized loop. This is to prevent entering the vectorized epilogue in
+ case there aren't enough iterations to enter the main loop.
+*/
+
+static bool
+vect_epilogue_when_peeling_for_alignment (loop_vec_info loop_vinfo)
+{
+ if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
+ return true;
+
+ int prologue_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+ if (prologue_peeling > 0 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+ {
+ poly_uint64 niters_for_main
+ = upper_bound (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+ LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo));
+ niters_for_main
+ = upper_bound (niters_for_main,
+ LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo));
+ niters_for_main += prologue_peeling;
+ if (maybe_le (LOOP_VINFO_INT_NITERS (loop_vinfo), niters_for_main))
+ return false;
+ }
+ else if (prologue_peeling < 0)
I was surprised that this tests < 0 rather than != 0. If that's the
right behaviour, could you add a comment saying why? I would have
expected:
prologue_peeling > 0 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
to be handled more conservatively than:
prologue_peeling > 0 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
Oopsie... You are right, if prologue_peeling > 0 and
!LOOP_VINFO_NITERS_KNOWN_P it should also return false. I'll make the
change.
LGTM otherwise, but Richard B should have the final say.
Thanks,
Richard
+ return false;
+ return true;
+}
+
/* Function vect_analyze_loop.
Apply a set of analyses on LOOP, and create a loop_vec_info struct
@@ -3151,7 +3183,8 @@ vect_analyze_loop (class loop *loop, vec_info_shared
*shared)
}
}
/* For now only allow one epilogue loop. */
- if (first_loop_vinfo->epilogue_vinfos.is_empty ())
+ if (first_loop_vinfo->epilogue_vinfos.is_empty ()
+ && vect_epilogue_when_peeling_for_alignment (first_loop_vinfo))
{
first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);