The following fixes loop masking of .COND_ADD reductions when
we decide to reduce multiple lanes to one and thus go through
vect_transform_reduction.  The first issue is in
vect_reduction_update_partial_vector_usage, which does not handle
an incoming .COND_ADD well and fails to compute 'cond_fn' in this
case, disabling masking.  The second issue is that
vect_transform_reduction does not implement the masked but
not mask-by-cond-expr case for any .COND_* operation.  The following
should fix both.
The testcases verify runtime behavior in vect.exp and vectorization
support in the i386 target section for the combinations of -O3 and
-Ofast with masked vs. non-masked epilogues.
Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.
PR tree-optimization/122723
* tree-vect-loop.cc (vect_reduction_update_partial_vector_usage):
Handle incoming .COND_* operation.
(vect_transform_reduction): Likewise. Handle .COND_*
operation when not using COND_EXPR masking in a masked loop.
* gcc.dg/vect/vect-reduc-cond-add-1.c: New generic functional
testcase.
* gcc.target/i386/vect-epilogues-6.c: New testcase.
* gcc.target/i386/vect-epilogues-7.c: Likewise.
* gcc.target/i386/vect-epilogues-8.c: Likewise.
* gcc.target/i386/vect-epilogues-9.c: Likewise.
---
.../gcc.dg/vect/vect-reduc-cond-add-1.c | 50 +++++++++++++++++++
.../gcc.target/i386/vect-epilogues-6.c | 21 ++++++++
.../gcc.target/i386/vect-epilogues-7.c | 21 ++++++++
.../gcc.target/i386/vect-epilogues-8.c | 21 ++++++++
.../gcc.target/i386/vect-epilogues-9.c | 21 ++++++++
gcc/tree-vect-loop.cc | 35 +++++++++----
6 files changed, 159 insertions(+), 10 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-cond-add-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/vect-epilogues-6.c
create mode 100644 gcc/testsuite/gcc.target/i386/vect-epilogues-7.c
create mode 100644 gcc/testsuite/gcc.target/i386/vect-epilogues-8.c
create mode 100644 gcc/testsuite/gcc.target/i386/vect-epilogues-9.c
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-cond-add-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-cond-add-1.c
new file mode 100644
index 00000000000..1e64df7f6d2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-cond-add-1.c
@@ -0,0 +1,50 @@
+#include "tree-vect.h"
+
+char mask[128];
+
+double __attribute__((noipa))
+foo (double *a, int n)
+{
+ double sum = 0.0;
+ for (int i = 0; i < n; ++i)
+ {
+ double val;
+ if (mask[i])
+ val = a[i];
+ else
+ val = -0.0;
+ sum = sum + val;
+ }
+ return sum;
+}
+
+double a[128];
+
+int main()
+{
+ check_vect ();
+
+#pragma GCC novector
+ for (int i = 0; i < 128; ++i)
+ {
+ a[i] = (i * 7) % 15;
+ mask[i] = (i + 1) & 4;
+ }
+
+ double sum = foo (a, 87);
+ double sum2 = 0.0;
+#pragma GCC novector
+ for (int i = 0; i < 87; ++i)
+ {
+ double val;
+ if (mask[i])
+ val = a[i];
+ else
+ val = -0.0;
+ sum2 = sum2 + val;
+ }
+
+ if (sum != sum2)
+ __builtin_abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-6.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-6.c
new file mode 100644
index 00000000000..8cd8740c6ec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-6.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512vl -mavx512bw -mprefer-vector-width=512 --param vect-partial-vector-usage=0 -fdump-tree-vect-optimized" } */
+
+double
+foo (double *a, char *mask, int n)
+{
+ double sum = 0.0;
+ for (int i = 0; i < n; ++i)
+ {
+ double val;
+ if (mask[i])
+ val = a[i];
+ else
+ val = -0.0;
+ sum = sum + val;
+ }
+ return sum;
+}
+
+/* { dg-final { scan-tree-dump "optimized: loop vectorized using 64 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using 32 byte vectors" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-7.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-7.c
new file mode 100644
index 00000000000..63c29895f9b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-7.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512vl -mavx512bw -mprefer-vector-width=512 --param vect-partial-vector-usage=1 -fdump-tree-vect-optimized" } */
+
+double
+foo (double *a, char *mask, int n)
+{
+ double sum = 0.0;
+ for (int i = 0; i < n; ++i)
+ {
+ double val;
+ if (mask[i])
+ val = a[i];
+ else
+ val = -0.0;
+ sum = sum + val;
+ }
+ return sum;
+}
+
+/* { dg-final { scan-tree-dump "optimized: loop vectorized using 64 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using masked 64 byte vectors" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-8.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-8.c
new file mode 100644
index 00000000000..ab5d4556ecb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-8.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mavx512vl -mavx512bw -mprefer-vector-width=512 --param vect-partial-vector-usage=0 -fdump-tree-vect-optimized" } */
+
+double
+foo (double *a, char *mask, int n)
+{
+ double sum = 0.0;
+ for (int i = 0; i < n; ++i)
+ {
+ double val;
+ if (mask[i])
+ val = a[i];
+ else
+ val = -0.0;
+ sum = sum + val;
+ }
+ return sum;
+}
+
+/* { dg-final { scan-tree-dump "optimized: loop vectorized using 64 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using 32 byte vectors" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-9.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-9.c
new file mode 100644
index 00000000000..72564a8a882
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-9.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mavx512vl -mavx512bw -mprefer-vector-width=512 --param vect-partial-vector-usage=1 -fdump-tree-vect-optimized" } */
+
+double
+foo (double *a, char *mask, int n)
+{
+ double sum = 0.0;
+ for (int i = 0; i < n; ++i)
+ {
+ double val;
+ if (mask[i])
+ val = a[i];
+ else
+ val = -0.0;
+ sum = sum + val;
+ }
+ return sum;
+}
+
+/* { dg-final { scan-tree-dump "optimized: loop vectorized using 64 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using masked 64 byte vectors" "vect" } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index b11b4c168ab..e013d4f9809 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -6647,7 +6647,10 @@ vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
{
enum vect_reduction_type reduc_type = VECT_REDUC_INFO_TYPE (reduc_info);
internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
- internal_fn cond_fn = get_conditional_internal_fn (code, type);
+ internal_fn cond_fn
+ = ((code.is_internal_fn ()
+ && internal_fn_mask_index ((internal_fn)code) != -1)
+ ? (internal_fn)code : get_conditional_internal_fn (code, type));
if (reduc_type != FOLD_LEFT_REDUCTION
&& !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
@@ -7871,7 +7874,10 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
vec_num = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
code_helper code = canonicalize_code (op.code, op.type);
- internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
+ internal_fn cond_fn
+ = ((code.is_internal_fn ()
+ && internal_fn_mask_index ((internal_fn)code) != -1)
+ ? (internal_fn)code : get_conditional_internal_fn (code, op.type));
vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
@@ -8119,17 +8125,26 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
yet. */
gcc_assert (!lane_reducing);
- /* Make sure that the reduction accumulator is vop[0]. */
- if (reduc_index == 1)
- {
- gcc_assert (commutative_binary_op_p (code, op.type));
- std::swap (vop[0], vop[1]);
- }
tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
vec_num, vectype_in,
mask_index++);
- gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
- vop[0], vop[1], vop[0]);
+ gcall *call;
+ if (code.is_internal_fn () && cond_fn_p)
+ {
+ gcc_assert (op.num_ops >= 3
+ && internal_fn_mask_index (internal_fn (code)) == 0);
+ vop[2] = vec_oprnds[2][i];
+ mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask),
+ mask, vop[0], gsi);
+ call = gimple_build_call_internal (cond_fn, 4, mask, vop[1],
+ vop[2], vop[reduc_index]);
+ }
+ else
+ {
+ gcc_assert (code.is_tree_code ());
+ call = gimple_build_call_internal (cond_fn, 4, mask, vop[0],
+ vop[1], vop[reduc_index]);
+ }
new_temp = make_ssa_name (vec_dest, call);
gimple_call_set_lhs (call, new_temp);
gimple_call_set_nothrow (call, true);
--
2.51.0