https://gcc.gnu.org/g:facb92812a4ec5c60ef783db6d02c35fa6a21e16
commit r16-5372-gfacb92812a4ec5c60ef783db6d02c35fa6a21e16 Author: Richard Biener <[email protected]> Date: Tue Nov 18 09:59:54 2025 +0100 tree-optimization/122723 - masking of .COND_ADD reductions The following fixes loop masking of .COND_ADD reductions when we decide to reduce multiple lanes to one and thus go through vect_transform_reduction. The first issue is in vect_reduction_update_partial_vector_usage, which does not handle an incoming .COND_ADD well and fails to compute 'cond_fn' in this case, disabling masking. The second issue is that vect_transform_reduction does not implement the masked-but-not-mask-by-cond case for any .COND_* operation. The following should fix both. The testcases verify runtime behavior in vect.exp and vectorization support in the i386 target section for the combinations of -O3, -Ofast plus masked vs. non-masked epilogues. PR tree-optimization/122723 * tree-vect-loop.cc (vect_reduction_update_partial_vector_usage): Handle incoming .COND_* operation. (vect_transform_reduction): Likewise. Handle .COND_* operation when not using COND_EXPR masking in a masked loop. * gcc.dg/vect/vect-reduc-cond-add-1.c: New generic functional testcase. * gcc.target/i386/vect-epilogues-6.c: New testcase. * gcc.target/i386/vect-epilogues-7.c: Likewise. * gcc.target/i386/vect-epilogues-8.c: Likewise. * gcc.target/i386/vect-epilogues-9.c: Likewise. 
Diff: --- gcc/testsuite/gcc.dg/vect/vect-reduc-cond-add-1.c | 50 +++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/vect-epilogues-6.c | 21 ++++++++++ gcc/testsuite/gcc.target/i386/vect-epilogues-7.c | 21 ++++++++++ gcc/testsuite/gcc.target/i386/vect-epilogues-8.c | 21 ++++++++++ gcc/testsuite/gcc.target/i386/vect-epilogues-9.c | 21 ++++++++++ gcc/tree-vect-loop.cc | 35 +++++++++++----- 6 files changed, 159 insertions(+), 10 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-cond-add-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-cond-add-1.c new file mode 100644 index 000000000000..1e64df7f6d2b --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-cond-add-1.c @@ -0,0 +1,50 @@ +#include "tree-vect.h" + +char mask[128]; + +double __attribute__((noipa)) +foo (double *a, int n) +{ + double sum = 0.0; + for (int i = 0; i < n; ++i) + { + double val; + if (mask[i]) + val = a[i]; + else + val = -0.0; + sum = sum + val; + } + return sum; +} + +double a[128]; + +int main() +{ + check_vect (); + +#pragma GCC novector + for (int i = 0; i < 128; ++i) + { + a[i] = (i * 7) % 15; + mask[i] = (i + 1) & 4; + } + + double sum = foo (a, 87); + double sum2 = 0.0; +#pragma GCC novector + for (int i = 0; i < 87; ++i) + { + double val; + if (mask[i]) + val = a[i]; + else + val = -0.0; + sum2 = sum2 + val; + } + + if (sum != sum2) + __builtin_abort (); + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-6.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-6.c new file mode 100644 index 000000000000..8cd8740c6ecc --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-6.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mavx512vl -mavx512bw -mprefer-vector-width=512 --param vect-partial-vector-usage=0 -fdump-tree-vect-optimized" } */ + +double +foo (double *a, char *mask, int n) +{ + double sum = 0.0; + for (int i = 0; i < n; ++i) + { + double val; + if (mask[i]) + val = a[i]; + else + val = -0.0; + sum = sum + val; + } + return 
sum; +} + +/* { dg-final { scan-tree-dump "optimized: loop vectorized using 64 byte vectors" "vect" } } */ +/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using 32 byte vectors" "vect" } } */ diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-7.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-7.c new file mode 100644 index 000000000000..63c29895f9bb --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-7.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mavx512vl -mavx512bw -mprefer-vector-width=512 --param vect-partial-vector-usage=1 -fdump-tree-vect-optimized" } */ + +double +foo (double *a, char *mask, int n) +{ + double sum = 0.0; + for (int i = 0; i < n; ++i) + { + double val; + if (mask[i]) + val = a[i]; + else + val = -0.0; + sum = sum + val; + } + return sum; +} + +/* { dg-final { scan-tree-dump "optimized: loop vectorized using 64 byte vectors" "vect" } } */ +/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using masked 64 byte vectors" "vect" } } */ diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-8.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-8.c new file mode 100644 index 000000000000..ab5d4556ecb8 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-8.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -mavx512vl -mavx512bw -mprefer-vector-width=512 --param vect-partial-vector-usage=0 -fdump-tree-vect-optimized" } */ + +double +foo (double *a, char *mask, int n) +{ + double sum = 0.0; + for (int i = 0; i < n; ++i) + { + double val; + if (mask[i]) + val = a[i]; + else + val = -0.0; + sum = sum + val; + } + return sum; +} + +/* { dg-final { scan-tree-dump "optimized: loop vectorized using 64 byte vectors" "vect" } } */ +/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using 32 byte vectors" "vect" } } */ diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-9.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-9.c new 
file mode 100644 index 000000000000..72564a8a8824 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-9.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -mavx512vl -mavx512bw -mprefer-vector-width=512 --param vect-partial-vector-usage=1 -fdump-tree-vect-optimized" } */ + +double +foo (double *a, char *mask, int n) +{ + double sum = 0.0; + for (int i = 0; i < n; ++i) + { + double val; + if (mask[i]) + val = a[i]; + else + val = -0.0; + sum = sum + val; + } + return sum; +} + +/* { dg-final { scan-tree-dump "optimized: loop vectorized using 64 byte vectors" "vect" } } */ +/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using masked 64 byte vectors" "vect" } } */ diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index b11b4c168ab2..e013d4f98096 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -6647,7 +6647,10 @@ vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo, { enum vect_reduction_type reduc_type = VECT_REDUC_INFO_TYPE (reduc_info); internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info); - internal_fn cond_fn = get_conditional_internal_fn (code, type); + internal_fn cond_fn + = ((code.is_internal_fn () + && internal_fn_mask_index ((internal_fn)code) != -1) + ? (internal_fn)code : get_conditional_internal_fn (code, type)); if (reduc_type != FOLD_LEFT_REDUCTION && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in) @@ -7871,7 +7874,10 @@ vect_transform_reduction (loop_vec_info loop_vinfo, vec_num = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]); code_helper code = canonicalize_code (op.code, op.type); - internal_fn cond_fn = get_conditional_internal_fn (code, op.type); + internal_fn cond_fn + = ((code.is_internal_fn () + && internal_fn_mask_index ((internal_fn)code) != -1) + ? 
(internal_fn)code : get_conditional_internal_fn (code, op.type)); vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo); @@ -8119,17 +8125,26 @@ vect_transform_reduction (loop_vec_info loop_vinfo, yet. */ gcc_assert (!lane_reducing); - /* Make sure that the reduction accumulator is vop[0]. */ - if (reduc_index == 1) - { - gcc_assert (commutative_binary_op_p (code, op.type)); - std::swap (vop[0], vop[1]); - } tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, mask_index++); - gcall *call = gimple_build_call_internal (cond_fn, 4, mask, - vop[0], vop[1], vop[0]); + gcall *call; + if (code.is_internal_fn () && cond_fn_p) + { + gcc_assert (op.num_ops >= 3 + && internal_fn_mask_index (internal_fn (code)) == 0); + vop[2] = vec_oprnds[2][i]; + mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), + mask, vop[0], gsi); + call = gimple_build_call_internal (cond_fn, 4, mask, vop[1], + vop[2], vop[reduc_index]); + } + else + { + gcc_assert (code.is_tree_code ()); + call = gimple_build_call_internal (cond_fn, 4, mask, vop[0], + vop[1], vop[reduc_index]); + } new_temp = make_ssa_name (vec_dest, call); gimple_call_set_lhs (call, new_temp); gimple_call_set_nothrow (call, true);
