On Fri, 18 Oct 2019 at 14:36, Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> writes:
> > Hi,
> > The attached patch tries to fix PR91272.
> > Does it look OK?
> >
> > With the patch, I see the following failures for aarch64-sve.exp:
> > FAIL: gcc.target/aarch64/sve/clastb_1.c -march=armv8.2-a+sve
> >   scan-assembler \\tclastb\\tw[0-9]+, p[0-7], w[0-9]+, z[0-9]+\\.s
> > FAIL: gcc.target/aarch64/sve/clastb_2.c -march=armv8.2-a+sve
> >   scan-assembler \\tclastb\\tw[0-9]+, p[0-7]+, w[0-9]+, z[0-9]+\\.s
> > FAIL: gcc.target/aarch64/sve/clastb_3.c -march=armv8.2-a+sve
> >   scan-assembler \\tclastb\\tw[0-9]+, p[0-7]+, w[0-9]+, z[0-9]+\\.b
> > FAIL: gcc.target/aarch64/sve/clastb_5.c -march=armv8.2-a+sve
> >   scan-assembler \\tclastb\\tx[0-9]+, p[0-7], x[0-9]+, z[0-9]+\\.d
> >
> > For instance, in clastb_1.c, it now emits:
> >   clastb s1, p1, s1, z0.s
> > while using a fully predicated loop.
> > Should I adjust the tests?
>
> Yeah, that's an improvement really.
>
> > diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> > index acdd90784dc..2cad2cb94c8 100644
> > --- a/gcc/tree-vect-stmts.c
> > +++ b/gcc/tree-vect-stmts.c
> > @@ -10016,7 +10016,8 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
> >    /* See whether another part of the vectorized code applies a loop
> >       mask to the condition, or to its inverse.  */
> >
> > -  if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > +  if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> > +      && reduction_type != EXTRACT_LAST_REDUCTION)
> >      {
> >        scalar_cond_masked_key cond (cond_expr, ncopies);
> >        if (loop_vinfo->scalar_cond_masked_set.contains (cond))
>
> The context here is:
>
>       if (loop_vinfo->scalar_cond_masked_set.contains (cond))
>         {
>           vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
>           loop_mask = vect_get_loop_mask (gsi, masks, ncopies, vectype, j);
>         }
>       else
>         {
>           bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0));
>           cond.code = invert_tree_comparison (cond.code, honor_nans);
>           if (loop_vinfo->scalar_cond_masked_set.contains (cond))
>             {
>               vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
>               loop_mask = vect_get_loop_mask (gsi, masks, ncopies,
>                                               vectype, j);
>               cond_code = cond.code;
>               swap_cond_operands = true;
>             }
>         }
>
> Rather than have another instance of vect_get_loop_mask, I think
> it would be cleaner to use a nonnull "masks" to decide whether to
> apply the loop mask:
>
>   vec_loop_masks *masks = NULL;
>   if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>     {
>       if (reduction_type == EXTRACT_LAST_REDUCTION
>           || loop_vinfo->scalar_cond_masked_set.contains (cond))
>         masks = &LOOP_VINFO_MASKS (loop_vinfo);
>       else
>         {
>           ...
>         }
>     }
>
> Then:
>
> > @@ -10116,6 +10117,15 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
> >           vec_then_clause = vec_oprnds2[i];
> >           vec_else_clause = vec_oprnds3[i];
> >
> > +      if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> > +          && reduction_type == EXTRACT_LAST_REDUCTION)
> > +        {
> > +          vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
> > +          unsigned vec_num = vec_oprnds0.length ();
> > +          loop_mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
> > +                                          vectype, vec_num * j + i);
> > +        }
> > +
>
> ...do this vect_get_loop_mask under the condition of "if (masks)".
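[Illustrative sketch: with both suggestions applied, the per-copy mask
lookup in the loop body reduces to roughly the following; this mirrors
what the revised patch at the end of the thread implements.]

          /* "masks" is nonnull iff a loop mask should be applied, either
             because this is an EXTRACT_LAST_REDUCTION in a fully-masked
             loop or because another part of the vectorized code masks
             the same condition.  */
          if (masks)
            {
              unsigned vec_num = vec_oprnds0.length ();
              loop_mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
                                              vectype, vec_num * j + i);
            }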
> >        if (swap_cond_operands)
> >          std::swap (vec_then_clause, vec_else_clause);
> >
> > @@ -10180,7 +10190,7 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
> >              vec != { 0, ... } (masked in the MASK_LOAD,
> >              unmasked in the VEC_COND_EXPR).  */
> >
> > -      if (loop_mask)
> > +      if (loop_mask && reduction_type != EXTRACT_LAST_REDUCTION)
> >         {
> >           if (COMPARISON_CLASS_P (vec_compare))
> >             {
> > @@ -10220,6 +10230,16 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
> >               vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
> >               vec_compare = vec_compare_name;
> >             }
>
> The code above here:
>
>           if (!is_gimple_val (vec_compare))
>             {
>               tree vec_compare_name = make_ssa_name (vec_cmp_type);
>               gassign *new_stmt = gimple_build_assign (vec_compare_name,
>                                                        vec_compare);
>               vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
>               vec_compare = vec_compare_name;
>             }
>
> is doing something similar to the new COND_EXPR handling:
>
>           if (COMPARISON_CLASS_P (vec_compare))
>             {
>               tree tmp = make_ssa_name (vec_cmp_type);
>               tree op0 = TREE_OPERAND (vec_compare, 0);
>               tree op1 = TREE_OPERAND (vec_compare, 1);
>               gassign *g = gimple_build_assign (tmp,
>                                                 TREE_CODE (vec_compare),
>                                                 op0, op1);
>               vect_finish_stmt_generation (stmt_info, g, gsi);
>               vec_compare = tmp;
>             }
>
> There's then an added case:
>
>           if (must_invert_cmp_result)
>             {
>               tree vec_compare_name = make_ssa_name (vec_cmp_type);
>               gassign *new_stmt = gimple_build_assign (vec_compare_name,
>                                                        BIT_NOT_EXPR,
>                                                        vec_compare);
>               vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
>               vec_compare = vec_compare_name;
>             }
>
> that would be a safe no-op for the normal COND_EXPR path (because
> must_invert_cmp_result is always false then).  Then this:
>
> > +
> > +      if (loop_mask)
> > +        {
> > +          tree tmp = make_ssa_name (vec_cmp_type);
> > +          gassign *g = gimple_build_assign (tmp, BIT_AND_EXPR,
> > +                                            vec_compare, loop_mask);
> > +          vect_finish_stmt_generation (stmt_info, g, gsi);
> > +          vec_compare = tmp;
> > +        }
> > +
>
> is the equivalent of the above:
>
>               tree tmp2 = make_ssa_name (vec_cmp_type);
>               gassign *g = gimple_build_assign (tmp2, BIT_AND_EXPR,
>                                                 vec_compare, loop_mask);
>               vect_finish_stmt_generation (stmt_info, g, gsi);
>               vec_compare = tmp2;
>
> So with this patch, I think the EXTRACT_LAST_REDUCTION and the normal
> COND_EXPR paths should be able to share the mask generation code.
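[Illustrative sketch: assembled from the snippets quoted above, the
shared mask-generation sequence being described runs in this order;
this is for orientation only, not the literal committed code.]

          /* Materialise the comparison as a statement of its own.  */
          if (COMPARISON_CLASS_P (vec_compare))
            {
              tree tmp = make_ssa_name (vec_cmp_type);
              gassign *g
                = gimple_build_assign (tmp, TREE_CODE (vec_compare),
                                       TREE_OPERAND (vec_compare, 0),
                                       TREE_OPERAND (vec_compare, 1));
              vect_finish_stmt_generation (stmt_info, g, gsi);
              vec_compare = tmp;
            }
          /* Invert it if needed; a no-op on the normal COND_EXPR path,
             where must_invert_cmp_result is always false.  */
          if (must_invert_cmp_result)
            {
              tree inv = make_ssa_name (vec_cmp_type);
              gassign *g = gimple_build_assign (inv, BIT_NOT_EXPR,
                                                vec_compare);
              vect_finish_stmt_generation (stmt_info, g, gsi);
              vec_compare = inv;
            }
          /* Finally AND in the loop mask, if any.  */
          if (loop_mask)
            {
              tree masked = make_ssa_name (vec_cmp_type);
              gassign *g = gimple_build_assign (masked, BIT_AND_EXPR,
                                                vec_compare, loop_mask);
              vect_finish_stmt_generation (stmt_info, g, gsi);
              vec_compare = masked;
            }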
Hi Richard,
Thanks for the suggestions; does the attached patch look OK?
A quick comparison of aarch64-sve.exp shows no regressions;
bootstrap and regression testing are in progress.

Thanks,
Prathamesh

> Thanks,
> Richard
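[For context, the clastb tests touched below all vectorize a conditional
"record the last matching element" reduction. The following is a sketch
of that pattern, reconstructed from the test fragments visible in the
diff rather than copied from the exact test sources:]

#define N 32

/* Return the last a[i] that satisfies the condition.  With SVE this
   becomes a fully-masked loop whose final CLASTB extracts the last
   active element, now directly into an s/d register rather than via
   a w/x general register.  */
int
condition_reduction (int *a, int min_v)
{
  int last = 66;
  for (int i = 0; i < N; i++)
    if (a[i] < min_v)
      last = a[i];
  return last;
}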
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/clastb_1.c b/gcc/testsuite/gcc.target/aarch64/sve/clastb_1.c
index d4f9b0b6a94..d3ea52dea47 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/clastb_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/clastb_1.c
@@ -1,5 +1,5 @@
 /* { dg-do assemble { target aarch64_asm_sve_ok } } */
-/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */
 
 #define N 32
 
@@ -17,4 +17,5 @@ condition_reduction (int *a, int min_v)
   return last;
 }
 
-/* { dg-final { scan-assembler {\tclastb\tw[0-9]+, p[0-7], w[0-9]+, z[0-9]+\.s} } } */
+/* { dg-final { scan-tree-dump "using a fully-masked loop." "vect" } } */
+/* { dg-final { scan-assembler {\tclastb\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/clastb_2.c b/gcc/testsuite/gcc.target/aarch64/sve/clastb_2.c
index 2c49bd3b0f0..c222b707912 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/clastb_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/clastb_2.c
@@ -1,5 +1,5 @@
 /* { dg-do assemble { target aarch64_asm_sve_ok } } */
-/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */
 
 #include <stdint.h>
 
@@ -23,4 +23,5 @@ condition_reduction (TYPE *a, TYPE min_v)
   return last;
 }
 
-/* { dg-final { scan-assembler {\tclastb\tw[0-9]+, p[0-7]+, w[0-9]+, z[0-9]+\.s} } } */
+/* { dg-final { scan-tree-dump "using a fully-masked loop." "vect" } } */
+/* { dg-final { scan-assembler {\tclastb\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/clastb_3.c b/gcc/testsuite/gcc.target/aarch64/sve/clastb_3.c
index 35344f446c6..5aaa71f948d 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/clastb_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/clastb_3.c
@@ -1,8 +1,9 @@
 /* { dg-do assemble { target aarch64_asm_sve_ok } } */
-/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */
 
 #define TYPE uint8_t
 
 #include "clastb_2.c"
 
-/* { dg-final { scan-assembler {\tclastb\tw[0-9]+, p[0-7]+, w[0-9]+, z[0-9]+\.b} } } */
+/* { dg-final { scan-tree-dump "using a fully-masked loop." "vect" } } */
+/* { dg-final { scan-assembler {\tclastb\tb[0-9]+, p[0-7], b[0-9]+, z[0-9]+\.b} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/clastb_4.c b/gcc/testsuite/gcc.target/aarch64/sve/clastb_4.c
index ce58abd6161..b4db170ea06 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/clastb_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/clastb_4.c
@@ -1,8 +1,9 @@
 /* { dg-do assemble { target aarch64_asm_sve_ok } } */
-/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */
 
 #define TYPE int16_t
 
 #include "clastb_2.c"
 
+/* { dg-final { scan-tree-dump "using a fully-masked loop." "vect" } } */
"vect" } } */ /* { dg-final { scan-assembler {\tclastb\tw[0-9]+, p[0-7], w[0-9]+, z[0-9]+\.h} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/clastb_5.c b/gcc/testsuite/gcc.target/aarch64/sve/clastb_5.c index 2b9783d6627..28d40a01f93 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/clastb_5.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/clastb_5.c @@ -1,8 +1,9 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -ftree-vectorize --save-temps" } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */ #define TYPE uint64_t #include "clastb_2.c" -/* { dg-final { scan-assembler {\tclastb\tx[0-9]+, p[0-7], x[0-9]+, z[0-9]+\.d} } } */ +/* { dg-final { scan-tree-dump "using a fully-masked loop." "vect" } } */ +/* { dg-final { scan-assembler {\tclastb\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/clastb_6.c b/gcc/testsuite/gcc.target/aarch64/sve/clastb_6.c index c47d303f730..38632a21be1 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/clastb_6.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/clastb_6.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -ftree-vectorize --save-temps" } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */ #define N 32 @@ -21,4 +21,5 @@ condition_reduction (TYPE *a, TYPE min_v) return last; } +/* { dg-final { scan-tree-dump "using a fully-masked loop." "vect" } } */ /* { dg-final { scan-assembler {\tclastb\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/clastb_7.c b/gcc/testsuite/gcc.target/aarch64/sve/clastb_7.c index 3345f874a39..e5307d2edc8 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/clastb_7.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/clastb_7.c @@ -1,7 +1,8 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -ftree-vectorize --save-temps" } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */ #define TYPE double #include "clastb_6.c" +/* { dg-final { scan-tree-dump "using a fully-masked loop." "vect" } } */ /* { dg-final { scan-assembler {\tclastb\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/clastb_8.c b/gcc/testsuite/gcc.target/aarch64/sve/clastb_8.c index d86a428a7fa..583fc8d8d6d 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/clastb_8.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/clastb_8.c @@ -1,5 +1,5 @@ /* { dg-do assemble { target aarch64_asm_sve_ok } } */ -/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details -msve-vector-bits=256 --save-temps" } */ #include <stdint.h> @@ -19,6 +19,7 @@ TEST_TYPE (uint16_t); TEST_TYPE (uint32_t); TEST_TYPE (uint64_t); +/* { dg-final { scan-tree-dump-times "using a fully-masked loop." 
4 "vect" } } */ /* { dg-final { scan-assembler {\tclastb\t(b[0-9]+), p[0-7], \1, z[0-9]+\.b\n} } } */ /* { dg-final { scan-assembler {\tclastb\t(h[0-9]+), p[0-7], \1, z[0-9]+\.h\n} } } */ /* { dg-final { scan-assembler {\tclastb\t(s[0-9]+), p[0-7], \1, z[0-9]+\.s\n} } } */ diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index a70d52eb2ca..82814e2c2af 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -6428,6 +6428,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node, if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) { if (reduction_type != FOLD_LEFT_REDUCTION + && reduction_type != EXTRACT_LAST_REDUCTION && !mask_by_cond_expr && (cond_fn == IFN_LAST || !direct_internal_fn_supported_p (cond_fn, vectype_in, diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index acdd90784dc..dfd33b142ed 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -10016,25 +10016,26 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, /* See whether another part of the vectorized code applies a loop mask to the condition, or to its inverse. */ + vec_loop_masks *masks = NULL; if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) { - scalar_cond_masked_key cond (cond_expr, ncopies); - if (loop_vinfo->scalar_cond_masked_set.contains (cond)) - { - vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); - loop_mask = vect_get_loop_mask (gsi, masks, ncopies, vectype, j); - } + if (reduction_type == EXTRACT_LAST_REDUCTION) + masks = &LOOP_VINFO_MASKS (loop_vinfo); else { - bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0)); - cond.code = invert_tree_comparison (cond.code, honor_nans); + scalar_cond_masked_key cond (cond_expr, ncopies); if (loop_vinfo->scalar_cond_masked_set.contains (cond)) + masks = &LOOP_VINFO_MASKS (loop_vinfo); + else { - vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); - loop_mask = vect_get_loop_mask (gsi, masks, ncopies, - vectype, j); - cond_code = cond.code; - swap_cond_operands = true; + bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0)); + cond.code = invert_tree_comparison (cond.code, honor_nans); + if (loop_vinfo->scalar_cond_masked_set.contains (cond)) + { + masks = &LOOP_VINFO_MASKS (loop_vinfo); + cond_code = cond.code; + swap_cond_operands = true; + } } } } @@ -10116,6 +10117,13 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, vec_then_clause = vec_oprnds2[i]; vec_else_clause = vec_oprnds3[i]; + if (masks) + { + unsigned vec_num = vec_oprnds0.length (); + loop_mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies, + vectype, vec_num * j + i); + } + if (swap_cond_operands) std::swap (vec_then_clause, vec_else_clause); @@ -10194,23 +10202,6 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, vec_compare = tmp; } - tree tmp2 = make_ssa_name (vec_cmp_type); - gassign *g = gimple_build_assign (tmp2, BIT_AND_EXPR, - vec_compare, loop_mask); - vect_finish_stmt_generation (stmt_info, g, gsi); - vec_compare = tmp2; - } - - if (reduction_type == EXTRACT_LAST_REDUCTION) - { - if (!is_gimple_val (vec_compare)) - { - tree vec_compare_name = make_ssa_name (vec_cmp_type); - gassign *new_stmt = gimple_build_assign (vec_compare_name, - vec_compare); - vect_finish_stmt_generation (stmt_info, new_stmt, gsi); - vec_compare = vec_compare_name; - } if (must_invert_cmp_result) { tree vec_compare_name = make_ssa_name (vec_cmp_type); @@ -10220,6 +10211,16 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, 
              vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
              vec_compare = vec_compare_name;
            }
+
+          tree tmp2 = make_ssa_name (vec_cmp_type);
+          gassign *g = gimple_build_assign (tmp2, BIT_AND_EXPR,
+                                            vec_compare, loop_mask);
+          vect_finish_stmt_generation (stmt_info, g, gsi);
+          vec_compare = tmp2;
+        }
+
+      if (reduction_type == EXTRACT_LAST_REDUCTION)
+        {
          gcall *new_stmt = gimple_build_call_internal
            (IFN_FOLD_EXTRACT_LAST, 3, else_clause, vec_compare,
             vec_then_clause);
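[Illustrative sketch: with the patch, the EXTRACT_LAST_REDUCTION path in
a fully-masked loop ends up emitting GIMPLE along the following lines.
The SSA names are invented for illustration.]

  /* Comparison result ANDed with the loop mask (the new BIT_AND_EXPR).  */
  masked_cond_5 = vec_cond_3 & loop_mask_4;
  /* Predicated extract-last; on SVE this maps to CLASTB operating on
     SIMD&FP registers, matching the adjusted scan-assembler patterns
     in the tests above.  */
  last_6 = .FOLD_EXTRACT_LAST (last_1, masked_cond_5, vec_then_2);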