This fixes double-counting of the reduction cost when the reduction is
vectorized through the regular vectorizable_* functions.

Bootstrapped / tested on x86_64-unknown-linux-gnu, pushed.

2021-01-11  Richard Biener  <rguent...@suse.de>

        PR tree-optimization/98526
        * tree-vect-loop.c (vect_model_reduction_cost): Remove costing
        of the actual reduction op for the regular case.
        (vectorizable_reduction): Cost the stmts
        vect_transform_reduction produces here.
---
 gcc/tree-vect-loop.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 965cc164f6e..acfd1952e3b 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -4452,8 +4452,8 @@ have_whole_vector_shift (machine_mode mode)
 /* Function vect_model_reduction_cost.
 
    Models cost for a reduction operation, including the vector ops
-   generated within the strip-mine loop, the initial definition before
-   the loop, and the epilogue code that must be generated.  */
+   generated within the strip-mine loop in some cases, the initial
+   definition before the loop, and the epilogue code that must be generated.  */
 
 static void
 vect_model_reduction_cost (loop_vec_info loop_vinfo,
@@ -4516,10 +4516,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
                                         scalar_to_vec, stmt_info, 0,
                                         vect_prologue);
-
-      /* Cost of reduction op inside loop.  */
-      inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
-                                     stmt_info, 0, vect_body);
     }
 
   /* Determine cost of epilogue code.
@@ -7268,6 +7264,15 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 
   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
                             reduction_type, ncopies, cost_vec);
+  /* Cost the reduction op inside the loop if transformed via
+     vect_transform_reduction.  Otherwise this is costed by the
+     separate vectorizable_* routines.  */
+  if (single_defuse_cycle
+      || code == DOT_PROD_EXPR
+      || code == WIDEN_SUM_EXPR
+      || code == SAD_EXPR)
+    record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
+
   if (dump_enabled_p ()
       && reduction_type == FOLD_LEFT_REDUCTION)
     dump_printf_loc (MSG_NOTE, vect_location,
-- 
2.26.2

Reply via email to