This fixes PR83887 where current GRAPHITE SESE region merging sometimes finds non-SESE regions as a result.
It re-implements SESE region merging by searching for the entry/exit with a worklist based algorithm seeded by the entry/exit block of the to be merged regions, walking predecessors and successors and using dominator information to see whether we deal with a (possible) entry or exit edge. The patch misses optimization in that known SESE regions can be skipped in the walk (and trivially known ones are single exit loops). While easily added during iteration I've yet think about a way to handle the case where the initial seeds are entries of such regions. I probably also should add some comments. Bootstrapped and tested on x86_64-unknown-linux-gnu, I've also built SPEC CPU 2006 with -floop-nest-optimize [-fgraphite-identity] with no new issues and 4 less optimized loop nests (maybe we did get away with some invalid SESE regions - I did not investigate yet). Comments welcome. Thanks, Richard. 2018-01-17 Richard Biener <rguent...@suse.de> PR tree-optimization/83887 * graphite-scop-detection.c (scop_detection::get_nearest_dom_with_single_entry): Remove. (scop_detection::get_nearest_pdom_with_single_exit): Likewise. (scop_detection::merge_sese): Re-implement with a flood-fill algorithm that properly finds a SESE region if it exists. * gcc.dg/graphite/pr83887.c: New testcase. * gfortran.dg/graphite/pr83887.f90: Likewise. * gfortran.dg/graphite/pr83887.f: Likewise. Index: gcc/graphite-scop-detection.c =================================================================== --- gcc/graphite-scop-detection.c (revision 256776) +++ gcc/graphite-scop-detection.c (working copy) @@ -309,16 +309,6 @@ public: sese_l get_sese (loop_p loop); - /* Return the closest dominator with a single entry edge. In case of a - back-loop the back-edge is not counted. */ - - static edge get_nearest_dom_with_single_entry (basic_block dom); - - /* Return the closest post-dominator with a single exit edge. In case of a - back-loop the back-edge is not counted. */ - - static edge get_nearest_pdom_with_single_exit (basic_block dom); - /* Merge scops at same loop depth and returns the new sese. Returns a new SESE when merge was successful, INVALID_SESE otherwise. */ @@ -441,85 +431,6 @@ scop_detection::get_sese (loop_p loop) return sese_l (scop_begin, scop_end); } -/* Return the closest dominator with a single entry edge. */ - -edge -scop_detection::get_nearest_dom_with_single_entry (basic_block dom) -{ - if (!dom->preds) - return NULL; - - /* If any of the dominators has two predecessors but one of them is a back - edge, then that basic block also qualifies as a dominator with single - entry. */ - if (dom->preds->length () == 2) - { - /* If e1->src dominates e2->src then e1->src will also dominate dom. */ - edge e1 = (*dom->preds)[0]; - edge e2 = (*dom->preds)[1]; - loop_p l = dom->loop_father; - loop_p l1 = e1->src->loop_father; - loop_p l2 = e2->src->loop_father; - if (l != l1 && l == l2 - && dominated_by_p (CDI_DOMINATORS, e2->src, e1->src)) - return e1; - if (l != l2 && l == l1 - && dominated_by_p (CDI_DOMINATORS, e1->src, e2->src)) - return e2; - } - - while (dom->preds->length () != 1) - { - if (dom->preds->length () < 1) - return NULL; - dom = get_immediate_dominator (CDI_DOMINATORS, dom); - if (!dom->preds) - return NULL; - } - return (*dom->preds)[0]; -} - -/* Return the closest post-dominator with a single exit edge. In case of a - back-loop the back-edge is not counted. */ - -edge -scop_detection::get_nearest_pdom_with_single_exit (basic_block pdom) -{ - if (!pdom->succs) - return NULL; - - /* If any of the post-dominators has two successors but one of them is a back - edge, then that basic block also qualifies as a post-dominator with single - exit. */ - if (pdom->succs->length () == 2) - { - /* If e1->dest post-dominates e2->dest then e1->dest will also - post-dominate pdom. */ - edge e1 = (*pdom->succs)[0]; - edge e2 = (*pdom->succs)[1]; - loop_p l = pdom->loop_father; - loop_p l1 = e1->dest->loop_father; - loop_p l2 = e2->dest->loop_father; - if (l != l1 && l == l2 - && dominated_by_p (CDI_POST_DOMINATORS, e2->dest, e1->dest)) - return e1; - if (l != l2 && l == l1 - && dominated_by_p (CDI_POST_DOMINATORS, e1->dest, e2->dest)) - return e2; - } - - while (pdom->succs->length () != 1) - { - if (pdom->succs->length () < 1) - return NULL; - pdom = get_immediate_dominator (CDI_POST_DOMINATORS, pdom); - if (!pdom->succs) - return NULL; - } - - return (*pdom->succs)[0]; -} - /* Merge scops at same loop depth and returns the new sese. Returns a new SESE when merge was successful, INVALID_SESE otherwise. */ @@ -537,73 +448,78 @@ scop_detection::merge_sese (sese_l first dp << "[scop-detection] try merging sese s2: "; print_sese (dump_file, second)); - /* Assumption: Both the sese's should be at the same loop depth or one scop - should subsume the other like in case of nested loops. */ - - /* Find the common dominators for entry, - and common post-dominators for the exit. */ - basic_block dom = nearest_common_dominator (CDI_DOMINATORS, - get_entry_bb (first), - get_entry_bb (second)); - - edge entry = get_nearest_dom_with_single_entry (dom); - - if (!entry || (entry->flags & EDGE_IRREDUCIBLE_LOOP)) - return invalid_sese; - - basic_block pdom = nearest_common_dominator (CDI_POST_DOMINATORS, - get_exit_bb (first), - get_exit_bb (second)); - pdom = nearest_common_dominator (CDI_POST_DOMINATORS, dom, pdom); - - edge exit = get_nearest_pdom_with_single_exit (pdom); - - if (!exit || (exit->flags & EDGE_IRREDUCIBLE_LOOP)) - return invalid_sese; - - sese_l combined (entry, exit); - - DEBUG_PRINT (dp << "[scop-detection] checking combined sese: "; - print_sese (dump_file, combined)); + auto_bitmap worklist, in_sese_region; + bitmap_set_bit (worklist, get_entry_bb (first)->index); + bitmap_set_bit (worklist, get_exit_bb (first)->index); + bitmap_set_bit (worklist, get_entry_bb (second)->index); + bitmap_set_bit (worklist, get_exit_bb (second)->index); + edge entry = NULL, exit = NULL; + + /* We can optimize the case of adding a loop entry dest or exit + src to the worklist (for single-exit loops) by skipping + directly to the exit dest / entry src. in_sese_region + doesn't have to cover all blocks in the region but merely + its border it acts more like a visited bitmap. */ + do + { + int index = bitmap_first_set_bit (worklist); + bitmap_clear_bit (worklist, index); + basic_block bb = BASIC_BLOCK_FOR_FN (cfun, index); + edge_iterator ei; + edge e; - /* FIXME: We could iterate to find the dom which dominates pdom, and pdom - which post-dominates dom, until it stabilizes. Also, ENTRY->SRC and - EXIT->DEST should be in the same loop nest. */ - if (!dominated_by_p (CDI_DOMINATORS, pdom, dom) - || entry->src->loop_father != exit->dest->loop_father) - return invalid_sese; - - /* For now we just bail out when there is a loop exit in the region - that is not also the exit of the region. We could enlarge the - region to cover the loop that region exits to. See PR79977. */ - if (loop_outer (entry->src->loop_father)) - { - vec<edge> exits = get_loop_exit_edges (entry->src->loop_father); - for (unsigned i = 0; i < exits.length (); ++i) + /* With fake exit edges we can end up with no possible exit. */ + if (index == EXIT_BLOCK) { - if (exits[i] != exit - && bb_in_region (exits[i]->src, entry->dest, exit->src)) - { - DEBUG_PRINT (dp << "[scop-detection-fail] cannot merge seses.\n"); - exits.release (); - return invalid_sese; - } + DEBUG_PRINT (dp << "[scop-detection-fail] cannot merge seses.\n"); + return invalid_sese; } - exits.release (); + + bitmap_set_bit (in_sese_region, bb->index); + + basic_block dom = get_immediate_dominator (CDI_DOMINATORS, bb); + FOR_EACH_EDGE (e, ei, bb->preds) + if (e->src == dom + && (! entry + || dominated_by_p (CDI_DOMINATORS, entry->dest, bb))) + { + if (entry + && ! bitmap_bit_p (in_sese_region, entry->src->index)) + bitmap_set_bit (worklist, entry->src->index); + entry = e; + } + else if (! bitmap_bit_p (in_sese_region, e->src->index)) + bitmap_set_bit (worklist, e->src->index); + + basic_block pdom = get_immediate_dominator (CDI_POST_DOMINATORS, bb); + FOR_EACH_EDGE (e, ei, bb->succs) + if (e->dest == pdom + && (! exit + || dominated_by_p (CDI_POST_DOMINATORS, exit->src, bb))) + { + if (exit + && ! bitmap_bit_p (in_sese_region, exit->dest->index)) + bitmap_set_bit (worklist, exit->dest->index); + exit = e; + } + else if (! bitmap_bit_p (in_sese_region, e->dest->index)) + bitmap_set_bit (worklist, e->dest->index); } + while (! bitmap_empty_p (worklist)); - /* For now we just want to bail out when exit does not post-dominate entry. - TODO: We might just add a basic_block at the exit to make exit - post-dominate entry (the entire region). */ - if (!dominated_by_p (CDI_POST_DOMINATORS, get_entry_bb (combined), - get_exit_bb (combined)) - || !dominated_by_p (CDI_DOMINATORS, get_exit_bb (combined), - get_entry_bb (combined))) + /* Include the BB with the loop-closed SSA PHI nodes. + canonicalize_loop_closed_ssa makes sure that is in proper shape. */ + if (exit->dest != EXIT_BLOCK_PTR_FOR_FN (cfun) + && loop_exit_edge_p (exit->src->loop_father, exit)) { - DEBUG_PRINT (dp << "[scop-detection-fail] cannot merge seses.\n"); - return invalid_sese; + gcc_assert (single_pred_p (exit->dest) + && single_succ_p (exit->dest) + && sese_trivially_empty_bb_p (exit->dest)); + exit = single_succ_edge (exit->dest); } + sese_l combined (entry, exit); + DEBUG_PRINT (dp << "[merged-sese] s1: "; print_sese (dump_file, combined)); return combined; Index: gcc/testsuite/gcc.dg/graphite/pr83887.c =================================================================== --- gcc/testsuite/gcc.dg/graphite/pr83887.c (nonexistent) +++ gcc/testsuite/gcc.dg/graphite/pr83887.c (working copy) @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O -floop-nest-optimize -fno-tree-loop-im" } */ + +int z4, g7; + +void +x3 (int my) +{ + while (my < 2) + { + for (z4 = 0; z4 < 2; ++z4) + { + } + + if (my != 0) + for (g7 = 0; g7 < 2; ++g7) + { + } + + ++my; + } +} Index: gcc/testsuite/gfortran.dg/graphite/pr83887.f90 =================================================================== --- gcc/testsuite/gfortran.dg/graphite/pr83887.f90 (nonexistent) +++ gcc/testsuite/gfortran.dg/graphite/pr83887.f90 (working copy) @@ -0,0 +1,59 @@ +! { dg-do compile } +! { dg-options "-O -floop-nest-optimize" } + SUBROUTINE ZTRMM ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, & + B, LDB ) + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + INTEGER M, N, LDA, LDB + complex(kind((1.0d0,1.0d0))) ALPHA + complex(kind((1.0d0,1.0d0))) A( LDA, * ), B( LDB, * ) + EXTERNAL XERBLA + INTRINSIC CONJG, MAX + LOGICAL LSIDE, NOCONJ, NOUNIT, UPPER + INTEGER I, INFO, J, K, NROWA + complex(kind((1.0d0,1.0d0))) TEMP + complex(kind((1.0d0,1.0d0))) ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + complex(kind((1.0d0,1.0d0))) ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) + LSIDE = scan( SIDE , 'Ll' )>0 + IF( LSIDE )THEN + NROWA = M + ELSE + NROWA = N + END IF + NOCONJ = scan( TRANSA, 'Tt' )>0 + NOUNIT = scan( DIAG , 'Nn' )>0 + UPPER = scan( UPLO , 'Uu' )>0 + INFO = 0 + IF( N.EQ.0 ) & + RETURN + IF( ALPHA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + B( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + RETURN + END IF + DO 160, J = 1, N + DO 150, I = 1, M + TEMP = B( I, J ) + IF( NOCONJ )THEN + IF( NOUNIT ) & + TEMP = TEMP*A( I, I ) + DO 130, K = I + 1, M + TEMP = TEMP + A( K, I )*B( K, J ) + 130 CONTINUE + ELSE + IF( NOUNIT ) & + TEMP = TEMP*CONJG( A( I, I ) ) + DO 140, K = I + 1, M + TEMP = TEMP + CONJG( A( K, I ) )*B( K, J ) + 140 CONTINUE + END IF + B( I, J ) = ALPHA*TEMP + 150 CONTINUE + 160 CONTINUE + RETURN + END + Index: gcc/testsuite/gfortran.dg/graphite/pr83887.f =================================================================== --- gcc/testsuite/gfortran.dg/graphite/pr83887.f (nonexistent) +++ gcc/testsuite/gfortran.dg/graphite/pr83887.f (working copy) @@ -0,0 +1,23 @@ +! { dg-do compile } +! { dg-options "-O2 -floop-nest-optimize" } + SUBROUTINE STONG(IGAUSS) + DIMENSION EXX(6) + PARAMETER (MXSH=1000, MXGTOT=5000) + COMMON /NSHEL / EX(MXGTOT),CS(MXGTOT),NSHELL + 100 CONTINUE + NSHELL = NSHELL+1 + IF(NSHELL.GT.MXSH) THEN + RETURN + END IF + DO 320 I = 1,IGAUSS + K = K1+I-1 + EX(K) = EXX(I)*SCALE + 320 CONTINUE + IF(TNORM.GT.TOLNRM) THEN + STOP + END IF + DO 460 IG = K1,K2 + CS(IG) = FACS*CS(IG) + 460 CONTINUE + GO TO 100 + END