Hi, Currently, the tree unrolling pass (cunroll) does not allow any code size growth in -O2 mode. Code size growth is permitted only if -O3 or -funroll-loops/-fpeel-loops is used. I have created a patch to allow a partial code size increase in -O2 mode. With -funroll-loops, the maximum allowed code growth is 100 unrolled insns. For partial growth, I experimented with various values of code growth, and I have attached SPEC 2006 performance numbers for code growth from 20 to 100 insns in steps of 20.
For this patch, I have set the partial code growth in -O2 mode to 40 insns (tunable via a param), where we get performance improvements with minimal code size growth. The perf. data shows good improvements in a few benchmarks: h264, sjeng and bzip2 get >2% improvement. calculix shows a big regression (4.5% on westmere), which I am investigating along with the povray regression. I also ran experiments with -ftree-vectorize turned on with -O2, both in the baseline and with the partial unroll, to study the effect of unrolling on vectorization. Loop unrolling seems to benefit more benchmarks when vectorization is turned on. I have attached the patch and pdfs of the perf. data and code size growth. How to read the attached perf data: There are two data files. * spec_perf_O2_unroll.txt contains perf data using unrolling with various code size growth on O2. * spec_perf_O2_vectorize_unroll.txt contains perf data using unrolling with various code size growth on O2 + ftree-vectorize. Each file contains perf. improvements and code size growth data. Experiments were done on Ibis-sandybridge and Ikaria-westmere. Here is a sample from the file (All perf. numbers are in %): Unroll insns code growth 20 40 60 80 100 _____________________________________________________ spec/2006/fp/C++/444.namd -3.2 -0.13 -0.4 -0.57 -0.31 This data shows that namd regressed by 3.2% over baseline when code size growth was set to 20 insns and regressed by 0.57% over baseline when growth was 80 insns. Please let me know what you think. Thanks Sri
* tree-ssa-loop-ivcanon.c (unroll_level): New enum value UL_PARTIAL. (increase_code_size): New enum. (try_unroll_loop_completely): Check if max unrolled insns is less than the partial growth value when partial growth is set. (tree_unroll_loops_completely_1): Change type of may_increase_size. Set growth to partial when desired. (tree_unroll_loops_completely): Set code growth to partial in O2 mode. (tree_complete_unroll_inner): Rewrite code growth block to use enum. * params.def (PARAM_MAX_DEFAULT_UNROLL_INSNS): New param. Index: params.def =================================================================== --- params.def (revision 205058) +++ params.def (working copy) @@ -304,6 +304,11 @@ DEFPARAM(PARAM_MAX_COMPLETELY_PEELED_INSNS, "max-completely-peeled-insns", "The maximum number of insns of a completely peeled loop", 100, 0, 0) +/* The maximum number of insns in a peeled loop for default unrolling. */ +DEFPARAM(PARAM_MAX_DEFAULT_UNROLL_INSNS, + "max-default-unroll-insns", + "The maximum number of insns for the default tree unrolling", + 40, 0, 0) /* The maximum number of peelings of a single loop that is peeled completely. */ DEFPARAM(PARAM_MAX_COMPLETELY_PEEL_TIMES, "max-completely-peel-times", Index: tree-ssa-loop-ivcanon.c =================================================================== --- tree-ssa-loop-ivcanon.c (revision 205058) +++ tree-ssa-loop-ivcanon.c (working copy) @@ -71,9 +71,18 @@ enum unroll_level iteration. */ UL_NO_GROWTH, /* Only loops whose unrolling will not cause increase of code size. */ + UL_PARTIAL, /* All suitable loops whose unrolling will not + increase code size by more than 50% of UL_ALL. */ UL_ALL /* All suitable loops. */ }; +typedef enum _increase_code_size +{ + UNROLL_NO_INCREASE = 0, + UNROLL_PARTIAL_INCREASE = 1, + UNROLL_FULL_INCREASE = 2 +} increase_code_size; + /* Adds a canonical induction variable to LOOP iterating NITER times. EXIT is the exit edge whose condition is replaced. 
*/ @@ -651,6 +660,7 @@ try_unroll_loop_completely (struct loop *loop, location_t locus) { unsigned HOST_WIDE_INT n_unroll, ninsns, max_unroll, unr_insns; + unsigned HOST_WIDE_INT max_unroll_insns; gimple cond; struct loop_size size; bool n_unroll_found = false; @@ -696,6 +706,10 @@ try_unroll_loop_completely (struct loop *loop, return false; max_unroll = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES); + max_unroll_insns = (ul != UL_PARTIAL) ? + PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) : + PARAM_VALUE (PARAM_MAX_DEFAULT_UNROLL_INSNS); + if (n_unroll > max_unroll) return false; @@ -805,8 +819,7 @@ try_unroll_loop_completely (struct loop *loop, loop->num); return false; } - else if (unr_insns - > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS)) + else if (unr_insns > max_unroll_insns) { if (dump_file && (dump_flags & TDF_DETAILS)) fprintf (dump_file, "Not unrolling loop %d: " @@ -1100,7 +1113,8 @@ propagate_constants_for_unrolling (basic_block bb) loop we unrolled. */ static bool -tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer, +tree_unroll_loops_completely_1 (increase_code_size may_increase_size, + bool unroll_outer, vec<loop_p, va_heap>& father_stack, struct loop *loop) { @@ -1135,7 +1149,7 @@ static bool /* Unroll outermost loops only if asked to do so or they do not cause code growth. */ && (unroll_outer || loop_outer (loop_father))) - ul = UL_ALL; + ul = (may_increase_size == UNROLL_PARTIAL_INCREASE) ? UL_PARTIAL : UL_ALL; else ul = UL_NO_GROWTH; @@ -1163,7 +1177,8 @@ static bool size of the code does not increase. 
*/ unsigned int -tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer) +tree_unroll_loops_completely (increase_code_size may_increase_size, + bool unroll_outer) { stack_vec<loop_p, 16> father_stack; bool changed; @@ -1308,12 +1323,19 @@ make_pass_iv_canon (gcc::context *ctxt) static unsigned int tree_complete_unroll (void) { + increase_code_size code_size; + if (number_of_loops (cfun) <= 1) return 0; - return tree_unroll_loops_completely (flag_unroll_loops - || flag_peel_loops - || optimize >= 3, true); + if (flag_unroll_loops || flag_peel_loops || (optimize >= 3)) + code_size = UNROLL_FULL_INCREASE; + else if (optimize == 2) + code_size = UNROLL_PARTIAL_INCREASE; + else + code_size = UNROLL_NO_INCREASE; + + return tree_unroll_loops_completely (code_size, true); } static bool @@ -1366,13 +1388,20 @@ static unsigned int tree_complete_unroll_inner (void) { unsigned ret = 0; + increase_code_size code_size; loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS); if (number_of_loops (cfun) > 1) { scev_initialize (); - ret = tree_unroll_loops_completely (optimize >= 3, false); + + if (optimize >= 3) + code_size = UNROLL_FULL_INCREASE; + else + code_size = UNROLL_NO_INCREASE; + + ret = tree_unroll_loops_completely (code_size, false); free_numbers_of_iterations_estimates (); scev_finalize (); }
O2 mode, all numbers in %, positive is good and negative is bad. IBIS SANDYBRIDGE Unroll insns code growth 20 40 60 80 100 spec/2006/fp/C++/444.namd 0.18 0.26 0.04 0.44 0.31 spec/2006/fp/C++/450.soplex 0.57 0.29 -0.38 -0.62 -0.19 spec/2006/fp/C++/453.povray -0.52 -2.04 0.19 -0.5 1.52 spec/2006/fp/C-F/436.cactusADM -0.18 0.58 0.04 0.44 -0.4 spec/2006/fp/C-F/454.calculix -4.3 -2.49 -1.89 -1.13 -0.15 spec/2006/fp/C/433.milc -0.38 0.8 0.51 3.07 2.78 spec/2006/fp/C/470.lbm -0.25 0.46 0.46 0.18 -0.13 spec/2006/fp/C/482.sphinx3 -0.81 -1.02 -1.02 -0.74 -0.4 spec/2006/fp/F/410.bwaves -3.59 -0.21 0.21 -0.36 -0.32 spec/2006/fp/F/434.zeusmp -0.13 0.04 0.18 0.36 0.13 spec/2006/int/C++/473.astar -0.75 -0.47 -0.37 -0.14 -0.42 spec/2006/int/C/400.perlbench -0.52 -0.52 -0.2 0.87 -0.61 spec/2006/int/C/401.bzip2 1.13 2.44 1.57 1.18 1.13 spec/2006/int/C/403.gcc -0.31 0.15 0.03 0.4 -0.12 spec/2006/int/C/429.mcf -0.87 -0.07 -0.28 -0.05 -0.45 spec/2006/int/C/445.gobmk -0.3 0.42 1.63 0.42 1.21 spec/2006/int/C/456.hmmer 0.46 0 0.3 0.17 0.21 spec/2006/int/C/458.sjeng 0.68 2.04 0.83 1.4 1.77 spec/2006/int/C/462.libquantum 0.1 -0.15 0.94 -0.58 -0.52 spec/2006/int/C/464.h264ref -0.06 2.17 2.47 2.97 -0.41 geometric mean -0.5 0.13 0.26 0.38 0.24 IKARIA WESTMERE Unroll insns code growth 20 40 60 80 100 spec/2006/fp/C++/444.namd 0.13 0.19 0.13 0.19 0.19 spec/2006/fp/C++/450.soplex 1.35 0.96 -0.99 0.07 1.81 spec/2006/fp/C++/453.povray 0.36 -1 -1.18 -1.04 0.18 spec/2006/fp/C-F/436.cactusADM 0.64 0.18 0.37 0 0.37 spec/2006/fp/C-F/454.calculix -9.5 -4.5 -3.8 -2.6 -1.4 spec/2006/fp/C/433.milc 0.72 1.13 1.95 4.68 4.73 spec/2006/fp/C/470.lbm 0.06 0.09 0.03 0.23 0 spec/2006/fp/C/482.sphinx3 -1.5 -0.72 -0.07 -1.6 -1.5 spec/2006/fp/F/410.bwaves -0.59 -0.95 0 -0.59 -0.59 spec/2006/fp/F/434.zeusmp 0.4 0.13 0.07 -0.2 0.13 spec/2006/int/C++/473.astar -0.21 -0.28 -0.28 0 -1.25 spec/2006/int/C/400.perlbench -0.62 -0.12 0 0 0 spec/2006/int/C/401.bzip2 0.3 1.16 -0.67 -0.55 -0.73 spec/2006/int/C/403.gcc 0.9 -0.6 
0.15 -0.05 -0.4 spec/2006/int/C/429.mcf -1.02 0.39 0.2 -0.63 -0.33 spec/2006/int/C/445.gobmk -0.15 -0.35 0.75 0.7 1.04 spec/2006/int/C/456.hmmer 0.22 0.05 0.05 -0.05 0.33 spec/2006/int/C/458.sjeng 0.73 0.93 0.64 1.62 1.91 spec/2006/int/C/462.libquantum -0.47 -0.74 -1.08 -0.08 -0.26 spec/2006/int/C/464.h264ref 0.5 2.15 2.58 3.04 2.91 geometric mean -0.41 -0.1 -0.06 0.15 0.35 TEXT SIZE INCREASE Unroll insns code growth 20 40 60 80 100 spec/2006/fp/C++/444.namd 0 -0.01 -0.01 -0.02 -0.01 spec/2006/fp/C++/450.soplex 0 0 0 0 0 spec/2006/fp/C++/453.povray -0.14 -0.34 -0.42 -0.62 -0.73 spec/2006/fp/C-F/436.cactusADM -0.01 -0.06 -0.07 -0.09 -0.14 spec/2006/fp/C-F/454.calculix 1.89 1.54 1.33 1.09 0.88 spec/2006/fp/C/433.milc -0.15 -0.37 -0.56 -0.8 -0.83 spec/2006/fp/C/470.lbm 0 0 0 0 0 spec/2006/fp/C/482.sphinx3 0 -0.01 -0.01 -0.01 -0.01 spec/2006/fp/F/410.bwaves 0.14 0.12 0.12 0.12 0.12 spec/2006/fp/F/434.zeusmp 0.27 0.15 0.1 0.03 0 spec/2006/int/C++/473.astar 0 0 0 0 -0.1 spec/2006/int/C/400.perlbench -0.01 -0.04 -0.05 -0.05 -0.07 spec/2006/int/C/401.bzip2 -0.07 -0.16 -0.22 -0.22 -0.32 spec/2006/int/C/403.gcc -0.12 -0.27 -0.36 -0.43 -0.45 spec/2006/int/C/429.mcf 0 0 0 0 0 spec/2006/int/C/445.gobmk -0.1 -0.39 -0.78 -0.97 -1.23 spec/2006/int/C/456.hmmer -0.02 -0.14 -0.2 -0.31 -0.31 spec/2006/int/C/458.sjeng -0.06 -1.64 -1.79 -1.95 -2.07 spec/2006/int/C/462.libquantum -0.02 -0.03 -0.03 -0.03 -0.1 spec/2006/int/C/464.h264ref -0.38 -1.54 -3.16 -4.04 -5.3 total size 0.14% -0.07% -0.23% -0.35% -0.48% geo mean 0.06% -0.16% -0.31% -0.42% -0.54%
O2 + -ftree-vectorize, all numbers in %, positive is good and negative is bad. IBIS SANDYBRIDGE Unroll insns code growth 20 40 60 80 100 spec/2006/fp/C++/444.namd -3.2 -0.13 -0.4 -0.57 -0.31 spec/2006/fp/C++/450.soplex 0.43 1.76 -1.48 -0.14 -0.48 spec/2006/fp/C++/453.povray -2.33 0.36 -2.42 -0.55 -1.1 spec/2006/fp/C-F/436.cactusADM 0.73 -0.1 0.19 -0.67 -0.35 spec/2006/fp/C-F/454.calculix -5.24 -1.6 -0.91 -0.76 1.37 spec/2006/fp/C/433.milc 0.59 0.59 0.34 3.23 1.97 spec/2006/fp/C/470.lbm -1.2 0.36 0.23 0.03 -0.15 spec/2006/fp/C/482.sphinx3 -0.76 -0.24 -0.36 -0.17 -0.02 spec/2006/fp/F/410.bwaves 4.24 3.29 4.64 4.75 4.93 spec/2006/fp/F/434.zeusmp -0.6 0.43 -0.22 -0.34 -0.56 spec/2006/int/C++/473.astar -0.61 0.19 -1.12 -0.84 -2.38 spec/2006/int/C/400.perlbench -1.02 -2.37 -0.29 -0.2 1.11 spec/2006/int/C/401.bzip2 0.04 0.78 1.69 1.17 1.99 spec/2006/int/C/403.gcc -0.03 -0.37 -0.79 -0.43 -0.34 spec/2006/int/C/429.mcf -0.35 0.19 0 -0.66 0.14 spec/2006/int/C/445.gobmk 0.64 1.31 1.35 0.94 1.39 spec/2006/int/C/456.hmmer -0.32 0.36 0.44 0.32 0.56 spec/2006/int/C/458.sjeng 0.29 0.66 0.91 1.31 1.64 spec/2006/int/C/462.libquantum -2.08 -1.06 -0.9 -1.42 -1.18 spec/2006/int/C/464.h264ref 0.38 1.28 2.4 2.61 3.06 geometric mean -0.54 0.28 0.15 0.37 0.55 IKARIA WESTMERE Unroll insns code growth 20 40 60 80 100 spec/2006/fp/C++/444.namd -0.06 -0.06 -0.06 -0.13 0.06 spec/2006/fp/C++/450.soplex 1.49 1.03 0.85 2.49 0.14 spec/2006/fp/C++/453.povray -1.17 -0.54 -0.23 -0.09 -0.63 spec/2006/fp/C-F/436.cactusADM 0.58 0.45 -0.32 -0.19 0.06 spec/2006/fp/C-F/454.calculix -12.51 -4.9 -4.9 -4.4 -0.1 spec/2006/fp/C/433.milc 0.05 2.1 2.05 4.15 3.69 spec/2006/fp/C/470.lbm 0.15 0.06 0.67 -0.09 0.06 spec/2006/fp/C/482.sphinx3 1.03 0.93 0.3 0.53 -0.66 spec/2006/fp/F/410.bwaves 0.83 0.65 1.67 1.07 1.13 spec/2006/fp/F/434.zeusmp 0.07 -0.21 -0.14 0.21 0.28 spec/2006/int/C++/473.astar 0.42 -0.28 0.14 0.56 0.07 spec/2006/int/C/400.perlbench -0.08 -1.25 0.21 0.17 -0.33 spec/2006/int/C/401.bzip2 0.43 -1.4 1.83 
1.22 0.73 spec/2006/int/C/403.gcc -0.4 -0.25 0.9 -0.2 0.7 spec/2006/int/C/429.mcf 0.33 3.1 1.13 3.3 1.03 spec/2006/int/C/445.gobmk 0.25 0.2 1.09 0.74 1.04 spec/2006/int/C/456.hmmer 0.36 0.26 0.21 0.16 0.26 spec/2006/int/C/458.sjeng 0.14 2.01 1.44 1.92 2.25 spec/2006/int/C/462.libquantum -1.2 -0.08 -0.96 1.12 0.5 spec/2006/int/C/464.h264ref 0.16 1.89 2.18 2.63 2.15 geometric mean -0.5 0.17 0.39 0.74 0.62 Text Size Unroll insns code growth 20 40 60 80 100 spec/2006/fp/C++/444.namd 0 -0.11 -0.11 -0.12 -0.11 spec/2006/fp/C++/450.soplex -0.03 -0.04 -0.14 -0.14 -0.14 spec/2006/fp/C++/453.povray -0.13 -0.37 -0.47 -0.75 -0.9 spec/2006/fp/C-F/436.cactusADM -0.06 -0.53 -0.61 -0.72 -0.79 spec/2006/fp/C-F/454.calculix 2.18 1.56 1.3 1.01 0.77 spec/2006/fp/C/433.milc -0.13 -0.42 -0.55 -0.74 -0.82 spec/2006/fp/C/470.lbm 0 0 0 0 0 spec/2006/fp/C/482.sphinx3 -0.05 -0.1 -0.12 -0.15 -0.15 spec/2006/fp/F/410.bwaves 0.26 0.22 0.22 0.22 0.22 spec/2006/fp/F/434.zeusmp 0.51 0.33 0.28 0.09 0 spec/2006/int/C++/473.astar 0 -0.01 -0.03 -0.03 -0.2 spec/2006/int/C/400.perlbench -0.01 -0.05 -0.18 -0.27 -0.33 spec/2006/int/C/401.bzip2 -0.14 -0.38 -0.62 -0.62 -0.62 spec/2006/int/C/403.gcc -0.13 -0.3 -0.44 -0.55 -0.6 spec/2006/int/C/429.mcf 0 0 0 0 0 spec/2006/int/C/445.gobmk -0.14 -0.52 -0.98 -1.15 -1.45 spec/2006/int/C/456.hmmer -0.11 -0.24 -0.35 -0.51 -0.56 spec/2006/int/C/458.sjeng -0.01 -1.59 -1.74 -1.9 -2.02 spec/2006/int/C/462.libquantum -0.02 -0.04 -0.04 -0.04 -0.12 spec/2006/int/C/464.h264ref -0.35 -1.56 -3.03 -3.84 -4.78 total size 0.18% -0.13% -0.32% -0.48% -0.61% geo mean 0.08% -0.20% -0.38% -0.51% -0.63%