The following is a revision of the original patch: it now also constrains versioning for aliasing with the cheap cost model (formerly, -ftree-vect-loop-version only constrained versioning for alignment). It also changes the vectorizer-related gates to rely on global_options_set instead of playing magic games with flag values (which would not extend to vectorization being enabled by default at -O2 with a different default cost model). It also disables if-conversion at -O2, because the way if-conversion currently works changes non-vectorized code as well.
I've done some measurements with that (also with the previous patch, not quoted here), plus [3/n] that just changes -ftree-vectorize to be enabled at -O2. Thus, SPEC 2k6 CPU -O2 -fno-tree-vectorize vs. -O2 on x86_64 SandyBridge (but without any extra -m flag). Basically checking what differences we can expect in distribution builds. > du -k */exe/* 1212 400.perlbench/exe/perlbench_base.amd64-m64-gcc42-nn 1216 400.perlbench/exe/perlbench_peak.amd64-m64-gcc42-nn 76 401.bzip2/exe/bzip2_base.amd64-m64-gcc42-nn 80 401.bzip2/exe/bzip2_peak.amd64-m64-gcc42-nn 3632 403.gcc/exe/gcc_base.amd64-m64-gcc42-nn 3640 403.gcc/exe/gcc_peak.amd64-m64-gcc42-nn 48 410.bwaves/exe/bwaves_base.amd64-m64-gcc42-nn 60 410.bwaves/exe/bwaves_peak.amd64-m64-gcc42-nn 8952 416.gamess/exe/gamess_base.amd64-m64-gcc42-nn 9556 416.gamess/exe/gamess_peak.amd64-m64-gcc42-nn 28 429.mcf/exe/mcf_base.amd64-m64-gcc42-nn 28 429.mcf/exe/mcf_peak.amd64-m64-gcc42-nn 148 433.milc/exe/milc_base.amd64-m64-gcc42-nn 148 433.milc/exe/milc_peak.amd64-m64-gcc42-nn 280 434.zeusmp/exe/zeusmp_base.amd64-m64-gcc42-nn 348 434.zeusmp/exe/zeusmp_peak.amd64-m64-gcc42-nn 1096 435.gromacs/exe/gromacs_base.amd64-m64-gcc42-nn 1104 435.gromacs/exe/gromacs_peak.amd64-m64-gcc42-nn 804 436.cactusADM/exe/cactusADM_base.amd64-m64-gcc42-nn 932 436.cactusADM/exe/cactusADM_peak.amd64-m64-gcc42-nn 132 437.leslie3d/exe/leslie3d_base.amd64-m64-gcc42-nn 204 437.leslie3d/exe/leslie3d_peak.amd64-m64-gcc42-nn 340 444.namd/exe/namd_base.amd64-m64-gcc42-nn 348 444.namd/exe/namd_peak.amd64-m64-gcc42-nn 3956 445.gobmk/exe/gobmk_base.amd64-m64-gcc42-nn 3960 445.gobmk/exe/gobmk_peak.amd64-m64-gcc42-nn 4012 447.dealII/exe/dealII_base.amd64-m64-gcc42-nn 4136 447.dealII/exe/dealII_peak.amd64-m64-gcc42-nn 468 450.soplex/exe/soplex_base.amd64-m64-gcc42-nn 476 450.soplex/exe/soplex_peak.amd64-m64-gcc42-nn 1152 453.povray/exe/povray_base.amd64-m64-gcc42-nn 1156 453.povray/exe/povray_peak.amd64-m64-gcc42-nn 1796 
454.calculix/exe/calculix_base.amd64-m64-gcc42-nn 1824 454.calculix/exe/calculix_peak.amd64-m64-gcc42-nn 324 456.hmmer/exe/hmmer_base.amd64-m64-gcc42-nn 328 456.hmmer/exe/hmmer_peak.amd64-m64-gcc42-nn 160 458.sjeng/exe/sjeng_base.amd64-m64-gcc42-nn 164 458.sjeng/exe/sjeng_peak.amd64-m64-gcc42-nn 432 459.GemsFDTD/exe/GemsFDTD_base.amd64-m64-gcc42-nn 576 459.GemsFDTD/exe/GemsFDTD_peak.amd64-m64-gcc42-nn 68 462.libquantum/exe/libquantum_base.amd64-m64-gcc42-nn 68 462.libquantum/exe/libquantum_peak.amd64-m64-gcc42-nn 572 464.h264ref/exe/h264ref_base.amd64-m64-gcc42-nn 576 464.h264ref/exe/h264ref_peak.amd64-m64-gcc42-nn 4488 465.tonto/exe/tonto_base.amd64-m64-gcc42-nn 4580 465.tonto/exe/tonto_peak.amd64-m64-gcc42-nn 28 470.lbm/exe/lbm_base.amd64-m64-gcc42-nn 28 470.lbm/exe/lbm_peak.amd64-m64-gcc42-nn 784 471.omnetpp/exe/omnetpp_base.amd64-m64-gcc42-nn 784 471.omnetpp/exe/omnetpp_peak.amd64-m64-gcc42-nn 60 473.astar/exe/astar_base.amd64-m64-gcc42-nn 64 473.astar/exe/astar_peak.amd64-m64-gcc42-nn 4460 481.wrf/exe/wrf_base.amd64-m64-gcc42-nn 5332 481.wrf/exe/wrf_peak.amd64-m64-gcc42-nn 208 482.sphinx3/exe/sphinx_livepretend_base.amd64-m64-gcc42-nn 212 482.sphinx3/exe/sphinx_livepretend_peak.amd64-m64-gcc42-nn 5660 483.xalancbmk/exe/Xalan_base.amd64-m64-gcc42-nn 5668 483.xalancbmk/exe/Xalan_peak.amd64-m64-gcc42-nn 12 998.specrand/exe/specrand_base.amd64-m64-gcc42-nn 12 998.specrand/exe/specrand_peak.amd64-m64-gcc42-nn 12 999.specrand/exe/specrand_base.amd64-m64-gcc42-nn 12 999.specrand/exe/specrand_peak.amd64-m64-gcc42-nn (serial make) > grep 'Elapsed compile ' /abuild/rguenther/spec2k6/result/CPU2006.497.log Elapsed compile for '400.perlbench': 00:00:28 (28) Elapsed compile for '401.bzip2': 00:00:05 (5) Elapsed compile for '403.gcc': 00:01:07 (67) Elapsed compile for '429.mcf': 00:00:03 (3) Elapsed compile for '445.gobmk': 00:00:21 (21) Elapsed compile for '456.hmmer': 00:00:11 (11) Elapsed compile for '458.sjeng': 00:00:06 (6) Elapsed compile for '462.libquantum': 
00:00:05 (5) Elapsed compile for '464.h264ref': 00:00:15 (15) Elapsed compile for '471.omnetpp': 00:00:29 (29) Elapsed compile for '473.astar': 00:00:04 (4) Elapsed compile for '483.xalancbmk': 00:03:02 (182) Elapsed compile for '999.specrand': 00:00:03 (3) Elapsed compile for '410.bwaves': 00:00:04 (4) Elapsed compile for '416.gamess': 00:03:14 (194) Elapsed compile for '433.milc': 00:00:07 (7) Elapsed compile for '434.zeusmp': 00:00:11 (11) Elapsed compile for '435.gromacs': 00:00:25 (25) Elapsed compile for '436.cactusADM': 00:00:23 (23) Elapsed compile for '437.leslie3d': 00:00:06 (6) Elapsed compile for '444.namd': 00:00:10 (10) Elapsed compile for '447.dealII': 00:02:00 (120) Elapsed compile for '450.soplex': 00:00:24 (24) Elapsed compile for '453.povray': 00:00:28 (28) Elapsed compile for '454.calculix': 00:00:47 (47) Elapsed compile for '459.GemsFDTD': 00:00:15 (15) Elapsed compile for '465.tonto': 00:01:41 (101) Elapsed compile for '470.lbm': 00:00:03 (3) Elapsed compile for '481.wrf': 00:02:08 (128) Elapsed compile for '482.sphinx3': 00:00:08 (8) Elapsed compile for '998.specrand': 00:00:03 (3) Elapsed compile for '400.perlbench': 00:00:28 (28) Elapsed compile for '401.bzip2': 00:00:05 (5) Elapsed compile for '403.gcc': 00:01:07 (67) Elapsed compile for '429.mcf': 00:00:04 (4) Elapsed compile for '445.gobmk': 00:00:22 (22) Elapsed compile for '456.hmmer': 00:00:11 (11) Elapsed compile for '458.sjeng': 00:00:06 (6) Elapsed compile for '462.libquantum': 00:00:04 (4) Elapsed compile for '464.h264ref': 00:00:15 (15) Elapsed compile for '471.omnetpp': 00:00:30 (30) Elapsed compile for '473.astar': 00:00:04 (4) Elapsed compile for '483.xalancbmk': 00:03:02 (182) Elapsed compile for '999.specrand': 00:00:03 (3) Elapsed compile for '410.bwaves': 00:00:04 (4) Elapsed compile for '416.gamess': 00:03:31 (211) Elapsed compile for '433.milc': 00:00:07 (7) Elapsed compile for '434.zeusmp': 00:00:13 (13) Elapsed compile for '435.gromacs': 00:00:26 (26) Elapsed compile 
for '436.cactusADM': 00:00:27 (27) Elapsed compile for '437.leslie3d': 00:00:08 (8) Elapsed compile for '444.namd': 00:00:11 (11) Elapsed compile for '447.dealII': 00:02:03 (123) Elapsed compile for '450.soplex': 00:00:24 (24) Elapsed compile for '453.povray': 00:00:28 (28) Elapsed compile for '454.calculix': 00:00:48 (48) Elapsed compile for '459.GemsFDTD': 00:00:21 (21) Elapsed compile for '465.tonto': 00:01:42 (102) Elapsed compile for '470.lbm': 00:00:04 (4) Elapsed compile for '481.wrf': 00:02:31 (151) Elapsed compile for '482.sphinx3': 00:00:07 (7) Elapsed compile for '998.specrand': 00:00:03 (3) > grep VECTORIZED /abuild/rguenther/spec2k6/result/CPU2006.497.log | wc -l 9374 > grep 'VECTORIZED\|Compile for.*started' /abuild/rguenther/spec2k6/result/CPU2006.497.log | sed -e 's/^.*VECTORIZED.*$/VECTORIZED/' | uniq -c 1 Compile for '400.perlbench' started at: Wed May 15 17:22:50 2013 (1368631370) 24 VECTORIZED 1 Compile for '401.bzip2' started at: Wed May 15 17:23:18 2013 (1368631398) 19 VECTORIZED 1 Compile for '403.gcc' started at: Wed May 15 17:23:24 2013 (1368631404) 91 VECTORIZED 1 Compile for '429.mcf' started at: Wed May 15 17:24:31 2013 (1368631471) 1 Compile for '445.gobmk' started at: Wed May 15 17:24:35 2013 (1368631475) 44 VECTORIZED 1 Compile for '456.hmmer' started at: Wed May 15 17:24:57 2013 (1368631497) 40 VECTORIZED 1 Compile for '458.sjeng' started at: Wed May 15 17:25:08 2013 (1368631508) 5 VECTORIZED 1 Compile for '462.libquantum' started at: Wed May 15 17:25:14 2013 (1368631514) 2 VECTORIZED 1 Compile for '464.h264ref' started at: Wed May 15 17:25:19 2013 (1368631519) 97 VECTORIZED 1 Compile for '471.omnetpp' started at: Wed May 15 17:25:34 2013 (1368631534) 4 VECTORIZED 1 Compile for '473.astar' started at: Wed May 15 17:26:04 2013 (1368631564) 5 VECTORIZED 1 Compile for '483.xalancbmk' started at: Wed May 15 17:26:10 2013 (1368631570) 76 VECTORIZED 1 Compile for '999.specrand' started at: Wed May 15 17:29:12 2013 (1368631752) 1 Compile 
for '410.bwaves' started at: Wed May 15 17:29:16 2013 (1368631756) 7 VECTORIZED 1 Compile for '416.gamess' started at: Wed May 15 17:29:22 2013 (1368631762) 2764 VECTORIZED 1 Compile for '433.milc' started at: Wed May 15 17:32:53 2013 (1368631973) 9 VECTORIZED 1 Compile for '434.zeusmp' started at: Wed May 15 17:33:00 2013 (1368631980) 111 VECTORIZED 1 Compile for '435.gromacs' started at: Wed May 15 17:33:13 2013 (1368631993) 72 VECTORIZED 1 Compile for '436.cactusADM' started at: Wed May 15 17:33:39 2013 (1368632019) 243 VECTORIZED 1 Compile for '437.leslie3d' started at: Wed May 15 17:34:06 2013 (1368632046) 179 VECTORIZED 1 Compile for '444.namd' started at: Wed May 15 17:34:14 2013 (1368632054) 22 VECTORIZED 1 Compile for '447.dealII' started at: Wed May 15 17:34:26 2013 (1368632066) 2152 VECTORIZED 1 Compile for '450.soplex' started at: Wed May 15 17:36:30 2013 (1368632190) 25 VECTORIZED 1 Compile for '453.povray' started at: Wed May 15 17:36:54 2013 (1368632214) 43 VECTORIZED 1 Compile for '454.calculix' started at: Wed May 15 17:37:23 2013 (1368632243) 358 VECTORIZED 1 Compile for '459.GemsFDTD' started at: Wed May 15 17:38:11 2013 (1368632291) 312 VECTORIZED 1 Compile for '465.tonto' started at: Wed May 15 17:38:33 2013 (1368632313) 439 VECTORIZED 1 Compile for '470.lbm' started at: Wed May 15 17:40:15 2013 (1368632415) 1 Compile for '481.wrf' started at: Wed May 15 17:40:20 2013 (1368632420) 2210 VECTORIZED 1 Compile for '482.sphinx3' started at: Wed May 15 17:42:52 2013 (1368632572) 21 VECTORIZED 1 Compile for '998.specrand' started at: Wed May 15 17:43:00 2013 (1368632580) Estimated Estimated Base Base Base Peak Peak Peak Benchmarks Ref. Run Time Ratio Ref. 
Run Time Ratio -------------- ------ --------- --------- ------ --------- --------- 400.perlbench 9770 310 31.5 * 9770 308 31.7 * 401.bzip2 9650 460 21.0 * 9650 459 21.0 * 403.gcc 8050 304 26.4 * 8050 296 27.2 * 429.mcf 9120 235 38.7 * 9120 235 38.9 * 445.gobmk 10490 398 26.3 * 10490 395 26.6 * 456.hmmer 9330 390 23.9 * 9330 372 25.1 * 458.sjeng 12100 467 25.9 * 12100 462 26.2 * 462.libquantum 20720 376 55.1 * 20720 377 54.9 * 464.h264ref 22130 559 39.6 * 22130 557 39.7 * 471.omnetpp 6250 273 22.9 * 6250 267 23.4 * 473.astar 7020 391 18.0 * 7020 399 17.6 * 483.xalancbmk 6900 211 32.6 * 6900 216 32.0 * Est. SPECint_base2006 -- Est. SPECint2006 -- Estimated Estimated Base Base Base Peak Peak Peak Benchmarks Ref. Run Time Ratio Ref. Run Time Ratio -------------- ------ --------- --------- ------ --------- --------- 410.bwaves 13590 332 41.0 * 13590 331 41.0 * 416.gamess NR NR 433.milc 9180 487 18.9 * 9180 491 18.7 * 434.zeusmp 9100 461 19.7 * 9100 441 20.6 * 435.gromacs 7140 475 15.0 * 7140 475 15.0 * 436.cactusADM 11950 891 13.4 * 11950 579 20.7 * 437.leslie3d 9400 391 24.1 * 9400 320 29.4 * 444.namd 8020 393 20.4 * 8020 393 20.4 * 447.dealII 11440 310 36.9 * 11440 339 33.8 * 450.soplex 8340 212 39.3 * 8340 221 37.7 * 453.povray 5320 182 29.2 * 5320 182 29.2 * 454.calculix 8250 706 11.7 * 8250 741 11.1 * 459.GemsFDTD 10610 380 27.9 * 10610 366 29.0 * 465.tonto 9840 362 27.2 * 9840 361 27.3 * 470.lbm 13740 269 51.1 * 13740 269 51.2 * 481.wrf 11170 469 23.8 * 11170 335 33.3 * 482.sphinx3 19490 529 36.8 * 19490 531 36.7 * Est. SPECfp_base2006 -- Est. SPECfp2006 -- Compile-time correlates somewhat with the number of vectorized loops (up to 2 extra loops are created), bigger offenders would need to be investigated for inherent vectorizer slowness (I didn't see anything obvious there looking at polyhedron). 
Runtime effects are in the noise for SPEC INT; SPEC FP shows significant improvements for 436.cactusADM, 437.leslie3d and 481.wrf, and slowdowns for 447.dealII, 450.soplex and 454.calculix (observed slowdowns are bigger when you compare -O2 vs. -O2 -ftree-vectorize unpatched). Slowdowns hint at cost-model issues and/or cost-model checking overhead (I have patches and further ideas here). Bootstrapped / tested on x86_64-unknown-linux-gnu. Comments? Especially on the new -fvect-cost-model= interface (I can see enabling vectorization at -O2 by default is non-obvious)? I'm currently running another comparison only vectorizing loops where the cost model check can be performed at compile-time. Further restricting vectorization to loops where no prologue/epilogue loops are necessary would also be possible, but that leaves only trivial loops with constant bounds to be vectorized. Thanks, Richard. 2013-05-14 Richard Biener <rguent...@suse.de> common/ * config/i386/i386-common.c (ix86_option_init_struct): Do not enable OPT_fvect_cost_model. * common.opt (fvect-cost-model=): New option. (vect_cost_model): New enum and values. (fvect-cost-model): Alias to -fvect-cost-model=dynamic. (fno-vect-cost-model): Alias to -fvect-cost-model=unlimited. (ftree-vect-loop-version): Ignore. * opts.c (default_options_table): Do not set OPT_fvect_cost_model. (common_handle_option): Likewise. * flag-types.h (enum vect_cost_model): New enum. * doc/invoke.texi (ftree-vect-loop-version): Remove. (fvect-cost-model): Adjust documentation. * targhooks.c (default_add_stmt_cost): Do not check flag_vect_cost_model. * tree-vectorizer.h (struct _loop_vec_info): Add cost model field. (struct _bb_vec_info): Likewise. (vectorizer_cost_model): Declare. * tree-vect-data-refs.c (vect_peeling_hash_insert): Check the loops cost-model flag. (vect_peeling_hash_choose_best_peeling): Likewise. (vect_enhance_data_refs_alignment): Likewise. Do not check flag_tree_vect_loop_version but check the cost model. 
(vect_mark_for_runtime_alias_test): Do not add runtime alias checks for the cheap cost model. * tree-vect-loop.c (vect_analyze_loop): Initialize the loops cost model flag. (vect_estimate_min_profitable_iters): Use the loops cost model flag. * tree-vect-slp.c (vect_slp_analyze_bb_1): Initialize and use the BBs cost model flag. * tree-vectorizer.c (gate_vect_slp): Enable SLP via the vectorizer only at -O3. (vectorizer_cost_model): Return the active cost model. * Makefile.in (tree-if-conv.o): Depend on $(TREE_VECTORIZER_H). * tree-if-conv.c: Include tree-vectorizer.h. (gate_tree_if_conversion): Enable if-conversion via the vectorizer only at -O3. Index: trunk/gcc/common.opt =================================================================== *** trunk.orig/gcc/common.opt 2013-05-15 13:21:54.000000000 +0200 --- trunk/gcc/common.opt 2013-05-15 14:43:01.046249528 +0200 *************** EnumValue *** 1304,1310 **** Enum(stack_reuse_level) String(none) Value(SR_NONE) ftree-loop-if-convert ! Common Report Var(flag_tree_loop_if_convert) Init(-1) Optimization Convert conditional jumps in innermost loops to branchless equivalents ftree-loop-if-convert-stores --- 1304,1310 ---- Enum(stack_reuse_level) String(none) Value(SR_NONE) ftree-loop-if-convert ! Common Report Var(flag_tree_loop_if_convert) Optimization Convert conditional jumps in innermost loops to branchless equivalents ftree-loop-if-convert-stores *************** Common RejectNegative Joined UInteger Va *** 2267,2282 **** -ftree-vectorizer-verbose=<number> This switch is deprecated. Use -fopt-info instead. ftree-slp-vectorize ! Common Report Var(flag_tree_slp_vectorize) Init(2) Optimization Enable basic block vectorization (SLP) on trees fvect-cost-model ! Common Report Var(flag_vect_cost_model) Optimization ! Enable use of cost model in vectorization ftree-vect-loop-version ! Common Report Var(flag_tree_vect_loop_version) Init(1) Optimization ! 
Enable loop versioning when doing loop vectorization on trees ftree-scev-cprop Common Report Var(flag_tree_scev_cprop) Init(1) Optimization --- 2267,2302 ---- -ftree-vectorizer-verbose=<number> This switch is deprecated. Use -fopt-info instead. ftree-slp-vectorize ! Common Report Var(flag_tree_slp_vectorize) Optimization Enable basic block vectorization (SLP) on trees + fvect-cost-model= + Common Joined RejectNegative Enum(vect_cost_model) Var(flag_vect_cost_model) Init(VECT_COST_MODEL_DEFAULT) + Specifies the cost model for vectorization + + Enum + Name(vect_cost_model) Type(enum vect_cost_model) UnknownError(unknown vectorizer cost model %qs) + + EnumValue + Enum(vect_cost_model) String(unlimited) Value(VECT_COST_MODEL_UNLIMITED) + + EnumValue + Enum(vect_cost_model) String(dynamic) Value(VECT_COST_MODEL_DYNAMIC) + + EnumValue + Enum(vect_cost_model) String(cheap) Value(VECT_COST_MODEL_CHEAP) + fvect-cost-model ! Common RejectNegative Alias(fvect-cost-model=,dynamic) ! Enables the dynamic vectorizer cost model. Preserved for backward compatibility. ! ! fno-vect-cost-model ! Common RejectNegative Alias(fvect-cost-model=,unlimited) ! Enables the unlimited vectorizer cost model. Preserved for backward compatibility. ftree-vect-loop-version ! Common Ignore ! Does nothing. Preserved for backward compatibility. 
ftree-scev-cprop Common Report Var(flag_tree_scev_cprop) Init(1) Optimization Index: trunk/gcc/opts.c =================================================================== *** trunk.orig/gcc/opts.c 2013-05-15 13:21:54.000000000 +0200 --- trunk/gcc/opts.c 2013-05-15 14:47:03.820005049 +0200 *************** static const struct default_options defa *** 498,504 **** { OPT_LEVELS_3_PLUS, OPT_funswitch_loops, NULL, 1 }, { OPT_LEVELS_3_PLUS, OPT_fgcse_after_reload, NULL, 1 }, { OPT_LEVELS_3_PLUS, OPT_ftree_vectorize, NULL, 1 }, - { OPT_LEVELS_3_PLUS, OPT_fvect_cost_model, NULL, 1 }, { OPT_LEVELS_3_PLUS, OPT_fipa_cp_clone, NULL, 1 }, { OPT_LEVELS_3_PLUS, OPT_ftree_partial_pre, NULL, 1 }, --- 498,503 ---- *************** common_handle_option (struct gcc_options *** 1597,1604 **** opts->x_flag_gcse_after_reload = value; if (!opts_set->x_flag_tree_vectorize) opts->x_flag_tree_vectorize = value; - if (!opts_set->x_flag_vect_cost_model) - opts->x_flag_vect_cost_model = value; if (!opts_set->x_flag_tree_loop_distribute_patterns) opts->x_flag_tree_loop_distribute_patterns = value; break; --- 1596,1601 ---- Index: trunk/gcc/common/config/i386/i386-common.c =================================================================== *** trunk.orig/gcc/common/config/i386/i386-common.c 2013-05-15 13:21:54.000000000 +0200 --- trunk/gcc/common/config/i386/i386-common.c 2013-05-15 13:24:32.139871020 +0200 *************** ix86_option_init_struct (struct gcc_opti *** 729,735 **** opts->x_flag_pcc_struct_return = 2; opts->x_flag_asynchronous_unwind_tables = 2; - opts->x_flag_vect_cost_model = 1; } /* On the x86 -fsplit-stack and -fstack-protector both use the same --- 729,734 ---- Index: trunk/gcc/flag-types.h =================================================================== *** trunk.orig/gcc/flag-types.h 2013-05-15 13:21:54.000000000 +0200 --- trunk/gcc/flag-types.h 2013-05-15 13:24:32.139871020 +0200 *************** enum fp_contract_mode { *** 191,194 **** --- 191,202 ---- FP_CONTRACT_FAST = 2 
}; + /* Vectorizer cost-model. */ + enum vect_cost_model { + VECT_COST_MODEL_UNLIMITED = 0, + VECT_COST_MODEL_CHEAP = 1, + VECT_COST_MODEL_DYNAMIC = 2, + VECT_COST_MODEL_DEFAULT = 3 + }; + #endif /* ! GCC_FLAG_TYPES_H */ Index: trunk/gcc/targhooks.c =================================================================== *** trunk.orig/gcc/targhooks.c 2013-05-15 13:21:54.000000000 +0200 --- trunk/gcc/targhooks.c 2013-05-15 13:24:32.140871032 +0200 *************** default_add_stmt_cost (void *data, int c *** 1050,1070 **** { unsigned *cost = (unsigned *) data; unsigned retval = 0; ! if (flag_vect_cost_model) ! { ! tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE; ! int stmt_cost = default_builtin_vectorization_cost (kind, vectype, ! misalign); ! /* Statements in an inner loop relative to the loop being ! vectorized are weighted more heavily. The value here is ! arbitrary and could potentially be improved with analysis. */ ! if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info)) ! count *= 50; /* FIXME. */ ! ! retval = (unsigned) (count * stmt_cost); ! cost[where] += retval; ! } return retval; } --- 1050,1066 ---- { unsigned *cost = (unsigned *) data; unsigned retval = 0; + tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE; + int stmt_cost = default_builtin_vectorization_cost (kind, vectype, + misalign); + /* Statements in an inner loop relative to the loop being + vectorized are weighted more heavily. The value here is + arbitrary and could potentially be improved with analysis. */ + if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info)) + count *= 50; /* FIXME. */ ! retval = (unsigned) (count * stmt_cost); ! 
cost[where] += retval; return retval; } Index: trunk/gcc/tree-vect-data-refs.c =================================================================== *** trunk.orig/gcc/tree-vect-data-refs.c 2013-05-15 13:21:54.000000000 +0200 --- trunk/gcc/tree-vect-data-refs.c 2013-05-15 13:24:32.142871055 +0200 *************** vect_mark_for_runtime_alias_test (ddr_p *** 173,179 **** { struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); ! if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0) return false; if (dump_enabled_p ()) --- 173,180 ---- { struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); ! if (loop_vinfo->cost_model == VECT_COST_MODEL_CHEAP ! || (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0) return false; if (dump_enabled_p ()) *************** vect_peeling_hash_insert (loop_vec_info *** 1087,1093 **** *new_slot = slot; } ! if (!supportable_dr_alignment && !flag_vect_cost_model) slot->count += VECT_MAX_COST; } --- 1088,1095 ---- *new_slot = slot; } ! if (!supportable_dr_alignment ! && loop_vinfo->cost_model == VECT_COST_MODEL_UNLIMITED) slot->count += VECT_MAX_COST; } *************** vect_peeling_hash_choose_best_peeling (l *** 1197,1203 **** res.peel_info.dr = NULL; res.body_cost_vec = stmt_vector_for_cost(); ! if (flag_vect_cost_model) { res.inside_cost = INT_MAX; res.outside_cost = INT_MAX; --- 1199,1205 ---- res.peel_info.dr = NULL; res.body_cost_vec = stmt_vector_for_cost(); ! if (loop_vinfo->cost_model != VECT_COST_MODEL_UNLIMITED) { res.inside_cost = INT_MAX; res.outside_cost = INT_MAX; *************** vect_enhance_data_refs_alignment (loop_v *** 1426,1432 **** vectorization factor. We do this automtically for cost model, since we calculate cost for every peeling option. */ ! if (!flag_vect_cost_model) possible_npeel_number = vf /nelements; /* Handle the aligned case. We may decide to align some other --- 1428,1434 ---- vectorization factor. 
We do this automtically for cost model, since we calculate cost for every peeling option. */ ! if (loop_vinfo->cost_model == VECT_COST_MODEL_UNLIMITED) possible_npeel_number = vf /nelements; /* Handle the aligned case. We may decide to align some other *************** vect_enhance_data_refs_alignment (loop_v *** 1434,1440 **** if (DR_MISALIGNMENT (dr) == 0) { npeel_tmp = 0; ! if (!flag_vect_cost_model) possible_npeel_number++; } --- 1436,1442 ---- if (DR_MISALIGNMENT (dr) == 0) { npeel_tmp = 0; ! if (loop_vinfo->cost_model == VECT_COST_MODEL_UNLIMITED) possible_npeel_number++; } *************** vect_enhance_data_refs_alignment (loop_v *** 1743,1749 **** /* (2) Versioning to force alignment. */ /* Try versioning if: ! 1) flag_tree_vect_loop_version is TRUE 2) optimize loop for speed 3) there is at least one unsupported misaligned data ref with an unknown misalignment, and --- 1745,1751 ---- /* (2) Versioning to force alignment. */ /* Try versioning if: ! 1) cost model is not VECT_COST_MODEL_CHEAP 2) optimize loop for speed 3) there is at least one unsupported misaligned data ref with an unknown misalignment, and *************** vect_enhance_data_refs_alignment (loop_v *** 1751,1757 **** 5) the number of runtime alignment checks is within reason. */ do_versioning = ! flag_tree_vect_loop_version && optimize_loop_nest_for_speed_p (loop) && (!loop->inner); /* FORNOW */ --- 1753,1759 ---- 5) the number of runtime alignment checks is within reason. */ do_versioning = ! 
loop_vinfo->cost_model != VECT_COST_MODEL_CHEAP && optimize_loop_nest_for_speed_p (loop) && (!loop->inner); /* FORNOW */ Index: trunk/gcc/tree-vect-loop.c =================================================================== *** trunk.orig/gcc/tree-vect-loop.c 2013-05-15 13:22:16.000000000 +0200 --- trunk/gcc/tree-vect-loop.c 2013-05-15 13:24:32.144871077 +0200 *************** vect_analyze_loop (struct loop *loop) *** 1761,1766 **** --- 1761,1768 ---- return NULL; } + loop_vinfo->cost_model = vectorizer_cost_model (); + if (vect_analyze_loop_2 (loop_vinfo)) { LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; *************** vect_estimate_min_profitable_iters (loop *** 2634,2640 **** void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); /* Cost model disabled. */ ! if (!flag_vect_cost_model) { dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled."); *ret_min_profitable_niters = 0; --- 2636,2642 ---- void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); /* Cost model disabled. */ ! if (loop_vinfo->cost_model == VECT_COST_MODEL_UNLIMITED) { dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled."); *ret_min_profitable_niters = 0; Index: trunk/gcc/tree-vect-slp.c =================================================================== *** trunk.orig/gcc/tree-vect-slp.c 2013-05-15 13:22:16.000000000 +0200 --- trunk/gcc/tree-vect-slp.c 2013-05-15 13:24:32.145871088 +0200 *************** vect_slp_analyze_bb_1 (basic_block bb) *** 1992,1997 **** --- 1992,2001 ---- if (!bb_vinfo) return NULL; + /* For BB vectorization it only matters whether the cost model is + enabled or disabled. */ + bb_vinfo->cost_model = vectorizer_cost_model (); + if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf)) { if (dump_enabled_p ()) *************** vect_slp_analyze_bb_1 (basic_block bb) *** 2093,2099 **** } /* Cost model: check if the vectorization is worthwhile. */ ! 
if (flag_vect_cost_model && !vect_bb_vectorization_profitable_p (bb_vinfo)) { if (dump_enabled_p ()) --- 2097,2103 ---- } /* Cost model: check if the vectorization is worthwhile. */ ! if (bb_vinfo->cost_model != VECT_COST_MODEL_UNLIMITED && !vect_bb_vectorization_profitable_p (bb_vinfo)) { if (dump_enabled_p ()) Index: trunk/gcc/tree-vectorizer.c =================================================================== *** trunk.orig/gcc/tree-vectorizer.c 2013-05-15 13:22:16.000000000 +0200 --- trunk/gcc/tree-vectorizer.c 2013-05-15 14:33:38.359862554 +0200 *************** LOC vect_location; *** 73,78 **** --- 73,93 ---- /* Vector mapping GIMPLE stmt to stmt_vec_info. */ vec<vec_void_p> stmt_vec_info_vec; + /* Return the active vectorizer cost model. */ + + enum vect_cost_model + vectorizer_cost_model (void) + { + if (flag_vect_cost_model != VECT_COST_MODEL_DEFAULT) + return flag_vect_cost_model; + /* If -ftree-vectorize is specified explicitely or enabled by using -O3 + then use the dynamic model, otherwise the cheap one. */ + if (global_options_set.x_flag_tree_vectorize + || (flag_tree_vectorize != 0 && optimize == 3)) + return VECT_COST_MODEL_DYNAMIC; + else + return VECT_COST_MODEL_CHEAP; + } /* Function vectorize_loops. *************** execute_vect_slp (void) *** 191,200 **** static bool gate_vect_slp (void) { ! /* Apply SLP either if the vectorizer is on and the user didn't specify ! whether to run SLP or not, or if the SLP flag was set by the user. */ ! return ((flag_tree_vectorize != 0 && flag_tree_slp_vectorize != 0) ! || flag_tree_slp_vectorize == 1); } struct gimple_opt_pass pass_slp_vectorize = --- 206,220 ---- static bool gate_vect_slp (void) { ! /* Apply SLP either according to whether the user specified whether to ! run SLP or not, or according to whether the user specified whether ! to do vectorization or not. */ ! if (global_options_set.x_flag_tree_slp_vectorize) ! return flag_tree_slp_vectorize != 0; ! if (global_options_set.x_flag_tree_vectorize) ! 
return flag_tree_vectorize != 0; ! /* And if vectorization was enabled by default run SLP only at -O3. */ ! return flag_tree_vectorize != 0 && optimize == 3; } struct gimple_opt_pass pass_slp_vectorize = Index: trunk/gcc/tree-vectorizer.h =================================================================== *** trunk.orig/gcc/tree-vectorizer.h 2013-05-15 13:21:54.000000000 +0200 --- trunk/gcc/tree-vectorizer.h 2013-05-15 13:24:32.146871099 +0200 *************** typedef struct _loop_vec_info { *** 314,319 **** --- 314,322 ---- fix it up. */ bool operands_swapped; + /* The cost model to be used for this loop. */ + enum vect_cost_model cost_model; + } *loop_vec_info; /* Access Functions. */ *************** typedef struct _bb_vec_info { *** 391,396 **** --- 394,402 ---- /* Cost data used by the target cost model. */ void *target_cost_data; + /* The cost model to be used for this BB. */ + enum vect_cost_model cost_model; + } *bb_vec_info; #define BB_VINFO_BB(B) (B)->bb *************** void vect_pattern_recog (loop_vec_info, *** 1010,1014 **** --- 1016,1021 ---- /* In tree-vectorizer.c. */ unsigned vectorize_loops (void); + enum vect_cost_model vectorizer_cost_model (void); #endif /* GCC_TREE_VECTORIZER_H */ Index: trunk/gcc/doc/invoke.texi =================================================================== *** trunk.orig/gcc/doc/invoke.texi 2013-05-15 13:21:54.000000000 +0200 --- trunk/gcc/doc/invoke.texi 2013-05-15 13:24:32.149871132 +0200 *************** Objective-C and Objective-C++ Dialects}. *** 419,428 **** -ftree-parallelize-loops=@var{n} -ftree-pre -ftree-partial-pre -ftree-pta @gol -ftree-reassoc -ftree-sink -ftree-slsr -ftree-sra @gol -ftree-switch-conversion -ftree-tail-merge @gol ! -ftree-ter -ftree-vect-loop-version -ftree-vectorize -ftree-vrp @gol -funit-at-a-time -funroll-all-loops -funroll-loops @gol -funsafe-loop-optimizations -funsafe-math-optimizations -funswitch-loops @gol ! 
-fvariable-expansion-in-unroller -fvect-cost-model -fvpt -fweb @gol -fwhole-program -fwpa -fuse-ld=@var{linker} -fuse-linker-plugin @gol --param @var{name}=@var{value} -O -O0 -O1 -O2 -O3 -Os -Ofast -Og} --- 419,428 ---- -ftree-parallelize-loops=@var{n} -ftree-pre -ftree-partial-pre -ftree-pta @gol -ftree-reassoc -ftree-sink -ftree-slsr -ftree-sra @gol -ftree-switch-conversion -ftree-tail-merge @gol ! -ftree-ter -ftree-vectorize -ftree-vrp @gol -funit-at-a-time -funroll-all-loops -funroll-loops @gol -funsafe-loop-optimizations -funsafe-math-optimizations -funswitch-loops @gol ! -fvariable-expansion-in-unroller -fvect-cost-model=@var{model} -fvpt -fweb @gol -fwhole-program -fwpa -fuse-ld=@var{linker} -fuse-linker-plugin @gol --param @var{name}=@var{value} -O -O0 -O1 -O2 -O3 -Os -Ofast -Og} *************** Optimize yet more. @option{-O3} turns o *** 6649,6655 **** by @option{-O2} and also turns on the @option{-finline-functions}, @option{-funswitch-loops}, @option{-fpredictive-commoning}, @option{-fgcse-after-reload}, @option{-ftree-vectorize}, - @option{-fvect-cost-model}, @option{-ftree-partial-pre} and @option{-fipa-cp-clone} options. @item -O0 --- 6649,6654 ---- *************** optimizations designed to reduce code si *** 6666,6672 **** @option{-Os} disables the following optimization flags: @gccoptlist{-falign-functions -falign-jumps -falign-loops @gol -falign-labels -freorder-blocks -freorder-blocks-and-partition @gol ! -fprefetch-loop-arrays -ftree-vect-loop-version} @item -Ofast @opindex Ofast --- 6665,6671 ---- @option{-Os} disables the following optimization flags: @gccoptlist{-falign-functions -falign-jumps -falign-loops @gol -falign-labels -freorder-blocks -freorder-blocks-and-partition @gol ! -fprefetch-loop-arrays} @item -Ofast @opindex Ofast *************** Perform loop vectorization on trees. Thi *** 7907,7925 **** Perform basic block vectorization on trees. This flag is enabled by default at @option{-O3} and when @option{-ftree-vectorize} is enabled. 
! @item -ftree-vect-loop-version ! @opindex ftree-vect-loop-version ! Perform loop versioning when doing loop vectorization on trees. When a loop ! appears to be vectorizable except that data alignment or data dependence cannot ! be determined at compile time, then vectorized and non-vectorized versions of ! the loop are generated along with run-time checks for alignment or dependence ! to control which version is executed. This option is enabled by default ! except at level @option{-Os} where it is disabled. ! ! @item -fvect-cost-model @opindex fvect-cost-model ! Enable cost model for vectorization. This option is enabled by default at ! @option{-O3}. @item -ftree-vrp @opindex ftree-vrp --- 7906,7925 ---- Perform basic block vectorization on trees. This flag is enabled by default at @option{-O3} and when @option{-ftree-vectorize} is enabled. ! @item -fvect-cost-model=@var{model} @opindex fvect-cost-model ! Alter the cost model used for vectorization. The @var{model} argument ! should be one of @code{unlimited}, @code{dynamic} or @code{cheap}. ! With the @code{unlimited} model the vectorized code-path is assumed ! to be profitable while with the @code{dynamic} model a runtime check ! will guard the vectorized code-path to enable it only for iteration ! counts that will likely execute faster than when executing the original ! scalar loop. The @code{cheap} model will disable vectorization of ! loops where doing so would be cost prohibitive for example due to ! required runtime checks for data dependence or alignment but otherwise ! is equal to the @code{dynamic} model. ! The default cost model depends on other optimization flags and is ! either @code{dynamic} or @code{cheap}. @item -ftree-vrp @opindex ftree-vrp *************** constraints. The default value is 0. *** 9325,9337 **** @item vect-max-version-for-alignment-checks The maximum number of run-time checks that can be performed when ! doing loop versioning for alignment in the vectorizer. See option ! 
@option{-ftree-vect-loop-version} for more information. @item vect-max-version-for-alias-checks The maximum number of run-time checks that can be performed when ! doing loop versioning for alias in the vectorizer. See option ! @option{-ftree-vect-loop-version} for more information. @item max-iterations-to-track The maximum number of iterations of a loop the brute-force algorithm --- 9325,9335 ---- @item vect-max-version-for-alignment-checks The maximum number of run-time checks that can be performed when ! doing loop versioning for alignment in the vectorizer. @item vect-max-version-for-alias-checks The maximum number of run-time checks that can be performed when ! doing loop versioning for alias in the vectorizer. @item max-iterations-to-track The maximum number of iterations of a loop the brute-force algorithm Index: trunk/gcc/Makefile.in =================================================================== *** trunk.orig/gcc/Makefile.in 2013-05-15 13:21:54.000000000 +0200 --- trunk/gcc/Makefile.in 2013-05-15 13:24:32.150871143 +0200 *************** tree-nested.o: tree-nested.c $(CONFIG_H) *** 2428,2434 **** tree-if-conv.o: tree-if-conv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \ $(TREE_H) $(FLAGS_H) $(BASIC_BLOCK_H) $(TREE_FLOW_H) \ $(CFGLOOP_H) $(TREE_DATA_REF_H) $(TREE_PASS_H) $(DIAGNOSTIC_H) \ ! $(DBGCNT_H) $(GIMPLE_PRETTY_PRINT_H) tree-iterator.o : tree-iterator.c $(CONFIG_H) $(SYSTEM_H) $(TREE_H) \ coretypes.h $(GGC_H) tree-iterator.h $(GIMPLE_H) gt-tree-iterator.h tree-dfa.o : tree-dfa.c $(TREE_FLOW_H) $(CONFIG_H) $(SYSTEM_H) \ --- 2428,2434 ---- tree-if-conv.o: tree-if-conv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \ $(TREE_H) $(FLAGS_H) $(BASIC_BLOCK_H) $(TREE_FLOW_H) \ $(CFGLOOP_H) $(TREE_DATA_REF_H) $(TREE_PASS_H) $(DIAGNOSTIC_H) \ ! 
$(DBGCNT_H) $(GIMPLE_PRETTY_PRINT_H) $(TREE_VECTORIZER_H) tree-iterator.o : tree-iterator.c $(CONFIG_H) $(SYSTEM_H) $(TREE_H) \ coretypes.h $(GGC_H) tree-iterator.h $(GIMPLE_H) gt-tree-iterator.h tree-dfa.o : tree-dfa.c $(TREE_FLOW_H) $(CONFIG_H) $(SYSTEM_H) \ Index: trunk/gcc/tree-if-conv.c =================================================================== *** trunk.orig/gcc/tree-if-conv.c 2013-05-15 13:21:54.000000000 +0200 --- trunk/gcc/tree-if-conv.c 2013-05-15 14:42:47.816099456 +0200 *************** along with GCC; see the file COPYING3. *** 95,100 **** --- 95,101 ---- #include "tree-scalar-evolution.h" #include "tree-pass.h" #include "dbgcnt.h" + #include "tree-vectorizer.h" /* List of basic blocks in if-conversion-suitable order. */ static basic_block *ifc_bbs; *************** main_tree_if_conversion (void) *** 1848,1856 **** static bool gate_tree_if_conversion (void) { ! return ((flag_tree_vectorize && flag_tree_loop_if_convert != 0) ! || flag_tree_loop_if_convert == 1 ! || flag_tree_loop_if_convert_stores == 1); } struct gimple_opt_pass pass_if_conversion = --- 1849,1865 ---- static bool gate_tree_if_conversion (void) { ! /* If the option was explicitly specified enable the pass according ! to that. */ ! if (global_options_set.x_flag_tree_loop_if_convert ! || global_options_set.x_flag_tree_loop_if_convert_stores) ! return flag_tree_loop_if_convert || flag_tree_loop_if_convert_stores; ! /* Otherwise when vectorization was enabled/disabled explicitly, ! enable according to that. */ ! if (global_options_set.x_flag_tree_vectorize) ! return flag_tree_vectorize != 0; ! /* And if vectorization was enabled by default run only at -O3. */ ! return flag_tree_vectorize != 0 && optimize == 3; } struct gimple_opt_pass pass_if_conversion =