On Tue, 28 Nov 2017, Richard Biener wrote:
> 
> The following adds a new target hook, targetm.vectorize.split_reduction,
> which allows the target to specify a preferred mode to perform the
> final reduction on using either vector shifts or scalar extractions.
> Up to that mode the vector reduction result is reduced by combining
> lowparts and highparts recursively.  This avoids lane-crossing operations
> when doing AVX256 on Zen and Bulldozer and also speeds up things on
> Haswell (I verified ~20% speedup on Broadwell).
> 
> Thus the patch implements the target hook on x86 to _always_ prefer
> SSE modes for the final reduction.
> 
> For the testcase in the bugzilla
> 
> int sumint(const int arr[]) {
>     arr = __builtin_assume_aligned(arr, 64);
>     int sum=0;
>     for (int i=0 ; i<1024 ; i++)
>       sum+=arr[i];
>     return sum;
> }
> 
> this changes -O3 -mavx512f code from
> 
> sumint:
> .LFB0:
>         .cfi_startproc
>         vpxord  %zmm0, %zmm0, %zmm0
>         leaq    4096(%rdi), %rax
>         .p2align 4,,10
>         .p2align 3
> .L2:
>         vpaddd  (%rdi), %zmm0, %zmm0
>         addq    $64, %rdi
>         cmpq    %rdi, %rax
>         jne     .L2
>         vpxord  %zmm1, %zmm1, %zmm1
>         vshufi32x4      $78, %zmm1, %zmm0, %zmm2
>         vpaddd  %zmm2, %zmm0, %zmm0
>         vmovdqa64       .LC0(%rip), %zmm2
>         vpermi2d        %zmm1, %zmm0, %zmm2
>         vpaddd  %zmm2, %zmm0, %zmm0
>         vmovdqa64       .LC1(%rip), %zmm2
>         vpermi2d        %zmm1, %zmm0, %zmm2
>         vpaddd  %zmm2, %zmm0, %zmm0
>         vmovdqa64       .LC2(%rip), %zmm2
>         vpermi2d        %zmm1, %zmm0, %zmm2
>         vpaddd  %zmm2, %zmm0, %zmm0
>         vmovd   %xmm0, %eax
> 
> to
> 
> sumint:
> .LFB0:
>         .cfi_startproc
>         vpxord  %zmm0, %zmm0, %zmm0
>         leaq    4096(%rdi), %rax
>         .p2align 4,,10
>         .p2align 3
> .L2:
>         vpaddd  (%rdi), %zmm0, %zmm0
>         addq    $64, %rdi
>         cmpq    %rdi, %rax
>         jne     .L2
>         vextracti64x4   $0x1, %zmm0, %ymm1
>         vpaddd  %ymm0, %ymm1, %ymm1
>         vmovdqa %xmm1, %xmm0
>         vextracti128    $1, %ymm1, %xmm1
>         vpaddd  %xmm1, %xmm0, %xmm0
>         vpsrldq $8, %xmm0, %xmm1
>         vpaddd  %xmm1, %xmm0, %xmm0
>         vpsrldq $4, %xmm0, %xmm1
>         vpaddd  %xmm1, %xmm0, %xmm0
>         vmovd   %xmm0, %eax
> 
> and for -O3 -mavx2 from
> 
> sumint:
> .LFB0:
>         .cfi_startproc
>         vpxor   %xmm0, %xmm0, %xmm0
>         leaq    4096(%rdi), %rax
>         .p2align 4,,10
>         .p2align 3
> .L2:
>         vpaddd  (%rdi), %ymm0, %ymm0
>         addq    $32, %rdi
>         cmpq    %rdi, %rax
>         jne     .L2
>         vpxor   %xmm1, %xmm1, %xmm1
>         vperm2i128      $33, %ymm1, %ymm0, %ymm2
>         vpaddd  %ymm2, %ymm0, %ymm0
>         vperm2i128      $33, %ymm1, %ymm0, %ymm2
>         vpalignr        $8, %ymm0, %ymm2, %ymm2
>         vpaddd  %ymm2, %ymm0, %ymm0
>         vperm2i128      $33, %ymm1, %ymm0, %ymm1
>         vpalignr        $4, %ymm0, %ymm1, %ymm1
>         vpaddd  %ymm1, %ymm0, %ymm0
>         vmovd   %xmm0, %eax
> 
> to
> 
> sumint:
> .LFB0:
>         .cfi_startproc
>         vpxor   %xmm0, %xmm0, %xmm0
>         leaq    4096(%rdi), %rax
>         .p2align 4,,10
>         .p2align 3
> .L2:
>         vpaddd  (%rdi), %ymm0, %ymm0
>         addq    $32, %rdi
>         cmpq    %rdi, %rax
>         jne     .L2
>         vmovdqa %xmm0, %xmm1
>         vextracti128    $1, %ymm0, %xmm0
>         vpaddd  %xmm0, %xmm1, %xmm0
>         vpsrldq $8, %xmm0, %xmm1
>         vpaddd  %xmm1, %xmm0, %xmm0
>         vpsrldq $4, %xmm0, %xmm1
>         vpaddd  %xmm1, %xmm0, %xmm0
>         vmovd   %xmm0, %eax
>         vzeroupper
>         ret
> 
> which besides being faster is also smaller (fewer prefixes).
> 
> SPEC 2k6 results on Haswell (thus AVX2) are neutral.  As it merely
> affects reduction vectorization epilogues I didn't expect big effects,
> except for loops that do not run much (more likely with AVX512).
> 
> Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
> 
> Ok for trunk?
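As an illustration (not part of the quoted patch), the new -O3 -mavx512f
epilogue above corresponds roughly to the following hand-written
intrinsics; this is only a sketch of what the emitted instructions do,
the compiler does not go through such C code:

#include <immintrin.h>

static inline int
reduce_epi32 (__m512i acc)
{
  /* Add the high 256-bit half to the low half
     (vextracti64x4 + vpaddd).  */
  __m256i lo256 = _mm512_castsi512_si256 (acc);
  __m256i hi256 = _mm512_extracti64x4_epi64 (acc, 1);
  __m256i sum256 = _mm256_add_epi32 (lo256, hi256);
  /* Add the high 128-bit half to the low half
     (vextracti128 + vpaddd).  */
  __m128i lo128 = _mm256_castsi256_si128 (sum256);
  __m128i hi128 = _mm256_extracti128_si256 (sum256, 1);
  __m128i sum128 = _mm_add_epi32 (lo128, hi128);
  /* Finish with whole-vector shifts inside the SSE register
     (vpsrldq + vpaddd) and extract the scalar result (vmovd).  */
  sum128 = _mm_add_epi32 (sum128, _mm_srli_si128 (sum128, 8));
  sum128 = _mm_add_epi32 (sum128, _mm_srli_si128 (sum128, 4));
  return _mm_cvtsi128_si32 (sum128);
}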
Ping?

Richard.

> The PR mentions some more tricks to optimize the sequence but
> those look like backend only optimizations.
> 
> Thanks,
> Richard.
> 
> 2017-11-28  Richard Biener  <rguent...@suse.de>
> 
> 	PR tree-optimization/80846
> 	* target.def (split_reduction): New target hook.
> 	* targhooks.c (default_split_reduction): New function.
> 	* targhooks.h (default_split_reduction): Declare.
> 	* tree-vect-loop.c (vect_create_epilog_for_reduction): If the
> 	target requests first reduce vectors by combining low and high
> 	parts.
> 	* tree-vect-stmts.c (vect_gen_perm_mask_any): Adjust.
> 	(get_vectype_for_scalar_type_and_size): Export.
> 	* tree-vectorizer.h (get_vectype_for_scalar_type_and_size): Declare.
> 
> 	* doc/tm.texi.in (TARGET_VECTORIZE_SPLIT_REDUCTION): Document.
> 	* doc/tm.texi: Regenerate.
> 
> 	i386/
> 	* config/i386/i386.c (ix86_split_reduction): Implement
> 	TARGET_VECTORIZE_SPLIT_REDUCTION.
> 
> 	* gcc.target/i386/pr80846-1.c: New testcase.
> 	* gcc.target/i386/pr80846-2.c: Likewise.
> 
> Index: gcc/config/i386/i386.c
> ===================================================================
> --- gcc/config/i386/i386.c	(revision 255197)
> +++ gcc/config/i386/i386.c	(working copy)
> @@ -48864,6 +48864,36 @@ ix86_preferred_simd_mode (scalar_mode mo
>      }
>  }
>  
> +/* All CPUs prefer to avoid cross-lane operations so perform reductions
> +   upper against lower halves up to SSE reg size.  */
> +
> +static machine_mode
> +ix86_split_reduction (machine_mode mode)
> +{
> +  /* Reduce lowpart against highpart until we reach SSE reg width to
> +     avoid cross-lane operations.  */
> +  switch (mode)
> +    {
> +    case E_V16SImode:
> +    case E_V8SImode:
> +      return V4SImode;
> +    case E_V32HImode:
> +    case E_V16HImode:
> +      return V8HImode;
> +    case E_V64QImode:
> +    case E_V32QImode:
> +      return V16QImode;
> +    case E_V16SFmode:
> +    case E_V8SFmode:
> +      return V4SFmode;
> +    case E_V8DFmode:
> +    case E_V4DFmode:
> +      return V2DFmode;
> +    default:
> +      return mode;
> +    }
> +}
> +
>  /* If AVX is enabled then try vectorizing with both 256bit and 128bit
>     vectors.  If AVX512F is enabled then try vectorizing with 512bit,
>     256bit and 128bit vectors.  */
> @@ -50486,6 +50516,9 @@ ix86_run_selftests (void)
>  #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
>  #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
>    ix86_preferred_simd_mode
> +#undef TARGET_VECTORIZE_SPLIT_REDUCTION
> +#define TARGET_VECTORIZE_SPLIT_REDUCTION \
> +  ix86_split_reduction
>  #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
>  #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
>    ix86_autovectorize_vector_sizes
> Index: gcc/doc/tm.texi
> ===================================================================
> --- gcc/doc/tm.texi	(revision 255197)
> +++ gcc/doc/tm.texi	(working copy)
> @@ -5844,6 +5844,13 @@ equal to @code{word_mode}, because the v
>  transformations even in absence of specialized @acronym{SIMD} hardware.
>  @end deftypefn
>  
> +@deftypefn {Target Hook} machine_mode TARGET_VECTORIZE_SPLIT_REDUCTION (machine_mode)
> +This hook should return the preferred mode to split the final reduction
> +step on @var{mode} to.  The reduction is then carried out reducing upper
> +against lower halves of vectors recursively until the specified mode is
> +reached.  The default is @var{mode} which means no splitting.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} {unsigned int} TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES (void)
>  This hook should return a mask of sizes that should be iterated over
>  after trying to autovectorize using the vector size derived from the
> Index: gcc/doc/tm.texi.in
> ===================================================================
> --- gcc/doc/tm.texi.in	(revision 255197)
> +++ gcc/doc/tm.texi.in	(working copy)
> @@ -4091,6 +4091,8 @@ address; but often a machine-dependent
>  
>  @hook TARGET_VECTORIZE_PREFERRED_SIMD_MODE
>  
> +@hook TARGET_VECTORIZE_SPLIT_REDUCTION
> +
>  @hook TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
>  
>  @hook TARGET_VECTORIZE_GET_MASK_MODE
> Index: gcc/target.def
> ===================================================================
> --- gcc/target.def	(revision 255197)
> +++ gcc/target.def	(working copy)
> @@ -1875,6 +1875,17 @@ transformations even in absence of speci
>   (scalar_mode mode),
>   default_preferred_simd_mode)
>  
> +/* Returns the preferred mode for splitting SIMD reductions to.  */
> +DEFHOOK
> +(split_reduction,
> + "This hook should return the preferred mode to split the final reduction\n\
> +step on @var{mode} to.  The reduction is then carried out reducing upper\n\
> +against lower halves of vectors recursively until the specified mode is\n\
> +reached.  The default is @var{mode} which means no splitting.",
> + machine_mode,
> + (machine_mode),
> + default_split_reduction)
> +
>  /* Returns a mask of vector sizes to iterate over when auto-vectorizing
>     after processing the preferred one derived from preferred_simd_mode.  */
>  DEFHOOK
> Index: gcc/targhooks.c
> ===================================================================
> --- gcc/targhooks.c	(revision 255197)
> +++ gcc/targhooks.c	(working copy)
> @@ -1281,6 +1281,14 @@ default_preferred_simd_mode (scalar_mode
>    return word_mode;
>  }
>  
> +/* By default do not split reductions further.  */
> +
> +machine_mode
> +default_split_reduction (machine_mode mode)
> +{
> +  return mode;
> +}
> +
>  /* By default only the size derived from the preferred vector mode
>     is tried.  */
> 
> Index: gcc/targhooks.h
> ===================================================================
> --- gcc/targhooks.h	(revision 255197)
> +++ gcc/targhooks.h	(working copy)
> @@ -108,6 +108,7 @@ default_builtin_support_vector_misalignm
>                                                                const_tree,
>                                                                int, bool);
>  extern machine_mode default_preferred_simd_mode (scalar_mode mode);
> +extern machine_mode default_split_reduction (machine_mode);
>  extern unsigned int default_autovectorize_vector_sizes (void);
>  extern opt_machine_mode default_get_mask_mode (unsigned, unsigned);
>  extern void *default_init_cost (struct loop *);
> Index: gcc/testsuite/gcc.target/i386/pr80846-1.c
> ===================================================================
> --- gcc/testsuite/gcc.target/i386/pr80846-1.c	(nonexistent)
> +++ gcc/testsuite/gcc.target/i386/pr80846-1.c	(working copy)
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx512f" } */
> +
> +int sumint(const int arr[]) {
> +    arr = __builtin_assume_aligned(arr, 64);
> +    int sum=0;
> +    for (int i=0 ; i<1024 ; i++)
> +      sum+=arr[i];
> +    return sum;
> +}
> +
> +/* { dg-final { scan-assembler-times "vextracti" 2 } } */
> Index: gcc/testsuite/gcc.target/i386/pr80846-2.c
> ===================================================================
> --- gcc/testsuite/gcc.target/i386/pr80846-2.c	(nonexistent)
> +++ gcc/testsuite/gcc.target/i386/pr80846-2.c	(working copy)
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx2" } */
> +
> +int sumint(const int arr[]) {
> +    arr = __builtin_assume_aligned(arr, 64);
> +    int sum=0;
> +    for (int i=0 ; i<1024 ; i++)
> +      sum+=arr[i];
> +    return sum;
> +}
> +
> +/* { dg-final { scan-assembler-times "vextracti" 1 } } */
> Index: gcc/tree-vect-loop.c
> ===================================================================
> --- gcc/tree-vect-loop.c	(revision 255197)
> +++ gcc/tree-vect-loop.c	(working copy)
> @@ -4994,38 +4994,126 @@ vect_create_epilog_for_reduction (vec<tr
>      }
>    else
>      {
> -      bool reduce_with_shift = have_whole_vector_shift (mode);
> -      int element_bitsize = tree_to_uhwi (bitsize);
> -      int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
> +      bool reduce_with_shift;
>        tree vec_temp;
> -
> +
>        /* COND reductions all do the final reduction with MAX_EXPR.  */
>        if (code == COND_EXPR)
>          code = MAX_EXPR;
>  
> -      /* Regardless of whether we have a whole vector shift, if we're
> -         emulating the operation via tree-vect-generic, we don't want
> -         to use it.  Only the first round of the reduction is likely
> -         to still be profitable via emulation.  */
> -      /* ??? It might be better to emit a reduction tree code here, so that
> -         tree-vect-generic can expand the first round via bit tricks.  */
> -      if (!VECTOR_MODE_P (mode))
> -        reduce_with_shift = false;
> +      /* See if the target wants to do the final (shift) reduction
> +         in a vector mode of smaller size and first reduce upper/lower
> +         halves against each other.  */
> +      enum machine_mode mode1 = mode;
> +      tree vectype1 = vectype;
> +      unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
> +      unsigned sz1 = sz;
> +      if (!slp_reduc
> +          && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
> +        sz1 = GET_MODE_SIZE (mode1);
> +
> +      vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
> +      reduce_with_shift = have_whole_vector_shift (mode1);
> +      if (!VECTOR_MODE_P (mode1))
> +        reduce_with_shift = false;
>        else
> -        {
> -          optab optab = optab_for_tree_code (code, vectype, optab_default);
> -          if (optab_handler (optab, mode) == CODE_FOR_nothing)
> -            reduce_with_shift = false;
> -        }
> +        {
> +          optab optab = optab_for_tree_code (code, vectype1, optab_default);
> +          if (optab_handler (optab, mode1) == CODE_FOR_nothing)
> +            reduce_with_shift = false;
> +        }
> +
> +      /* First reduce the vector to the desired vector size we should
> +         do shift reduction on by combining upper and lower halves.  */
> +      new_temp = new_phi_result;
> +      while (sz > sz1)
> +        {
> +          gcc_assert (!slp_reduc);
> +          sz /= 2;
> +          vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
> +
> +          /* The target has to make sure we support lowpart/highpart
> +             extraction, either via direct vector extract or through
> +             an integer mode punning.  */
> +          tree dst1, dst2;
> +          if (convert_optab_handler (vec_extract_optab,
> +                                     TYPE_MODE (TREE_TYPE (new_temp)),
> +                                     TYPE_MODE (vectype1))
> +              != CODE_FOR_nothing)
> +            {
> +              /* Extract sub-vectors directly once vec_extract becomes
> +                 a conversion optab.  */
> +              dst1 = make_ssa_name (vectype1);
> +              epilog_stmt
> +                = gimple_build_assign (dst1, BIT_FIELD_REF,
> +                                       build3 (BIT_FIELD_REF, vectype1,
> +                                               new_temp, TYPE_SIZE (vectype1),
> +                                               bitsize_int (0)));
> +              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> +              dst2 = make_ssa_name (vectype1);
> +              epilog_stmt
> +                = gimple_build_assign (dst2, BIT_FIELD_REF,
> +                                       build3 (BIT_FIELD_REF, vectype1,
> +                                               new_temp, TYPE_SIZE (vectype1),
> +                                               bitsize_int (sz * BITS_PER_UNIT)));
> +              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> +            }
> +          else
> +            {
> +              /* Extract via punning to appropriately sized integer mode
> +                 vector.  */
> +              tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
> +                                                            1);
> +              tree etype = build_vector_type (eltype, 2);
> +              gcc_assert (convert_optab_handler (vec_extract_optab,
> +                                                 TYPE_MODE (etype),
> +                                                 TYPE_MODE (eltype))
> +                          != CODE_FOR_nothing);
> +              tree tem = make_ssa_name (etype);
> +              epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
> +                                                 build1 (VIEW_CONVERT_EXPR,
> +                                                         etype, new_temp));
> +              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> +              new_temp = tem;
> +              tem = make_ssa_name (eltype);
> +              epilog_stmt
> +                = gimple_build_assign (tem, BIT_FIELD_REF,
> +                                       build3 (BIT_FIELD_REF, eltype,
> +                                               new_temp, TYPE_SIZE (eltype),
> +                                               bitsize_int (0)));
> +              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> +              dst1 = make_ssa_name (vectype1);
> +              epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
> +                                                 build1 (VIEW_CONVERT_EXPR,
> +                                                         vectype1, tem));
> +              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> +              tem = make_ssa_name (eltype);
> +              epilog_stmt
> +                = gimple_build_assign (tem, BIT_FIELD_REF,
> +                                       build3 (BIT_FIELD_REF, eltype,
> +                                               new_temp, TYPE_SIZE (eltype),
> +                                               bitsize_int (sz * BITS_PER_UNIT)));
> +              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> +              dst2 = make_ssa_name (vectype1);
> +              epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
> +                                                 build1 (VIEW_CONVERT_EXPR,
> +                                                         vectype1, tem));
> +              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> +            }
> +
> +          new_temp = make_ssa_name (vectype1);
> +          epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
> +          gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> +        }
>  
>        if (reduce_with_shift && !slp_reduc)
>          {
> -          int nelements = vec_size_in_bits / element_bitsize;
> +          int nelements = TYPE_VECTOR_SUBPARTS (vectype1);
>            auto_vec_perm_indices sel (nelements);
>  
>            int elt_offset;
>  
> -          tree zero_vec = build_zero_cst (vectype);
> +          tree zero_vec = build_zero_cst (vectype1);
>            /* Case 2: Create:
>               for (offset = nelements/2; offset >= 1; offset/=2)
>                  {
> @@ -5039,15 +5127,15 @@ vect_create_epilog_for_reduction (vec<tr
>              dump_printf_loc (MSG_NOTE, vect_location,
>                               "Reduce using vector shifts\n");
>  
> -          vec_dest = vect_create_destination_var (scalar_dest, vectype);
> -          new_temp = new_phi_result;
> +          mode1 = TYPE_MODE (vectype1);
> +          vec_dest = vect_create_destination_var (scalar_dest, vectype1);
>            for (elt_offset = nelements / 2;
>                 elt_offset >= 1;
>                 elt_offset /= 2)
>              {
>                sel.truncate (0);
> -              calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
> -              tree mask = vect_gen_perm_mask_any (vectype, sel);
> +              calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
> +              tree mask = vect_gen_perm_mask_any (vectype1, sel);
>                epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
>                                                   new_temp, zero_vec, mask);
>                new_name = make_ssa_name (vec_dest, epilog_stmt);
> @@ -5092,7 +5180,8 @@ vect_create_epilog_for_reduction (vec<tr
>              dump_printf_loc (MSG_NOTE, vect_location,
>                               "Reduce using scalar code.\n");
>  
> -          vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
> +          int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
> +          int element_bitsize = tree_to_uhwi (bitsize);
>            FOR_EACH_VEC_ELT (new_phis, i, new_phi)
>              {
>                int bit_offset;
> Index: gcc/tree-vect-stmts.c
> ===================================================================
> --- gcc/tree-vect-stmts.c	(revision 255197)
> +++ gcc/tree-vect-stmts.c	(working copy)
> @@ -6514,7 +6514,7 @@ vect_gen_perm_mask_any (tree vectype, ve
>  
>    mask_elt_type = lang_hooks.types.type_for_mode
>      (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
> -  mask_type = get_vectype_for_scalar_type (mask_elt_type);
> +  mask_type = get_same_sized_vectype (mask_elt_type, vectype);
>  
>    auto_vec<tree, 32> mask_elts (nunits);
>    for (unsigned int i = 0; i < nunits; ++i)
> @@ -9065,7 +9065,7 @@ free_stmt_vec_info (gimple *stmt)
>     Returns the vector type corresponding to SCALAR_TYPE and SIZE as supported
>     by the target.  */
>  
> -static tree
> +tree
>  get_vectype_for_scalar_type_and_size (tree scalar_type, unsigned size)
>  {
>    tree orig_scalar_type = scalar_type;
> Index: gcc/tree-vectorizer.h
> ===================================================================
> --- gcc/tree-vectorizer.h	(revision 255197)
> +++ gcc/tree-vectorizer.h	(working copy)
> @@ -1151,6 +1151,7 @@ extern bool vect_can_advance_ivs_p (loop
>  /* In tree-vect-stmts.c.  */
>  extern unsigned int current_vector_size;
>  extern tree get_vectype_for_scalar_type (tree);
> +extern tree get_vectype_for_scalar_type_and_size (tree, unsigned);
>  extern tree get_mask_type_for_scalar_type (tree);
>  extern tree get_same_sized_vectype (tree, tree);
>  extern bool vect_is_simple_use (tree, vec_info *, gimple **,
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
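As a target-independent illustration of what the new code in
vect_create_epilog_for_reduction builds, the epilogue for a V8SI
accumulator with the hook returning V4SImode behaves like the sketch
below, written with GCC's generic vector extensions (the vectorizer
emits the corresponding GIMPLE directly; the function name here is made
up for the example):

typedef int v8si __attribute__ ((vector_size (32)));
typedef int v4si __attribute__ ((vector_size (16)));

static inline int
reduce_v8si (v8si acc)
{
  /* Step 1: combine the low and high V4SI halves of the V8SI
     accumulator (the BIT_FIELD_REF extracts built in the patch).  */
  v4si lo, hi;
  for (int i = 0; i < 4; ++i)
    {
      lo[i] = acc[i];
      hi[i] = acc[i + 4];
    }
  v4si v = lo + hi;
  /* Step 2: the usual whole-vector shift reduction, now done in the
     narrow mode (the VEC_PERM_EXPRs built from
     calc_vec_perm_mask_for_shift).  */
  const v4si zero = { 0, 0, 0, 0 };
  const v4si shift2 = { 2, 3, 4, 5 };  /* shift right by two elements */
  const v4si shift1 = { 1, 2, 3, 4 };  /* shift right by one element */
  v += __builtin_shuffle (v, zero, shift2);
  v += __builtin_shuffle (v, zero, shift1);
  /* Extract the scalar result.  */
  return v[0];
}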