On Mon, Nov 11, 2013 at 4:18 PM, Jakub Jelinek <ja...@redhat.com> wrote:
> On Thu, Oct 10, 2013 at 08:40:05PM +0200, Jan Hubicka wrote:
>> --- config/i386/x86-tune.def	(revision 203387)
>> +++ config/i386/x86-tune.def	(working copy)
>
>> +/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if true, unaligned loads are
>> +   split.  */
>> +DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
>> +          ~(m_COREI7 | m_GENERIC))
>> +
>> +/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if true, unaligned loads are
>
> s/loads/stores/
>
>> +   split.  */
>> +DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_load_optimal",
>> +          ~(m_COREI7 | m_BDVER | m_GENERIC))
>
> s/load/store/
>
We should exclude m_COREI7_AVX since Ivy Bridge needs to split 32-byte
unaligned loads and stores.  Also, the comments should say "if false,
unaligned loads are split." instead of "if true, ...".

> Also, I wonder if we couldn't improve the generated code for
> -mavx2 -mtune=generic or -march=core-avx2 -mtune=generic etc.
> - m_GENERIC is included clearly because vmovup{s,d} was really bad
> on SandyBridge (am I right here?), but if the ISA includes AVX2, then
> the code will not run on that chip at all, so can't we override it?

Yes, we should do it, similar to

  if (TARGET_AVX
      || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
      || optimize_insn_for_size_p ())
    {
      /* We will eventually emit movups based on insn attributes.  */
      emit_insn (gen_sse2_loadupd (op0, op1));
      return;
    }

>> @@ -3946,10 +3933,10 @@ ix86_option_override_internal (bool main
>>    if (flag_expensive_optimizations
>>        && !(target_flags_explicit & MASK_VZEROUPPER))
>>      target_flags |= MASK_VZEROUPPER;
>> -  if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
>> +  if (!ix86_tune_features[X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL]
>
> Didn't you mean to use X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL here?
>
>>        && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
>>      target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
>> -  if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
>> +  if (!ix86_tune_features[X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL]
>
> And similarly for STORE here?
>
>>        && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
>>      target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
>>    /* Enable 128-bit AVX instruction generation
>
> 	Jakub

Something like this.

-- 
H.J.
---
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 430d562..b8cb871 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -3974,10 +3974,10 @@ ix86_option_override_internal (bool main_args_p,
   if (flag_expensive_optimizations
       && !(opts_set->x_target_flags & MASK_VZEROUPPER))
     opts->x_target_flags |= MASK_VZEROUPPER;
-  if (!ix86_tune_features[X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL]
+  if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
       && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
     opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
-  if (!ix86_tune_features[X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL]
+  if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
       && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
     opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
   /* Enable 128-bit AVX instruction generation
@@ -16576,7 +16576,7 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
 
   if (MEM_P (op1))
     {
-      if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
+      if (!TARGET_AVX2 && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
 	{
 	  rtx r = gen_reg_rtx (mode);
 	  m = adjust_address (op1, mode, 0);
@@ -16596,7 +16596,7 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
     }
   else if (MEM_P (op0))
     {
-      if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
+      if (!TARGET_AVX2 && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
 	{
 	  m = adjust_address (op0, mode, 0);
 	  emit_insn (extract (m, op1, const0_rtx));
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 1a85ce2..6c7063a 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -376,15 +376,15 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
 
 /* AVX instruction selection tuning (some of SSE flags affects AVX, too)     */
 /*****************************************************************************/
 
-/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if true, unaligned loads are
+/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
    split.  */
 DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
-          ~(m_COREI7 | m_GENERIC))
+          ~(m_COREI7 | m_COREI7_AVX | m_GENERIC))
 
-/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if true, unaligned loads are
+/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
    split.  */
-DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_load_optimal",
-          ~(m_COREI7 | m_BDVER | m_GENERIC))
+DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
+          ~(m_COREI7 | m_COREI7_AVX | m_BDVER | m_GENERIC))
 
 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
    the auto-vectorizer.  */
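
As a sanity check, a tiny illustrative testcase (not part of the patch;
the function name is made up):

  /* With -mavx -mtune=generic, the unaligned 256-bit load below is
     split into a 128-bit vmovups plus a vinsertf128.  With the patch
     above, -mavx2 keeps the single 32-byte vmovups, since any chip
     that needs the split can't execute AVX2 code anyway.  */
  #include <immintrin.h>

  __m256
  foo (float *p)
  {
    return _mm256_loadu_ps (p);
  }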