Hi, I added X86_TUNE_AVX256_SPLIT_UNALIGNED_STORE_OPTIMAL to ix86_tune_indices and set m_COREI7, m_BDVER1 and m_GENERIC as the targets that enable it.
Is this OK? Thanks, Changpeng ________________________________________ From: H.J. Lu [hjl.to...@gmail.com] Sent: Friday, June 17, 2011 1:08 PM To: Fang, Changpeng Cc: Richard Guenther; gcc-patches@gcc.gnu.org Subject: Re: [PATCH, PR 49089] Don't split AVX256 unaligned loads by default on bdver1 and generic On Fri, Jun 17, 2011 at 10:45 AM, Fang, Changpeng <changpeng.f...@amd.com> wrote: >>Why not just move AVX256_SPLIT_UNALIGNED_STORE >>and AVX256_SPLIT_UNALIGNED_LOAD to ix86_tune_indices? > > I would like to keep the -m option so that at least we can explicitly turn > off the splittings when regressions are found! I prefer to implement it the same way as: x86_accumulate_outgoing_args x86_arch_always_fancy_math_387 > By the way, I can add an index for store splitting, if you want. > Yes, please. -- H.J.
From 91e715213bb37d089cb490e769b115d1d131918f Mon Sep 17 00:00:00 2001 From: Changpeng Fang <chfang@huainan.(none)> Date: Mon, 13 Jun 2011 13:13:32 -0700 Subject: [PATCH 2/2] pr49089: enable avx256 splitting unaligned load/store only when beneficial * config/i386/i386.h (ix86_tune_indices): Introduce X86_TUNE_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL and X86_TUNE_AVX256_SPLIT_UNALIGNED_STORE_OPTIMAL. (TARGET_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL): New definition. (TARGET_AVX256_SPLIT_UNALIGNED_STORE_OPTIMAL): New definition. * config/i386/i386.c (ix86_tune_features): Add entries for X86_TUNE_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL and X86_TUNE_AVX256_SPLIT_UNALIGNED_STORE_OPTIMAL. (ix86_option_override_internal): Enable avx256 unaligned load(store) splitting when TARGET_AVX256_SPLIT_UNALIGNED_LOAD(STORE)_OPTIMAL are set. --- gcc/config/i386/i386.c | 17 ++++++++++++++--- gcc/config/i386/i386.h | 4 ++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 7b266b9..b50d349 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -2088,7 +2088,16 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { /* X86_SOFTARE_PREFETCHING_BENEFICIAL: Enable software prefetching at -O3. For the moment, the prefetching seems badly tuned for Intel chips. */ - m_K6_GEODE | m_AMD_MULTIPLE + m_K6_GEODE | m_AMD_MULTIPLE, + + /* X86_TUNE_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL: Enable splitting 256-bit + unaligned load. It hurts the performance on Bulldozer. We need to + re-tune the generic options for current cpus! */ + m_COREI7 | m_GENERIC, + + /* X86_TUNE_AVX256_SPLIT_UNALIGNED_STORE_OPTIMAL: Enable splitting 256-bit + unaligned store. */ + m_COREI7 | m_BDVER1 | m_GENERIC }; /* Feature tests against the various architecture variations. 
*/ @@ -4194,9 +4203,11 @@ ix86_option_override_internal (bool main_args_p) if (flag_expensive_optimizations && !(target_flags_explicit & MASK_VZEROUPPER)) target_flags |= MASK_VZEROUPPER; - if (!(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD)) + if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL + && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD)) target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD; - if (!(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE)) + if (TARGET_AVX256_SPLIT_UNALIGNED_STORE_OPTIMAL + && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE)) target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE; } } diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 8badcbb..b6e5570 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -312,6 +312,8 @@ enum ix86_tune_indices { X86_TUNE_OPT_AGU, X86_TUNE_VECTORIZE_DOUBLE, X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, + X86_TUNE_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL, + X86_TUNE_AVX256_SPLIT_UNALIGNED_STORE_OPTIMAL, X86_TUNE_LAST }; @@ -410,6 +412,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_VECTORIZE_DOUBLE] #define TARGET_SOFTWARE_PREFETCHING_BENEFICIAL \ ix86_tune_features[X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL] +#define TARGET_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL \ + ix86_tune_features[X86_TUNE_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL] /* Feature tests against the various architecture variations. */ enum ix86_arch_indices { -- 1.7.0.4