Hi, The patch ( http://gcc.gnu.org/ml/gcc-patches/2011-02/txt00059.txt ) introduced splitting of avx256 unaligned loads. However, we found that it causes significant regressions for cpu2006 ( http://gcc.gnu.org/bugzilla/show_bug.cgi?id=49089 ).
In this work, we introduce a tune option that enables splitting of unaligned loads by default only for those CPUs on which such splitting is beneficial. The patch passed bootstrapping and regression tests on an x86_64-unknown-linux-gnu system. Is it OK to commit? Thanks, Changpeng
From 415012803abf2cac95c067394504c55dd968f4f5 Mon Sep 17 00:00:00 2001 From: Changpeng Fang <chfang@huainan.(none)> Date: Mon, 13 Jun 2011 13:13:32 -0700 Subject: [PATCH] pr49089: enable avx256 splitting unaligned load only when beneficial * config/i386/i386.h (ix86_tune_indices): Introduce X86_TUNE_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL. (TARGET_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL): New definition. * config/i386/i386.c (ix86_tune_features): Add entry for X86_TUNE_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL. (ix86_option_override_internal): Enable avx256 unaligned load splitting only when TARGET_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL is set. --- gcc/config/i386/i386.c | 9 +++++++-- gcc/config/i386/i386.h | 3 +++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 7b266b9..d5f358f 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -2088,7 +2088,11 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { /* X86_SOFTARE_PREFETCHING_BENEFICIAL: Enable software prefetching at -O3. For the moment, the prefetching seems badly tuned for Intel chips. */ - m_K6_GEODE | m_AMD_MULTIPLE + m_K6_GEODE | m_AMD_MULTIPLE, + + /* X86_TUNE_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL: Enable splitting 256-bit + unaligned load. It hurts the performance on Bulldozer. */ + m_COREI7 }; /* Feature tests against the various architecture variations. 
*/ @@ -4194,7 +4198,8 @@ ix86_option_override_internal (bool main_args_p) if (flag_expensive_optimizations && !(target_flags_explicit & MASK_VZEROUPPER)) target_flags |= MASK_VZEROUPPER; - if (!(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD)) + if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL + && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD)) target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD; if (!(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE)) target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE; diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 8badcbb..b2a1bc8 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -312,6 +312,7 @@ enum ix86_tune_indices { X86_TUNE_OPT_AGU, X86_TUNE_VECTORIZE_DOUBLE, X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, + X86_TUNE_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL, X86_TUNE_LAST }; @@ -410,6 +411,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_VECTORIZE_DOUBLE] #define TARGET_SOFTWARE_PREFETCHING_BENEFICIAL \ ix86_tune_features[X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL] +#define TARGET_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL \ + ix86_tune_features[X86_TUNE_AVX256_SPLIT_UNALIGNED_LOAD_OPTIMAL] /* Feature tests against the various architecture variations. */ enum ix86_arch_indices { -- 1.7.0.4