Hi, this patch enables 128-bit AVX instruction generation in the auto-vectorizer for AMD Bulldozer machines. Enabling it gives an additional ~3% improvement on the Polyhedron 2005 and CPU2006 floating-point programs.
The patch bootstrapped successfully on an x86_64-unknown-linux-gnu system with Bulldozer cores. Is it OK to commit to trunk and backport to the 4.6 branch? Thanks, Changpeng
From b5015593b0b30b14783866ac68c2c5f2e014d206 Mon Sep 17 00:00:00 2001 From: Changpeng Fang <chfang@huainan.(none)> Date: Wed, 22 Jun 2011 15:03:05 -0700 Subject: [PATCH] Auto-vectorizer generates 128-bit AVX insns by default for bdver1 * config/i386/i386.opt (mprefer-avx128): Redefine the flag as a Mask option. * config/i386/i386.c (x86_prefer_avx128): New tune option definition. (ix86_option_override_internal): Enable the generation of the 128-bit instructions when x86_prefer_avx128 is set. (ix86_preferred_simd_mode): Use TARGET_PREFER_AVX128. (ix86_autovectorize_vector_sizes): Use TARGET_PREFER_AVX128. --- gcc/config/i386/i386.c | 13 ++++++++++--- gcc/config/i386/i386.opt | 2 +- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 014401b..1f5113f 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -2128,6 +2128,9 @@ static const unsigned int x86_avx256_split_unaligned_load static const unsigned int x86_avx256_split_unaligned_store = m_COREI7 | m_BDVER1 | m_GENERIC; +static const unsigned int x86_prefer_avx128 + = m_BDVER1; + /* In case the average insn count for single function invocation is lower than this constant, emit fast (but longer) prologue and epilogue code. 
*/ @@ -2623,6 +2626,7 @@ ix86_target_string (int isa, int flags, const char *arch, const char *tune, { "-mvzeroupper", MASK_VZEROUPPER }, { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD}, { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE}, + { "-mprefer-avx128", MASK_PREFER_AVX128}, }; const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2]; @@ -3672,6 +3676,9 @@ ix86_option_override_internal (bool main_args_p) if ((x86_avx256_split_unaligned_store & ix86_tune_mask) && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE)) target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE; + if ((x86_prefer_avx128 & ix86_tune_mask) + && !(target_flags_explicit & MASK_PREFER_AVX128)) + target_flags |= MASK_PREFER_AVX128; } } else @@ -34614,7 +34621,7 @@ ix86_preferred_simd_mode (enum machine_mode mode) return V2DImode; case SFmode: - if (TARGET_AVX && !flag_prefer_avx128) + if (TARGET_AVX && !TARGET_PREFER_AVX128) return V8SFmode; else return V4SFmode; @@ -34622,7 +34629,7 @@ ix86_preferred_simd_mode (enum machine_mode mode) case DFmode: if (!TARGET_VECTORIZE_DOUBLE) return word_mode; - else if (TARGET_AVX && !flag_prefer_avx128) + else if (TARGET_AVX && !TARGET_PREFER_AVX128) return V4DFmode; else if (TARGET_SSE2) return V2DFmode; @@ -34639,7 +34646,7 @@ ix86_preferred_simd_mode (enum machine_mode mode) static unsigned int ix86_autovectorize_vector_sizes (void) { - return (TARGET_AVX && !flag_prefer_avx128) ? 32 | 16 : 0; + return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0; } /* Initialize the GCC target structure. */ diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index 21e0def..9886b7b 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -388,7 +388,7 @@ Do dispatch scheduling if processor is bdver1 and Haifa scheduling is selected. 
mprefer-avx128 -Target Report Var(flag_prefer_avx128) Init(0) +Target Report Mask(PREFER_AVX128) SAVE Use 128-bit AVX instructions instead of 256-bit AVX instructions in the auto-vectorizer. ;; ISA support -- 1.7.0.4