This avoids spilling SSE registers to memory merely because vector components are accessed with C array-style indexing. The trick is simply to rewrite those accesses into proper vector selects (BIT_FIELD_REFs) at the tree level and promote the vector to SSA form.
Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk. Richard. 2011-03-15 Richard Guenther <rguent...@suse.de> PR tree-optimization/48037 * tree-ssa.c (maybe_rewrite_mem_ref_base): Rewrite vector selects into BIT_FIELD_REFs. (non_rewritable_mem_ref_base): Check if a MEM_REF is a vector select. * gcc.target/i386/pr48037-1.c: New testcase. Index: gcc/tree-ssa.c =================================================================== *** gcc/tree-ssa.c (revision 170776) --- gcc/tree-ssa.c (working copy) *************** maybe_rewrite_mem_ref_base (tree *tp) *** 1838,1855 **** tp = &TREE_OPERAND (*tp, 0); if (TREE_CODE (*tp) == MEM_REF && TREE_CODE (TREE_OPERAND (*tp, 0)) == ADDR_EXPR - && integer_zerop (TREE_OPERAND (*tp, 1)) && (sym = TREE_OPERAND (TREE_OPERAND (*tp, 0), 0)) && DECL_P (sym) && !TREE_ADDRESSABLE (sym) && symbol_marked_for_renaming (sym)) { ! if (!useless_type_conversion_p (TREE_TYPE (*tp), ! TREE_TYPE (sym))) ! *tp = build1 (VIEW_CONVERT_EXPR, ! TREE_TYPE (*tp), sym); ! else ! *tp = sym; } } --- 1838,1869 ---- tp = &TREE_OPERAND (*tp, 0); if (TREE_CODE (*tp) == MEM_REF && TREE_CODE (TREE_OPERAND (*tp, 0)) == ADDR_EXPR && (sym = TREE_OPERAND (TREE_OPERAND (*tp, 0), 0)) && DECL_P (sym) && !TREE_ADDRESSABLE (sym) && symbol_marked_for_renaming (sym)) { ! if (TREE_CODE (TREE_TYPE (sym)) == VECTOR_TYPE ! && useless_type_conversion_p (TREE_TYPE (*tp), ! TREE_TYPE (TREE_TYPE (sym))) ! && multiple_of_p (sizetype, TREE_OPERAND (*tp, 1), ! TYPE_SIZE_UNIT (TREE_TYPE (*tp)))) ! { ! *tp = build3 (BIT_FIELD_REF, TREE_TYPE (*tp), sym, ! TYPE_SIZE (TREE_TYPE (*tp)), ! int_const_binop (MULT_EXPR, ! bitsize_int (BITS_PER_UNIT), ! TREE_OPERAND (*tp, 1), 0)); ! } ! else if (integer_zerop (TREE_OPERAND (*tp, 1))) ! { ! if (!useless_type_conversion_p (TREE_TYPE (*tp), ! TREE_TYPE (sym))) ! *tp = build1 (VIEW_CONVERT_EXPR, ! TREE_TYPE (*tp), sym); ! else ! *tp = sym; ! 
} } } *************** non_rewritable_mem_ref_base (tree ref) *** 1869,1879 **** base = TREE_OPERAND (base, 0); /* But watch out for MEM_REFs we cannot lower to a ! VIEW_CONVERT_EXPR. */ if (TREE_CODE (base) == MEM_REF && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR) { tree decl = TREE_OPERAND (TREE_OPERAND (base, 0), 0); if (DECL_P (decl) && (!integer_zerop (TREE_OPERAND (base, 1)) || (DECL_SIZE (decl) --- 1883,1900 ---- base = TREE_OPERAND (base, 0); /* But watch out for MEM_REFs we cannot lower to a ! VIEW_CONVERT_EXPR or a BIT_FIELD_REF. */ if (TREE_CODE (base) == MEM_REF && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR) { tree decl = TREE_OPERAND (TREE_OPERAND (base, 0), 0); + if (TREE_CODE (TREE_TYPE (decl)) == VECTOR_TYPE + && useless_type_conversion_p (TREE_TYPE (base), + TREE_TYPE (TREE_TYPE (decl))) + && double_int_fits_in_uhwi_p (mem_ref_offset (base)) + && multiple_of_p (sizetype, TREE_OPERAND (base, 1), + TYPE_SIZE_UNIT (TREE_TYPE (base)))) + return NULL_TREE; if (DECL_P (decl) && (!integer_zerop (TREE_OPERAND (base, 1)) || (DECL_SIZE (decl) Index: gcc/testsuite/gcc.target/i386/pr48037-1.c =================================================================== *** gcc/testsuite/gcc.target/i386/pr48037-1.c (revision 0) --- gcc/testsuite/gcc.target/i386/pr48037-1.c (revision 0) *************** *** 0 **** --- 1,15 ---- + /* { dg-do compile } */ + /* { dg-require-effective-target lp64 } */ + /* { dg-options "-O -fno-math-errno" } */ + + typedef double __m128d __attribute__((vector_size(16))); + __m128d vsqrt1 (__m128d const x) + { + double const* __restrict__ const y = (double const*)&x; + double const a = __builtin_sqrt(y[0]); + double const b = __builtin_sqrt(y[1]); + return (__m128d) { a, b }; + } + + /* Verify we do not spill x to the stack. */ + /* { dg-final { scan-assembler-not "%rsp" } } */