Ping.

Thanks,
Wei.
On Wed, Apr 9, 2014 at 5:18 PM, Wei Mi <w...@google.com> wrote:
> Hi,
>
> For the testcase 1.c:
>
> #include <emmintrin.h>
>
> double a[1000];
>
> __m128d foo1() {
>   __m128d res;
>   res = _mm_load_sd(&a[1]);
>   res = _mm_loadh_pd(res, &a[2]);
>   return res;
> }
>
> llvm will merge movsd/movhpd to movupd while gcc will not.  The merge
> is beneficial on x86 machines starting from Nehalem.
>
> The patch adds the merging as define_peephole2 rules.  Bootstrap and
> regression tests pass.  Is it ok for stage1?
>
> Thanks,
> Wei.
>
> gcc/ChangeLog:
>
> 2014-04-09  Wei Mi  <w...@google.com>
>
>         * config/i386/i386.c (get_memref_parts): New function.
>         (adjacent_mem_locations): Ditto.
>         * config/i386/i386-protos.h: Add decl for adjacent_mem_locations.
>         * config/i386/sse.md: Add define_peephole2 rules.
>
> gcc/testsuite/ChangeLog:
>
> 2014-04-09  Wei Mi  <w...@google.com>
>
>         * gcc.target/i386/sse2-unaligned-mov.c: New test.
>
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index 6e32978..3ae0d6d 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -312,6 +312,7 @@ extern enum attr_cpu ix86_schedule;
>  #endif
>
>  extern const char * ix86_output_call_insn (rtx insn, rtx call_op);
> +extern bool adjacent_mem_locations (rtx mem1, rtx mem2);
>
>  #ifdef RTX_CODE
>  /* Target data for multipass lookahead scheduling.
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 3eefe4a..a330e84 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -46737,6 +46737,70 @@ ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
>                      atomic_feraiseexcept_call);
>  }
>
> +/* Try to determine BASE/OFFSET/SIZE parts of the given MEM.
> +   Return true if successful, false if all the values couldn't
> +   be determined.
> +
> +   This function only looks for REG/SYMBOL or REG/SYMBOL+CONST
> +   address forms.  */
> +
> +static bool
> +get_memref_parts (rtx mem, rtx *base, HOST_WIDE_INT *offset,
> +                  HOST_WIDE_INT *size)
> +{
> +  rtx addr_rtx;
> +  if (MEM_SIZE_KNOWN_P (mem))
> +    *size = MEM_SIZE (mem);
> +  else
> +    return false;
> +
> +  if (GET_CODE (XEXP (mem, 0)) == CONST)
> +    addr_rtx = XEXP (XEXP (mem, 0), 0);
> +  else
> +    addr_rtx = XEXP (mem, 0);
> +
> +  if (GET_CODE (addr_rtx) == REG
> +      || GET_CODE (addr_rtx) == SYMBOL_REF)
> +    {
> +      *base = addr_rtx;
> +      *offset = 0;
> +    }
> +  else if (GET_CODE (addr_rtx) == PLUS
> +           && CONST_INT_P (XEXP (addr_rtx, 1)))
> +    {
> +      *base = XEXP (addr_rtx, 0);
> +      *offset = INTVAL (XEXP (addr_rtx, 1));
> +    }
> +  else
> +    return false;
> +
> +  return true;
> +}
> +
> +/* If MEM1 is adjacent to MEM2 and MEM1 has lower address,
> +   return true.  */
> +
> +extern bool
> +adjacent_mem_locations (rtx mem1, rtx mem2)
> +{
> +  rtx base1, base2;
> +  HOST_WIDE_INT off1, size1, off2, size2;
> +
> +  if (get_memref_parts (mem1, &base1, &off1, &size1)
> +      && get_memref_parts (mem2, &base2, &off2, &size2))
> +    {
> +      if (GET_CODE (base1) == SYMBOL_REF
> +          && GET_CODE (base2) == SYMBOL_REF
> +          && SYMBOL_REF_DECL (base1) == SYMBOL_REF_DECL (base2))
> +        return (off1 + size1 == off2);
> +      else if (REG_P (base1)
> +               && REG_P (base2)
> +               && REGNO (base1) == REGNO (base2))
> +        return (off1 + size1 == off2);
> +    }
> +  return false;
> +}
> +
>  /* Initialize the GCC target structure.  */
>  #undef TARGET_RETURN_IN_MEMORY
>  #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 72a4d6d..4bf8461 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -15606,3 +15606,37 @@
>    [(set_attr "type" "sselog1")
>     (set_attr "length_immediate" "1")
>     (set_attr "mode" "TI")])
> +
> +;; merge movsd/movhpd to movupd when TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
> +;; is true.
> +(define_peephole2
> +  [(set (match_operand:DF 0 "register_operand")
> +        (match_operand:DF 1 "memory_operand"))
> +   (set (match_operand:V2DF 2 "register_operand")
> +        (vec_concat:V2DF (match_dup 0)
> +                         (match_operand:DF 3 "memory_operand")))]
> +  "TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
> +   && REGNO (operands[0]) == REGNO (operands[2])
> +   && adjacent_mem_locations (operands[1], operands[3])"
> +  [(set (match_dup 2)
> +        (unspec:V2DF [(match_dup 4)] UNSPEC_LOADU))]
> +{
> +  operands[4] = gen_rtx_MEM (V2DFmode, XEXP (operands[1], 0));
> +})
> +
> +;; merge movsd/movhpd to movupd when TARGET_SSE_UNALIGNED_STORE_OPTIMAL
> +;; is true.
> +(define_peephole2
> +  [(set (match_operand:DF 0 "memory_operand")
> +        (vec_select:DF (match_operand:V2DF 1 "register_operand")
> +                       (parallel [(const_int 0)])))
> +   (set (match_operand:DF 2 "memory_operand")
> +        (vec_select:DF (match_dup 1)
> +                       (parallel [(const_int 1)])))]
> +  "TARGET_SSE_UNALIGNED_STORE_OPTIMAL
> +   && adjacent_mem_locations (operands[0], operands[2])"
> +  [(set (match_dup 3)
> +        (unspec:V2DF [(match_dup 1)] UNSPEC_STOREU))]
> +{
> +  operands[3] = gen_rtx_MEM (V2DFmode, XEXP (operands[0], 0));
> +})
> diff --git a/gcc/testsuite/gcc.target/i386/sse2-unaligned-mov.c b/gcc/testsuite/gcc.target/i386/sse2-unaligned-mov.c
> new file mode 100644
> index 0000000..28470ce
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/sse2-unaligned-mov.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mtune=corei7 -O2" } */
> +
> +#include <emmintrin.h>
> +
> +double a[1000];
> +
> +__m128d foo1() {
> +  __m128d res;
> +  res = _mm_load_sd(&a[1]);
> +  res = _mm_loadh_pd(res, &a[2]);
> +  return res;
> +}
> +
> +void foo2(__m128d res) {
> +  _mm_store_sd(&a[1], res);
> +  _mm_storeh_pd(&a[2], res);
> +}
> +
> +/* { dg-final { scan-assembler-times "movup" 2 } } */
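
For reference, here is a rough sketch of a negative case (not part of the
patch; the array and function names are just for illustration).  The two
halves load b[1] and b[3], which are not adjacent doubles, so
adjacent_mem_locations sees off1 + size1 != off2 and the movsd/movhpd pair
should be left alone:

  #include <emmintrin.h>

  double b[1000];

  /* Sketch only: b[1] and b[3] are one element apart, so the load
     peephole above should not fire and no movupd should be emitted.  */
  __m128d foo3() {
    __m128d res;
    res = _mm_load_sd(&b[1]);
    res = _mm_loadh_pd(res, &b[3]);
    return res;
  }

If that seems useful, it could become an extra test with a
scan-assembler-not for movup.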