Hi, This patch corrects the various vsx_set_* and vsx_extract_* patterns to work correctly with little endian. For the most part this requires the usual "subtract from N-1" modification, where N is the number of elements.
Extracting element zero for big endian V2DI or V2DF mode is optimized using the scalar register equivalence. Since we can similarly optimize extraction of element one for little endian V2DI or V2DF mode, I added a variant that does this. I am not sure how useful this is, and we can remove it if you like. The existing testcase gcc.target/powerpc/pr48258-1.c fails when counting the number of occurrences of xxsldwi. It expects to see 6, but we generate 9 of them for LE. This is because there are three extracts of element zero of a V4SF in the testcase. The scalar equivalence allows us to avoid the xxsldwi in BE but not in LE. Therefore I've disabled this test for little endian. Bootstrapped and tested on powerpc64{,le}-unknown-linux-gnu with no regressions. Is this ok for trunk? Thanks, Bill gcc: 2013-11-20 Bill Schmidt <wschm...@linux.vnet.ibm.com> * config/rs6000/vsx.md (vsx_set_<mode>): Adjust for little endian. (vsx_extract_<mode>): Likewise. (*vsx_extract_<mode>_one_le): New LE variant on *vsx_extract_<mode>_zero. (vsx_extract_v4sf): Adjust for little endian. gcc/testsuite: 2013-11-20 Bill Schmidt <wschm...@linux.vnet.ibm.com> * gcc.target/powerpc/pr48258-1.c: Skip for little endian. 
Index: gcc/testsuite/gcc.target/powerpc/pr48258-1.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/pr48258-1.c (revision 205053) +++ gcc/testsuite/gcc.target/powerpc/pr48258-1.c (working copy) @@ -1,5 +1,6 @@ /* { dg-do compile } */ /* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */ +/* { dg-skip-if "" { powerpc*le-*-* } { "*" } { "" } } */ /* { dg-require-effective-target powerpc_vsx_ok } */ /* { dg-options "-O3 -mcpu=power7 -mabi=altivec -ffast-math -fno-unroll-loops" } */ /* { dg-final { scan-assembler-times "xvaddsp" 3 } } */ Index: gcc/config/rs6000/vsx.md =================================================================== --- gcc/config/rs6000/vsx.md (revision 205053) +++ gcc/config/rs6000/vsx.md (working copy) @@ -1497,9 +1497,10 @@ UNSPEC_VSX_SET))] "VECTOR_MEM_VSX_P (<MODE>mode)" { - if (INTVAL (operands[3]) == 0) + int idx_first = BYTES_BIG_ENDIAN ? 0 : 1; + if (INTVAL (operands[3]) == idx_first) return \"xxpermdi %x0,%x2,%x1,1\"; - else if (INTVAL (operands[3]) == 1) + else if (INTVAL (operands[3]) == 1 - idx_first) return \"xxpermdi %x0,%x1,%x2,0\"; else gcc_unreachable (); @@ -1514,8 +1515,12 @@ [(match_operand:QI 2 "u5bit_cint_operand" "i,i,i")])))] "VECTOR_MEM_VSX_P (<MODE>mode)" { + int fldDM; gcc_assert (UINTVAL (operands[2]) <= 1); - operands[3] = GEN_INT (INTVAL (operands[2]) << 1); + fldDM = INTVAL (operands[2]) << 1; + if (!BYTES_BIG_ENDIAN) + fldDM = 3 - fldDM; + operands[3] = GEN_INT (fldDM); return \"xxpermdi %x0,%x1,%x1,%3\"; } [(set_attr "type" "vecperm")]) @@ -1535,6 +1540,21 @@ (const_string "fpload"))) (set_attr "length" "4")]) +;; Optimize extracting element 1 from memory for little endian +(define_insn "*vsx_extract_<mode>_one_le" + [(set (match_operand:<VS_scalar> 0 "vsx_register_operand" "=ws,d,?wa") + (vec_select:<VS_scalar> + (match_operand:VSX_D 1 "indexed_or_indirect_operand" "Z,Z,Z") + (parallel [(const_int 1)])))] + "VECTOR_MEM_VSX_P (<MODE>mode) && 
!WORDS_BIG_ENDIAN" + "lxsd%U1x %x0,%y1" + [(set (attr "type") + (if_then_else + (match_test "update_indexed_address_mem (operands[1], VOIDmode)") + (const_string "fpload_ux") + (const_string "fpload"))) + (set_attr "length" "4")]) + ;; Extract a SF element from V4SF (define_insn_and_split "vsx_extract_v4sf" [(set (match_operand:SF 0 "vsx_register_operand" "=f,f") @@ -1555,7 +1575,7 @@ rtx op2 = operands[2]; rtx op3 = operands[3]; rtx tmp; - HOST_WIDE_INT ele = INTVAL (op2); + HOST_WIDE_INT ele = BYTES_BIG_ENDIAN ? INTVAL (op2) : 3 - INTVAL (op2); if (ele == 0) tmp = op1;