On Wed, Jun 21, 2017 at 11:49:07AM +0100, James Greenhalgh wrote:
> *ping*

*ping* x2

Thanks,
James

> On Mon, Jun 12, 2017 at 02:44:40PM +0100, James Greenhalgh wrote:
> > [Sorry for the re-send. I spotted that the attributes were not right for
> >  the new pattern I was adding. The change between this and the first
> >  version was:
> > 
> >   +  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
> >   +   (set_attr "length" "4,4,4,12")]
> > ]
> > 
> > ---
> > 
> > Hi,
> > 
> > In this testcase, all argument registers and the return register
> > will be general purpose registers:
> > 
> >   long long
> >   foo (long long a, long long b, long long c)
> >   {
> >     return ((a ^ b) & c) ^ b;
> >   }
> > 
> > However, due to the implementation of aarch64_simd_bsl<mode>_internal
> > we'll match that pattern and emit a BSL, necessitating moving all those
> > arguments and results to the Advanced SIMD registers:
> > 
> >     fmov    d2, x0
> >     fmov    d0, x2
> >     fmov    d1, x1
> >     bsl     v0.8b, v2.8b, v1.8b
> >     fmov    x0, d0
> > 
> > To fix this, we turn aarch64_simd_bsldi_internal into an insn_and_split
> > that knows to split back to integer operations if the register
> > allocation falls that way.
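> > 
> > With the split applied, the testcase above compiles to the pure integer
> > sequence (this is what the new no-dimode-bsl.c test checks for):
> > 
> >     eor     x0, x0, x1
> >     and     x0, x0, x2
> >     eor     x0, x0, x1
> >     ret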
> > 
> > We could have used an unspec, but then we lose some of the nice
> > simplifications that can be made from explicitly spelling out the semantics
> > of BSL.
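> > 
> > For reference, here is a minimal C sketch (not part of the patch; the
> > function names are mine) of the bit-select semantics the pattern spells
> > out, and of the kind of simplification keeping them explicit allows:
> > 
> >   #include <stdint.h>
> > 
> >   /* BSL: select bits from A where MASK is set, from B elsewhere.
> >      This is exactly the ((a ^ b) & mask) ^ b shape in the pattern.  */
> >   static inline uint64_t
> >   bsl (uint64_t mask, uint64_t a, uint64_t b)
> >   {
> >     return ((a ^ b) & mask) ^ b;
> >   }
> > 
> >   /* copysign at the bit level: sign bit from B, magnitude from A.
> >      Because the semantics are explicit rather than an UNSPEC, combine
> >      can recognise this shape, which is what the dimode-bsl.c test
> >      below relies on.  */
> >   uint64_t
> >   copysign_bits (uint64_t a, uint64_t b)
> >   {
> >     return bsl (UINT64_C (0x8000000000000000), b, a);
> >   }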
> > 
> > Bootstrapped on aarch64-none-linux-gnu.
> > 
> > OK?
> > 
> > Thanks,
> > James
> > 
> > ---
> > gcc/
> > 
> > 2017-06-12  James Greenhalgh  <james.greenha...@arm.com>
> > 
> >     * config/aarch64/aarch64-simd.md
> >     (aarch64_simd_bsl<mode>_internal): Remove DImode.
> >     (*aarch64_simd_bsl<mode>_alt): Likewise.
> >     (aarch64_simd_bsldi_internal): New.
> > 
> > gcc/testsuite/
> > 
> > 2017-06-12  James Greenhalgh  <james.greenha...@arm.com>
> > 
> >     * gcc.target/aarch64/no-dimode-bsl.c: New.
> >     * gcc.target/aarch64/dimode-bsl.c: New.
> > 
> 
> > diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> > index c5a86ff..7b6b12f 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -2256,13 +2256,13 @@
> >  ;; in *aarch64_simd_bsl<mode>_alt.
> >  
> >  (define_insn "aarch64_simd_bsl<mode>_internal"
> > -  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
> > -   (xor:VSDQ_I_DI
> > -      (and:VSDQ_I_DI
> > -        (xor:VSDQ_I_DI
> > +  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
> > +   (xor:VDQ_I
> > +      (and:VDQ_I
> > +        (xor:VDQ_I
> >            (match_operand:<V_cmp_result> 3 "register_operand" "w,0,w")
> > -          (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0"))
> > -        (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
> > +          (match_operand:VDQ_I 2 "register_operand" "w,w,0"))
> > +        (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
> >       (match_dup:<V_cmp_result> 3)
> >     ))]
> >    "TARGET_SIMD"
> > @@ -2280,14 +2280,14 @@
> >  ;; permutations of commutative operations, we have to have a separate pattern.
> >  
> >  (define_insn "*aarch64_simd_bsl<mode>_alt"
> > -  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
> > -   (xor:VSDQ_I_DI
> > -      (and:VSDQ_I_DI
> > -        (xor:VSDQ_I_DI
> > -          (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0")
> > -          (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w"))
> > -         (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
> > -     (match_dup:VSDQ_I_DI 2)))]
> > +  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
> > +   (xor:VDQ_I
> > +      (and:VDQ_I
> > +        (xor:VDQ_I
> > +          (match_operand:VDQ_I 3 "register_operand" "w,w,0")
> > +          (match_operand:VDQ_I 2 "register_operand" "w,0,w"))
> > +         (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
> > +     (match_dup:VDQ_I 2)))]
> >    "TARGET_SIMD"
> >    "@
> >    bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype>
> > @@ -2296,6 +2296,45 @@
> >    [(set_attr "type" "neon_bsl<q>")]
> >  )
> >  
> > +;; DImode is special, we want to avoid computing operations which are
> > +;; more naturally computed in general purpose registers in the vector
> > +;; registers.  If we do that, we need to move all three operands from general
> > +;; purpose registers to vector registers, then back again.  However, we
> > +;; don't want to make this pattern an UNSPEC as we'd lose scope for
> > +;; optimizations based on the component operations of a BSL.
> > +;;
> > +;; That means we need a splitter back to the individual operations, if they
> > +;; would be better calculated on the integer side.
> > +
> > +(define_insn_and_split "aarch64_simd_bsldi_internal"
> > +  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
> > +   (xor:DI
> > +      (and:DI
> > +        (xor:DI
> > +          (match_operand:DI 3 "register_operand" "w,0,w,r")
> > +          (match_operand:DI 2 "register_operand" "w,w,0,r"))
> > +        (match_operand:DI 1 "register_operand" "0,w,w,r"))
> > +     (match_dup:DI 3)
> > +   ))]
> > +  "TARGET_SIMD"
> > +  "@
> > +  bsl\\t%0.8b, %2.8b, %3.8b
> > +  bit\\t%0.8b, %2.8b, %1.8b
> > +  bif\\t%0.8b, %3.8b, %1.8b
> > +  #"
> > +  "&& GP_REGNUM_P (REGNO (operands[0]))"
> > +  [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)]
> > +{
> > +  /* Split back to individual operations.  */
> > +  emit_insn (gen_xordi3 (operands[0], operands[2], operands[3]));
> > +  emit_insn (gen_anddi3 (operands[0], operands[0], operands[1]));
> > +  emit_insn (gen_xordi3 (operands[0], operands[0], operands[3]));
> > +  DONE;
> > +}
> > +  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
> > +   (set_attr "length" "4,4,4,12")]
> > +)
> > +
> >  (define_expand "aarch64_simd_bsl<mode>"
> >    [(match_operand:VALLDIF 0 "register_operand")
> >     (match_operand:<V_cmp_result> 1 "register_operand")
> > diff --git a/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c
> > new file mode 100644
> > index 0000000..4e63511
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c
> > @@ -0,0 +1,13 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +
> > +/* Test that we can generate DImode BSL when we are using
> > +   copysign.  */
> > +
> > +double
> > +foo (double a, double b)
> > +{
> > +  return __builtin_copysign (a, b);
> > +}
> > +
> > +/* { dg-final { scan-assembler "bsl\tv\[0-9\]" } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c
> > new file mode 100644
> > index 0000000..67dfda0
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +
> > +/* Test that we don't combine to BSL when in DImode, avoiding register
> > +   moves in the general case.
> > +
> > +   We want:
> > +   eor     x0, x0, x1
> > +   and     x0, x0, x2
> > +   eor     x0, x0, x1
> > +   ret
> > +
> > +   Rather than:
> > +   fmov    d2, x0
> > +   fmov    d0, x2
> > +   fmov    d1, x1
> > +   bsl     v0.8b, v2.8b, v1.8b
> > +   fmov    x0, d0
> > +   ret  */
> > +
> > +long long
> > +foo (long long a, long long b, long long c)
> > +{
> > +  return ((a ^ b) & c) ^ b;
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "bsl\tv\[0-9\]" } } */
> > +/* { dg-final { scan-assembler-not "bif\tv\[0-9\]" } } */
> > +/* { dg-final { scan-assembler-not "bit\tv\[0-9\]" } } */
> > +/* { dg-final { scan-assembler-not "fmov\td\[0-9\]" } } */
> 
