On Tue, Feb 22, 2022 at 12:46 AM Jakub Jelinek <ja...@redhat.com> wrote:
>
> Hi!
>
> We ICE on the following testcase for -m32 since r12-3435. because
> operands[2] is (subreg:SF (reg:DI ...) 0) and
According to validate_subreg, (subreg:V4SF (reg:DI ...) 0) should be
valid(but not sure if it really works )

For -m64 part:
I saw operands[2] is (subreg:SF (reg:SI ..) 0).
It's a bit weird that (subreg:V4SF (reg:SF ..) 0) and (subreg:SF
(reg:SI 0)) are legal but (subreg:V4SF (reg:SI ..) not, and last time
I tried to remove those weird codes in validate_subreg but caused a
lot of regression[1].
And it seems we need to be aware of all pre_reload paradoxical subregs
of V*{SF,HF}mode, op, {SF,HF}mode).

 940  /* ??? x86 sse code makes heavy use of *paradoxical* vector subregs,
 941     i.e. (subreg:V4SF (reg:SF) 0) or (subreg:V4SF (reg:V2SF) 0).  This
 942     surely isn't the cleanest way to represent this.  It's questionable
 943     if this ought to be represented at all -- why can't this all be hidden
 944     in post-reload splitters that make arbitrarily mode changes to the
 945     registers themselves.  */
 946  else if (VECTOR_MODE_P (omode)
 947           && GET_MODE_INNER (omode) == GET_MODE_INNER (imode))

Also i;m thinking about(too risky at stage4, leave it to GCC13 stage
1) adjusting GET_MODE_INNER (omode) == GET_MODE_INNER (imode) from the
same mode to the same size.
The same size looks a little more logically coherent.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2021-August/578186.html

> lowpart_subreg (V4SFmode, operands[2], SFmode)
> returns NULL, and that is what we use in AND etc. insns we emit.
>
> The following patch (non-attached) fixes that by calling force_reg for the
> input operands, to make sure they are really REGs and so lowpart_subreg
> will succeed on them - even for theoretical MEMs using REGs there seems
> desirable, we don't want to read following memory slots for the paradoxical
> subreg.  For the outputs, I thought we'd get better code by always computing
> result into a new pseudo and them move lowpart of that pseudo into dest.
>
> I've bootstrapped/regtested this version on x86_64-linux and i686-linux,
> unfortunately it regressed
> FAIL: gcc.target/i386/pr89984-2.c scan-assembler-not vmovaps
> on which the patch changes:
>         vandps  .LC0(%rip), %xmm1, %xmm1
> -       vxorps  %xmm0, %xmm1, %xmm0
> +       vxorps  %xmm0, %xmm1, %xmm1
> +       vmovaps %xmm1, %xmm0
>         ret
> The RA sees:
> (insn 8 4 9 2 (set (reg:V4SF 85)
>         (and:V4SF (subreg:V4SF (reg:SF 90) 0)
>             (mem/u/c:V4SF (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0  S16 
> A128]))) "pr89984-2.c":7:12 2838 {*andv4sf3}
>      (expr_list:REG_DEAD (reg:SF 90)
>         (nil)))
> (insn 9 8 10 2 (set (reg:V4SF 87)
>         (xor:V4SF (reg:V4SF 85)
>             (subreg:V4SF (reg:SF 89) 0))) "pr89984-2.c":7:12 2842 {*xorv4sf3}
>      (expr_list:REG_DEAD (reg:SF 89)
>         (expr_list:REG_DEAD (reg:V4SF 85)
>             (nil))))
> (insn 10 9 14 2 (set (reg:SF 82 [ <retval> ])
>         (subreg:SF (reg:V4SF 87) 0)) "pr89984-2.c":7:12 142 {*movsf_internal}
>      (expr_list:REG_DEAD (reg:V4SF 87)
>         (nil)))
> (insn 14 10 15 2 (set (reg/i:SF 20 xmm0)
>         (reg:SF 82 [ <retval> ])) "pr89984-2.c":8:1 142 {*movsf_internal}
>      (expr_list:REG_DEAD (reg:SF 82 [ <retval> ])
>         (nil)))
> (insn 15 14 0 2 (use (reg/i:SF 20 xmm0)) "pr89984-2.c":8:1 -1
>      (nil))
> and doesn't know that if it would use xmm0 not just for pseudo 82
> but also for pseudo 87, it could create a noop move in insn 10 and
> so could avoid an extra register copy and nothing later on is able
> to figure that out either.  I don't know how the RA should know
> that though.
>
> Anyway, so that we don't regress, I have an alternative patch in attachment,
> which will do this stuff (i.e. use fresh vector pseudo as destination and
> then move lowpart of that to dest) over what it used before (i.e.
> use paradoxical subreg of the dest) only if lowpart_subreg returns NULL.
>
> Ok for trunk if the attached version passes bootstrap/regtest?
>
> 2022-02-21  Jakub Jelinek  <ja...@redhat.com>
>
>         PR target/104612
>         * config/i386/i386-expand.cc (ix86_expand_copysign): Call force_reg
>         on input operands before calling lowpart_subreg on it.  For output
>         operand, use a vmode pseudo as destination and then move its lowpart
>         subreg into operands[0].
>         (ix86_expand_xorsign): Likewise.
>
>         * gcc.dg/pr104612.c: New test.
>
> --- gcc/config/i386/i386-expand.cc.jj   2022-02-09 20:45:03.463499205 +0100
> +++ gcc/config/i386/i386-expand.cc      2022-02-21 13:14:31.756657743 +0100
> @@ -2153,7 +2153,7 @@ void
>  ix86_expand_copysign (rtx operands[])
>  {
>    machine_mode mode, vmode;
> -  rtx dest, op0, op1, mask, op2, op3;
> +  rtx dest, vdest, op0, op1, mask, op2, op3;
>
>    mode = GET_MODE (operands[0]);
>
> @@ -2174,8 +2174,9 @@ ix86_expand_copysign (rtx operands[])
>        return;
>      }
>
> -  dest = lowpart_subreg (vmode, operands[0], mode);
> -  op1 = lowpart_subreg (vmode, operands[2], mode);
> +  dest = operands[0];
> +  vdest = gen_reg_rtx (vmode);
> +  op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
>    mask = ix86_build_signbit_mask (vmode, 0, 0);
>
>    if (CONST_DOUBLE_P (operands[1]))
> @@ -2184,7 +2185,8 @@ ix86_expand_copysign (rtx operands[])
>        /* Optimize for 0, simplify b = copy_signf (0.0f, a) to b = mask & a.  
> */
>        if (op0 == CONST0_RTX (mode))
>         {
> -         emit_move_insn (dest, gen_rtx_AND (vmode, mask, op1));
> +         emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
> +         emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
>           return;
>         }
>
> @@ -2193,7 +2195,7 @@ ix86_expand_copysign (rtx operands[])
>        op0 = force_reg (vmode, op0);
>      }
>    else
> -    op0 = lowpart_subreg (vmode, operands[1], mode);
> +    op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
>
>    op2 = gen_reg_rtx (vmode);
>    op3 = gen_reg_rtx (vmode);
> @@ -2201,7 +2203,8 @@ ix86_expand_copysign (rtx operands[])
>                                     gen_rtx_NOT (vmode, mask),
>                                     op0));
>    emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
> -  emit_move_insn (dest, gen_rtx_IOR (vmode, op2, op3));
> +  emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
> +  emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
>  }
>
>  /* Expand an xorsign operation.  */
> @@ -2210,7 +2213,7 @@ void
>  ix86_expand_xorsign (rtx operands[])
>  {
>    machine_mode mode, vmode;
> -  rtx dest, op0, op1, mask, x, temp;
> +  rtx dest, vdest, op0, op1, mask, x, temp;
>
>    dest = operands[0];
>    op0 = operands[1];
> @@ -2230,15 +2233,17 @@ ix86_expand_xorsign (rtx operands[])
>    temp = gen_reg_rtx (vmode);
>    mask = ix86_build_signbit_mask (vmode, 0, 0);
>
> -  op1 = lowpart_subreg (vmode, op1, mode);
> +  op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
>    x = gen_rtx_AND (vmode, op1, mask);
>    emit_insn (gen_rtx_SET (temp, x));
>
> -  op0 = lowpart_subreg (vmode, op0, mode);
> +  op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
>    x = gen_rtx_XOR (vmode, temp, op0);
>
> -  dest = lowpart_subreg (vmode, dest, mode);
> -  emit_insn (gen_rtx_SET (dest, x));
> +  vdest = gen_reg_rtx (vmode);
> +  emit_insn (gen_rtx_SET (vdest, x));
> +
> +  emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
>  }
>
>  static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
> --- gcc/testsuite/gcc.dg/pr104612.c.jj  2022-02-21 13:26:41.134606451 +0100
> +++ gcc/testsuite/gcc.dg/pr104612.c     2022-02-21 13:26:18.247922789 +0100
> @@ -0,0 +1,27 @@
> +/* PR target/104612 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-additional-options "-msse2 -mfpmath=sse" { target i?86-*-* 
> x86_64-*-* } } */
> +
> +struct V { float x, y; };
> +
> +struct V
> +foo (struct V v)
> +{
> +  struct V ret;
> +  ret.x = __builtin_copysignf (1.0e+0, v.x);
> +  ret.y = __builtin_copysignf (1.0e+0, v.y);
> +  return ret;
> +}
> +
> +float
> +bar (struct V v)
> +{
> +  return __builtin_copysignf (v.x, v.y);
> +}
> +
> +float
> +baz (struct V v)
> +{
> +  return v.x * __builtin_copysignf (1.0f, v.y);
> +}
>
>         Jakub



--
BR,
Hongtao

Reply via email to