Re: [PATCH, rev2], Fix PR target/pr83862: Fix PowerPC long double signbit with -mabi=ieeelongdouble

Michael Meissner Sun, 21 Jan 2018 16:05:15 -0800

I've reworked the patch for PR83862.  The bug is that the compiler issues an
internal error on a little endian system for code of the form:


    int sb (_Float128 *p, unsigned long n) { return __builtin_signbit (p[n]); }

The problem is that the memory optimization wants to load the high double word
from memory to isolate the signbit.  In little endian, the high word is at
offset 8 instead of 0.

The bug will also show up if you use the long double type, and you use the
-mabi=ieeelongdouble option.

Previously the code did one UNSPEC (signbit<mode>2_dm) that was done before
register allocation, and combined the value being in vector register, memory,
and GPR register along with the shift to isolate the signbit.  Then after the
split after register allocation, it created a second UNSPEC
(signbit<mode>2_dm2) that was just the direct move, or it did the memory and
GPR code path separately.

With these patches, the generator code issues an UNSPEC (signbit<mode>2_dm)
just to get the high double word, and the shift right is then emitted.  The
UNSPEC only handles the value being in either vector or GPR register.  There is
a second UNSPEC that is created by the combiner if the value is in memory.  On
little endian systems, the first split pass (before register allocation) will
allocate a pseudo register to hold the initial ADD of the base and index
registers for indexed loads, and then forms a LD reg,8(tmp) to load the high
word.  In case some pass after register allocation re-forms the UNSPEC, the
destination is constrained to be a base register, so the split can reuse it as
the temporary for the address.

I have run this code on both little endian and big endian power8 systems, doing
bootstrap and regression test.  There were no regressions.  Can I install this
patch on the trunk?

I don't believe the bug shows up in the gcc 6/7 branches, but I will build
these and test to see whether the patch needs to be backported.

[gcc]
2018-01-21  Michael Meissner  <meiss...@linux.vnet.ibm.com>

        PR target/83862
        * config/rs6000/rs6000-protos.h (rs6000_split_signbit): Delete,
        no longer used.
        * config/rs6000/rs6000.c (rs6000_split_signbit): Likewise.
        * config/rs6000/rs6000.md (signbit<mode>2): Change code for IEEE
        128-bit to produce an UNSPEC move to get the double word with the
        signbit and then a shift directly to do signbit.
        (signbit<mode>2_dm): Replace old IEEE 128-bit signbit
        implementation with a new version that just does either a direct
        move or a regular move.  Move memory interface to separate insns.
        Move insns so they are next to the expander.
        (signbit<mode>2_dm_mem_be): New combiner insns to combine load
        with signbit move.  Split big and little endian case.
        (signbit<mode>2_dm_mem_le): Likewise.
        (signbit<mode>2_dm_<su>ext): Delete, no longer used.
        (signbit<mode>2_dm2): Likewise.

[gcc/testsuite]
2018-01-22  Michael Meissner  <meiss...@linux.vnet.ibm.com>

        PR target/83862
        * gcc.target/powerpc/pr83862.c: New test.

-- 
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA
email: meiss...@linux.vnet.ibm.com, phone: +1 (978) 899-4797

Index: gcc/config/rs6000/rs6000-protos.h
===================================================================
--- gcc/config/rs6000/rs6000-protos.h   (revision 256866)
+++ gcc/config/rs6000/rs6000-protos.h   (working copy)
@@ -132,7 +132,6 @@ extern int rs6000_emit_cmove (rtx, rtx, 
 extern int rs6000_emit_int_cmove (rtx, rtx, rtx, rtx);
 extern int rs6000_emit_vector_cond_expr (rtx, rtx, rtx, rtx, rtx, rtx);
 extern void rs6000_emit_minmax (rtx, enum rtx_code, rtx, rtx);
-extern void rs6000_split_signbit (rtx, rtx);
 extern void rs6000_expand_atomic_compare_and_swap (rtx op[]);
 extern rtx swap_endian_selector_for_mode (machine_mode mode);
 
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c  (revision 256866)
+++ gcc/config/rs6000/rs6000.c  (working copy)
@@ -23404,49 +23404,6 @@ rs6000_emit_minmax (rtx dest, enum rtx_c
     emit_move_insn (dest, target);
 }
 
-/* Split a signbit operation on 64-bit machines with direct move.  Also allow
-   for the value to come from memory or if it is already loaded into a GPR.  */
-
-void
-rs6000_split_signbit (rtx dest, rtx src)
-{
-  machine_mode d_mode = GET_MODE (dest);
-  machine_mode s_mode = GET_MODE (src);
-  rtx dest_di = (d_mode == DImode) ? dest : gen_lowpart (DImode, dest);
-  rtx shift_reg = dest_di;
-
-  gcc_assert (FLOAT128_IEEE_P (s_mode) && TARGET_POWERPC64);
-
-  if (MEM_P (src))
-    {
-      rtx mem = (WORDS_BIG_ENDIAN
-                ? adjust_address (src, DImode, 0)
-                : adjust_address (src, DImode, 8));
-      emit_insn (gen_rtx_SET (dest_di, mem));
-    }
-
-  else
-    {
-      unsigned int r = reg_or_subregno (src);
-
-      if (INT_REGNO_P (r))
-       shift_reg = gen_rtx_REG (DImode, r + (BYTES_BIG_ENDIAN == 0));
-
-      else
-       {
-         /* Generate the special mfvsrd instruction to get it in a GPR.  */
-         gcc_assert (VSX_REGNO_P (r));
-         if (s_mode == KFmode)
-           emit_insn (gen_signbitkf2_dm2 (dest_di, src));
-         else
-           emit_insn (gen_signbittf2_dm2 (dest_di, src));
-       }
-    }
-
-  emit_insn (gen_lshrdi3 (dest_di, shift_reg, GEN_INT (63)));
-  return;
-}
-
 /* A subroutine of the atomic operation splitters.  Jump to LABEL if
    COND is true.  Mark the jump as unlikely to be taken.  */
 
Index: gcc/config/rs6000/rs6000.md
===================================================================
--- gcc/config/rs6000/rs6000.md (revision 256866)
+++ gcc/config/rs6000/rs6000.md (working copy)
@@ -4779,12 +4779,19 @@ (define_expand "signbit<mode>2"
 {
   if (FLOAT128_IEEE_P (<MODE>mode))
     {
+      rtx dest = operands[0];
+      rtx src = operands[1];
+      rtx tmp = gen_reg_rtx (DImode);
+      rtx dest_di = gen_lowpart (DImode, dest);
+
       if (<MODE>mode == KFmode)
-       emit_insn (gen_signbitkf2_dm (operands[0], operands[1]));
+       emit_insn (gen_signbitkf2_dm (tmp, src));
       else if (<MODE>mode == TFmode)
-       emit_insn (gen_signbittf2_dm (operands[0], operands[1]));
+       emit_insn (gen_signbittf2_dm (tmp, src));
       else
        gcc_unreachable ();
+
+      emit_insn (gen_lshrdi3 (dest_di, tmp, GEN_INT (63)));
       DONE;
     }
   operands[2] = gen_reg_rtx (DFmode);
@@ -4805,6 +4812,77 @@ (define_expand "signbit<mode>2"
     }
 })
 
+;; Optimize IEEE 128-bit signbit on 64-bit systems with direct move to avoid
+;; multiple direct moves that would happen if we use the machine independent
+;; code (MFVSRD/MFVSRLD for ISA 3.0, and 2 MFVSRD's + 1-2 XXPERMDI's for ISA
+;; 2.07).
+;;
+;;  We can't use SUBREG:DI of the IEEE 128-bit value before register
+;; allocation, because KF/TFmode aren't tieable with DImode.  Also, having the
+;; 64-bit scalar part in the high end of the register instead of the low end
+;; also complicates things.  So instead, we use an UNSPEC to do the move of the
+;; high double word to isolate the signbit.
+;;
+;; After register allocation, if the _Float128 had originally been in GPRs, the
+;; split allows the post reload phases to eliminate the move, and do the shift
+;; directly with the register that contains the signbit.
+(define_insn_and_split "signbit<mode>2_dm"
+  [(set (match_operand:DI 0 "gpc_reg_operand" "=r,r")
+       (unspec:DI [(match_operand:SIGNBIT 1 "gpc_reg_operand" "wa,r")]
+                  UNSPEC_SIGNBIT))]
+  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+  "@
+   mfvsrd %0,%x1
+   #"
+  "&& reload_completed && int_reg_operand (operands[1], <MODE>mode)"
+  [(set (match_dup 0)
+       (match_dup 2))]
+{
+  operands[2] = gen_highpart (DImode, operands[1]);
+}
+ [(set_attr "type" "mftgpr,*")])
+
+;; Optimize IEEE 128-bit signbit to avoid loading the value into a vector
+;; register and then doing a direct move if the value comes from memory.  On
+;; little endian, we have to load the 2nd double-word to get the sign bit.
+;; Simple indirect loads and loads with integer constants can be done by
+;; adjusting the load.  Otherwise we have to issue an ADD to feed into the
+;; load.  Generally, the split will occur before the register allocation pass,
+;; and we will create a new temporary.  Otherwise, reuse the destination
+;; register if the split occurred after register allocation.
+(define_insn_and_split "*signbit<mode>2_dm_mem"
+  [(set (match_operand:DI 0 "gpc_reg_operand" "=b")
+       (unspec:DI [(match_operand:SIGNBIT 1 "memory_operand" "m")]
+                  UNSPEC_SIGNBIT))]
+  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+       (match_dup 2))]
+{
+  rtx dest = operands[0];
+  rtx src = operands[1];
+  rtx addr = XEXP (src, 0);
+
+  if (WORDS_BIG_ENDIAN)
+    operands[2] = adjust_address (src, DImode, 0);
+
+  else if (REG_P (addr) || SUBREG_P (addr))
+    operands[2] = adjust_address (src, DImode, 8);
+
+  else if (GET_CODE (addr) == PLUS && REG_P (XEXP (addr, 0))
+          && CONST_INT_P (XEXP (addr, 1)) && mem_operand_gpr (src, DImode))
+    operands[2] = adjust_address (src, DImode, 8);
+
+  else
+    {
+      rtx tmp = can_create_pseudo_p () ? gen_reg_rtx (DImode) : dest;
+      emit_insn (gen_rtx_SET (tmp, addr));
+      operands[2] = change_address (src, DImode,
+                                   gen_rtx_PLUS (DImode, tmp, GEN_INT (8)));
+    }
+})
+
 (define_expand "copysign<mode>3"
   [(set (match_dup 3)
         (abs:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "")))
@@ -4834,54 +4912,6 @@ (define_expand "copysign<mode>3"
    operands[5] = CONST0_RTX (<MODE>mode);
   })
 
-;; Optimize signbit on 64-bit systems with direct move to avoid doing the store
-;; and load.
-(define_insn_and_split "signbit<mode>2_dm"
-  [(set (match_operand:SI 0 "gpc_reg_operand" "=r,r,r")
-       (unspec:SI
-        [(match_operand:SIGNBIT 1 "input_operand" "wa,m,r")]
-        UNSPEC_SIGNBIT))]
-  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
-  "#"
-  "&& reload_completed"
-  [(const_int 0)]
-{
-  rs6000_split_signbit (operands[0], operands[1]);
-  DONE;
-}
- [(set_attr "length" "8,8,4")
-  (set_attr "type" "mftgpr,load,integer")])
-
-(define_insn_and_split "*signbit<mode>2_dm_<su>ext"
-  [(set (match_operand:DI 0 "gpc_reg_operand" "=r,r,r")
-       (any_extend:DI
-        (unspec:SI
-         [(match_operand:SIGNBIT 1 "input_operand" "wa,m,r")]
-         UNSPEC_SIGNBIT)))]
-  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
-  "#"
-  "&& reload_completed"
-  [(const_int 0)]
-{
-  rs6000_split_signbit (operands[0], operands[1]);
-  DONE;
-}
- [(set_attr "length" "8,8,4")
-  (set_attr "type" "mftgpr,load,integer")])
-
-;; TARGET_MODES_TIEABLE_P doesn't allow DImode to be tied with the various
-;; floating point types, which makes normal SUBREG's problematical.  Instead
-;; use a special pattern to avoid using a normal movdi.
-(define_insn "signbit<mode>2_dm2"
-  [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
-       (unspec:DI [(match_operand:SIGNBIT 1 "gpc_reg_operand" "wa")
-                   (const_int 0)]
-                  UNSPEC_SIGNBIT))]
-  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
-  "mfvsrd %0,%x1"
- [(set_attr "type" "mftgpr")])
-
-
 ;; Use an unspec rather providing an if-then-else in RTL, to prevent the
 ;; compiler from optimizing -0.0
 (define_insn "copysign<mode>3_fcpsgn"
Index: gcc/testsuite/gcc.target/powerpc/pr83862.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/pr83862.c  (revision 0)
+++ gcc/testsuite/gcc.target/powerpc/pr83862.c  (revision 0)
@@ -0,0 +1,34 @@
+/* PR target/83862.c */
+/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */
+/* { dg-require-effective-target ppc_float128_sw } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-mpower8-vector -O2 -mabi=ieeelongdouble -Wno-psabi" } */
+
+/* On little endian systems, optimizing signbit of IEEE 128-bit values from
+   memory could abort if the memory address was indexed (reg+reg).  The
+   optimization is only on 64-bit machines with direct move.
+
+   Compile with -g -O2 -mabi=ieeelongdouble -Wno-psabi.  */
+
+#ifndef TYPE
+#define TYPE long double
+#endif
+
+int sbr (TYPE a) { return __builtin_signbit (a); }
+int sbm (TYPE *a) { return __builtin_signbit (*a); }
+int sbo (TYPE *a) { return __builtin_signbit (a[4]); }
+int sbi (TYPE *a, unsigned long n) { return __builtin_signbit (a[n]); }
+void sbs (int *p, TYPE a) { *p = __builtin_signbit (a); }
+
+/* On big endian systems, this will generate 2 LDs and 1 LDX, while on
+   little endian systems, this will generate 3 LDs and an ADD.  */
+
+/* { dg-final { scan-assembler-times {\mldx?\M}    3 } } */
+/* { dg-final { scan-assembler-times {\mmfvsrd\M}  2 } } */
+/* { dg-final { scan-assembler-times {\msrdi\M}    5 } } */
+/* { dg-final { scan-assembler-not   {\mmfvsrld\M}   } } */
+/* { dg-final { scan-assembler-not   {\mstxvx?\M}    } } */
+/* { dg-final { scan-assembler-not   {\mstxvw4x\M}   } } */
+/* { dg-final { scan-assembler-not   {\mstxvd2x\M}   } } */
+/* { dg-final { scan-assembler-not   {\mstvx\M}      } } */
+

Re: [PATCH, rev2], Fix PR target/pr83862: Fix PowerPC long double signbit with -mabi=ieeelongdouble

Reply via email to