Hi!

The following patch fixes wrong-code on the following testcase extracted
from pseudo-RNG with e.g. -march=zEC12 -O2.
The problem is in the instruction emitted by the *r<noxa>sbg_sidi_srl
patterns.  We have in *.final correct:
(insn 67 65 68 (parallel [
            (set (reg:SI 1 %r1 [189])
                (xor:SI (subreg:SI (zero_extract:DI (reg/v:DI 11 %r11 [orig:89 
th ] [89])
                            (const_int 32 [0x20])
                            (const_int 24 [0x18])) 4)
                    (reg:SI 1 %r1 [187])))
            (clobber (reg:CC 33 %cc))
        ]) "pr89369.c":44:73 1419 {*rxsbg_sidi_srl}
     (expr_list:REG_DEAD (reg/v:DI 11 %r11 [orig:89 th ] [89])
        (expr_list:REG_UNUSED (reg:CC 33 %cc)
            (nil))))
which is effectively (reg:SI %r1) ^= (unsigned) ((reg:DI %r11) >> 8).
But the pattern emits rxsbg     %r1,%r11,40,63,56 which is effectively
(reg:SI %r1) ^= ((unsigned) ((reg:DI %r11) >> 8) & 0xffffff)
or equivalently
(reg:SI %r1) ^= ((reg:SI %r11) >> 8).  Even in the pattern one can see
that it wants to extract exactly 32 bits always, no matter what the shift
count is.  Fixed by always emitting 32,63,(32+pos from zero extract).
On that pr89369.c testcase, the patch also changes
-       rxsbg   %r12,%r9,64,63,32
-       rxsbg   %r12,%r1,64,63,32
+       rxsbg   %r12,%r9,32,63,32
+       rxsbg   %r12,%r1,32,63,32
and
-       rxsbg   %r1,%r3,64,63,32
+       rxsbg   %r1,%r3,32,63,32
which are all with zero_extract with len 32 and pos 0, so again, it wants
to extract the low 32 bits.  I3 64 larger than I4 63 is just weird.
The patch also changes the instructions emitted in rXsbg_mode_sXl.c:
-       rosbg   %r2,%r3,34,63,62
+       rosbg   %r2,%r3,32,63,62
and
-       rxsbg   %r2,%r3,34,63,62
+       rxsbg   %r2,%r3,32,63,62
Here, it is
__attribute__ ((noinline)) unsigned int
rosbg_si_srl (unsigned int a, unsigned int b)
{
  return a | (b >> 2);
}
__attribute__ ((noinline)) unsigned int
rxsbg_si_srl (unsigned int a, unsigned int b)
{
  return a ^ (b >> 2);
}
so from quick POV, one might think 34,63,62 is better, as we want to or in
just the 30 bits from b.  Both should actually work the same though, because
(subreg/s/v:SI (reg/v:DI 64 [ b+-4 ]) 4) - the b argument is passed zero
extended and so it doesn't really matter how many bits we extract, as long
as it is 30 or more.  If I try instead:
__attribute__ ((noinline, noipa)) unsigned int
rosbg_si_srl (unsigned int a, unsigned long long b)
{
  return a | ((unsigned) b >> 2);
}
__attribute__ ((noinline, noipa)) unsigned int
rxsbg_si_srl (unsigned int a, unsigned long long b)
{
  return a ^ ((unsigned) b >> 2);
}
then both the unpatched and patched compiler emit properly
        rosbg   %r2,%r3,34,63,62
and
        rxsbg   %r2,%r3,34,63,62
through a different pattern, because in that case we must not rely on the
upper 32 bits of b being zero.

In addition to this change, the patch adds a cleanup, there is no reason to
use a static buffer in each instruction and increase global state, we can
just tweak the arguments and let the caller deal with it.  That is something
normally done in other parts of the s390.md as well.  As small CONST_INTs
are hashed, it shouldn't increase compile time memory.

Bootstrapped/regtested on s390x-linux, ok for trunk?

2019-02-16  Jakub Jelinek  <ja...@redhat.com>

        PR target/89369
        * config/s390/s390.md (*r<noxa>sbg_<mode>_srl_bitmask,
        *r<noxa>sbg_<mode>_sll, *r<noxa>sbg_<mode>_srl): Don't construct
        pattern in a temporary buffer.
        (*r<noxa>sbg_sidi_srl): Likewise.  Always use 32 as I3 rather
        than 64-operands[2].

        * gcc.c-torture/execute/pr89369.c: New test.
        * gcc.target/s390/md/rXsbg_mode_sXl.c (rosbg_si_srl,
        rxsbg_si_srl): Expect last 3 operands 32,63,62 rather than
        34,63,62.

--- gcc/config/s390/s390.md.jj  2019-02-05 22:59:04.883503954 +0100
+++ gcc/config/s390/s390.md     2019-02-15 18:54:35.037131906 +0100
@@ -4263,10 +4263,8 @@ (define_insn "*r<noxa>sbg_<mode>_srl_bit
    && s390_extzv_shift_ok (<bitsize>, 64 - INTVAL (operands[3]),
                            INTVAL (operands[2]))"
   {
-    static char buffer[256];
-    sprintf (buffer, "r<noxa>sbg\t%%0,%%1,%%<bfstart>2,%%<bfend>2,%ld",
-             64 - INTVAL (operands[3]));
-    return buffer;
+    operands[3] = GEN_INT (64 - INTVAL (operands[3]));
+    return "r<noxa>sbg\t%0,%1,%<bfstart>2,%<bfend>2,%3";
   }
   [(set_attr "op_type" "RIE")])
 
@@ -4301,10 +4299,8 @@ (define_insn "*r<noxa>sbg_<mode>_sll"
    (clobber (reg:CC CC_REGNUM))]
   "TARGET_Z10"
   {
-    static char buffer[256];
-    sprintf (buffer, "r<noxa>sbg\t%%0,%%1,<bitoff>,%ld,%%2",
-             63 - INTVAL (operands[2]));
-    return buffer;
+    operands[3] = GEN_INT (63 - INTVAL (operands[2]));
+    return "r<noxa>sbg\t%0,%1,<bitoff>,%3,%2";
   }
   [(set_attr "op_type" "RIE")])
 
@@ -4322,10 +4318,9 @@ (define_insn "*r<noxa>sbg_<mode>_srl"
    (clobber (reg:CC CC_REGNUM))]
   "TARGET_Z10"
   {
-    static char buffer[256];
-    sprintf (buffer, "r<noxa>sbg\t%%0,%%1,%ld,63,%ld",
-             <bitoff_plus> INTVAL (operands[2]), 64 - INTVAL (operands[2]));
-    return buffer;
+    operands[3] = GEN_INT (64 - INTVAL (operands[2]));
+    operands[2] = GEN_INT (<bitoff_plus> INTVAL (operands[2]));
+    return "r<noxa>sbg\t%0,%1,%2,63,%3";
   }
   [(set_attr "op_type" "RIE")])
 
@@ -4343,10 +4338,8 @@ (define_insn "*r<noxa>sbg_sidi_srl"
    (clobber (reg:CC CC_REGNUM))]
   "TARGET_Z10"
   {
-    static char buffer[256];
-    sprintf (buffer, "r<noxa>sbg\t%%0,%%1,%ld,63,%ld",
-             64 - INTVAL (operands[2]), 32 + INTVAL (operands[2]));
-    return buffer;
+    operands[2] = GEN_INT (32 + INTVAL (operands[2]));
+    return "r<noxa>sbg\t%0,%1,32,63,%2";
   }
   [(set_attr "op_type" "RIE")])
 
--- gcc/testsuite/gcc.c-torture/execute/pr89369.c.jj    2019-02-15 
18:54:05.425620085 +0100
+++ gcc/testsuite/gcc.c-torture/execute/pr89369.c       2019-02-15 
18:53:40.801026052 +0100
@@ -0,0 +1,69 @@
+/* PR target/89369 */
+
+#if __SIZEOF_INT__ == 4 && __SIZEOF_LONG_LONG__ == 8 && __CHAR_BIT__ == 8
+struct S { unsigned int u[4]; };
+
+static void
+foo (struct S *out, struct S const *in, int shift)
+{
+  unsigned long long th, tl, oh, ol;
+  th = ((unsigned long long) in->u[3] << 32) | in->u[2];
+  tl = ((unsigned long long) in->u[1] << 32) | in->u[0];
+  oh = th >> (shift * 8);
+  ol = tl >> (shift * 8);
+  ol |= th << (64 - shift * 8);
+  out->u[1] = ol >> 32;
+  out->u[0] = ol;
+  out->u[3] = oh >> 32;
+  out->u[2] = oh;
+}
+
+static void
+bar (struct S *out, struct S const *in, int shift)
+{
+  unsigned long long th, tl, oh, ol;
+  th = ((unsigned long long) in->u[3] << 32) | in->u[2];
+  tl = ((unsigned long long) in->u[1] << 32) | in->u[0];
+  oh = th << (shift * 8);
+  ol = tl << (shift * 8);
+  oh |= tl >> (64 - shift * 8);
+  out->u[1] = ol >> 32;
+  out->u[0] = ol;
+  out->u[3] = oh >> 32;
+  out->u[2] = oh;
+}
+
+__attribute__((noipa)) static void
+baz (struct S *r, struct S *a, struct S *b, struct S *c, struct S *d)
+{
+  struct S x, y;
+  bar (&x, a, 1);
+  foo (&y, c, 1);
+  r->u[0] = a->u[0] ^ x.u[0] ^ ((b->u[0] >> 11) & 0xdfffffefU) ^ y.u[0] ^ 
(d->u[0] << 18);
+  r->u[1] = a->u[1] ^ x.u[1] ^ ((b->u[1] >> 11) & 0xddfecb7fU) ^ y.u[1] ^ 
(d->u[1] << 18);
+  r->u[2] = a->u[2] ^ x.u[2] ^ ((b->u[2] >> 11) & 0xbffaffffU) ^ y.u[2] ^ 
(d->u[2] << 18);
+  r->u[3] = a->u[3] ^ x.u[3] ^ ((b->u[3] >> 11) & 0xbffffff6U) ^ y.u[3] ^ 
(d->u[3] << 18);
+}
+
+int
+main ()
+{
+  struct S a[] = { { 0x000004d3, 0xbc5448db, 0xf22bde9f, 0xebb44f8f },
+                  { 0x03a32799, 0x60be8246, 0xa2d266ed, 0x7aa18536 },
+                  { 0x15a38518, 0xcf655ce1, 0xf3e09994, 0x50ef69fe },
+                  { 0x88274b07, 0xe7c94866, 0xc0ea9f47, 0xb6a83c43 },
+                  { 0xcd0d0032, 0x5d47f5d7, 0x5a0afbf6, 0xaea87b24 },
+                  { 0, 0, 0, 0 } };
+  baz (&a[5], &a[0], &a[1], &a[2], &a[3]);
+  if (a[4].u[0] != a[5].u[0] || a[4].u[1] != a[5].u[1]
+      || a[4].u[2] != a[5].u[2] || a[4].u[3] != a[5].u[3])
+    __builtin_abort ();
+  return 0;
+}
+#else
+int
+main ()
+{
+  return 0;
+}
+#endif
--- gcc/testsuite/gcc.target/s390/md/rXsbg_mode_sXl.c.jj        2018-11-16 
17:33:44.126195406 +0100
+++ gcc/testsuite/gcc.target/s390/md/rXsbg_mode_sXl.c   2019-02-16 
19:16:54.372512707 +0100
@@ -46,7 +46,7 @@ rosbg_si_srl (unsigned int a, unsigned i
 {
   return a | (b >> 2);
 }
-/* { dg-final { scan-assembler-times "rosbg\t%r.,%r.,34,63,62" 1 } } */
+/* { dg-final { scan-assembler-times "rosbg\t%r.,%r.,32,63,62" 1 } } */
 
 __attribute__ ((noinline)) unsigned int
 rxsbg_si_sll (unsigned int a, unsigned int b)
@@ -60,7 +60,7 @@ rxsbg_si_srl (unsigned int a, unsigned i
 {
   return a ^ (b >> 2);
 }
-/* { dg-final { scan-assembler-times "rxsbg\t%r.,%r.,34,63,62" 1 } } */
+/* { dg-final { scan-assembler-times "rxsbg\t%r.,%r.,32,63,62" 1 } } */
 
 __attribute__ ((noinline)) unsigned long long
 di_sll (unsigned long long x)

        Jakub

Reply via email to