https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113774
Jakub Jelinek <jakub at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
     Ever confirmed|0                           |1
                 CC|                            |rguenth at gcc dot gnu.org
             Status|UNCONFIRMED                 |NEW
   Last reconfirmed|                            |2024-02-07

--- Comment #2 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
/* PR tree-optimization/113774 */
/* { dg-do run { target bitint } } */
/* { dg-options "-std=c23 -pedantic-errors" } */
/* { dg-skip-if "" { ! run_expensive_tests } { "*" } { "-O0" "-O2" } } */
/* { dg-skip-if "" { ! run_expensive_tests } { "-flto" } { "" } } */

#if __BITINT_MAXWIDTH__ >= 512
unsigned _BitInt(512) u;
unsigned _BitInt(512) v;

void
foo (unsigned _BitInt(255) a, unsigned _BitInt(257) b, unsigned _BitInt(512) *r)
{
  b += v;
  b |= a - b;
  unsigned _BitInt(512) c = b * 6;
  unsigned _BitInt(512) h = c >> u;
  *r = h;
}
#endif

int
main ()
{
#if __BITINT_MAXWIDTH__ >= 512
  unsigned _BitInt(512) x;
  foo (0x10000000000000000wb, 0x10000000000000001wb, &x);
  if (x != 0x1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffawb)
    __builtin_abort ();
#endif
  return 0;
}

This looks like a PRE bug to me.
The bug at runtime is that the result of b |= a - b (i.e. the operand of the
multiplication), which ought to be
0x01ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffuwb,
is I think something different.
To compute this, bitint lowering emits 2 separate loops, each followed by
straight-line code handling the most significant limb of the 257-bit unsigned
_BitInts.  The first loop computes b += v; because v is 0, it doesn't change
anything, and it uses the same underlying variable (the PARM_DECL) for both
the input and the result.  The second loop performs the b |= a - b, which is
complicated because a zero extension from the 255-bit a to 257 bits is needed.
This loop handles 2 limbs at a time, so it iterates twice:

  <bb 5> [local count: 1073741824]:
  # _49 = PHI <0(4), _50(11)>
  # _55 = PHI <0(4), _56(11)>
  _51 = VIEW_CONVERT_EXPR<unsigned long[5]>(b)[_49];
  if (_49 < 3)
    goto <bb 6>; [80.00%]
  else
    goto <bb 7>; [20.00%]

  <bb 6> [local count: 1073741824]:
  _52 = VIEW_CONVERT_EXPR<unsigned long[4]>(a)[_49];

  <bb 7> [local count: 1073741824]:
  # _53 = PHI <_52(6), 0(5)>
  _54 = VIEW_CONVERT_EXPR<unsigned long[5]>(b)[_49];
  _57 = .USUBC (_53, _54, _55);
  _58 = IMAGPART_EXPR <_57>;
  _59 = REALPART_EXPR <_57>;
  _60 = _59 | _51;
  bitint.6[_49] = _60;
  _61 = _49 + 1;
  _62 = VIEW_CONVERT_EXPR<unsigned long[5]>(b)[_61];
  if (_61 <= 3)
    goto <bb 8>; [80.00%]
  else
    goto <bb 11>; [20.00%]

  <bb 8> [local count: 1073741824]:
  if (_61 == 3)
    goto <bb 10>; [20.00%]
  else
    goto <bb 9>; [80.00%]

  <bb 9> [local count: 1073741824]:
  _63 = VIEW_CONVERT_EXPR<unsigned long[4]>(a)[_61];
  goto <bb 11>; [100.00%]

  <bb 10> [local count: 214748360]:
  _64 = MEM <unsigned long> [(unsigned _BitInt(255) *)&a + 24B];
  _65 = (<unnamed-unsigned:63>) _64;
  _66 = (unsigned long) _65;

  <bb 11> [local count: 1073741824]:
  # _67 = PHI <_63(9), 0(7), _66(10)>
  _68 = VIEW_CONVERT_EXPR<unsigned long[5]>(b)[_61];
  _69 = .USUBC (_67, _68, _58);
  _56 = IMAGPART_EXPR <_69>;
  _70 = REALPART_EXPR <_69>;
  _71 = _70 | _62;
  bitint.6[_61] = _71;
  _50 = _49 + 2;
  if (_50 != 4)
    goto <bb 5>; [0.05%]
  else
    goto <bb 12>; [99.95%]

  <bb 12> [local count: 1073741824]:
  _72 = MEM <unsigned long> [(unsigned _BitInt(257) *)&b + 32B];
  _73 = (<unnamed-unsigned:1>) _72;
  _74 = MEM <unsigned long> [(unsigned _BitInt(257) *)&b + 32B];
  _75 = (<unnamed-unsigned:1>) _74;
  _76 = (unsigned long) _75;
  _77 = .USUBC (0, _76, _56);
  _78 = IMAGPART_EXPR <_77>;
  _79 = REALPART_EXPR <_77>;
  _80 = (<unnamed-unsigned:1>) _79;
  _81 = _80 | _73;
  _82 = (unsigned long) _81;
  MEM[(unsigned long *)&bitint.6 + 32B] = _82;
  a ={v} {CLOBBER(eos)};

bitint.6 is the result of b | (zext (a) - b), which is then used as an
argument to __mulbitint3.
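To make the GIMPLE above easier to follow, here is a rough C model of what the
lowered loop is supposed to compute for b |= a - b.  This is only my sketch,
not code GCC emits; the bitint_or_sub name is made up, little-endian 64-bit
limbs are assumed, and the unrolling by 2 limbs per iteration as well as the
exact handling of the partial most significant limbs are glossed over:

#include <stdint.h>

/* Hypothetical model of the lowered b |= a - b computation.  */
static void
bitint_or_sub (uint64_t res[5],      /* 257-bit result, 5 limbs */
               const uint64_t a[4],  /* 255-bit operand */
               const uint64_t b[5])  /* 257-bit operand */
{
  unsigned borrow = 0;
  for (int i = 0; i < 5; i++)
    {
      /* Zero extension of a: limbs past its 4th are 0 (the 63-bit
         partial top limb of a is ignored here).  */
      uint64_t ai = i < 4 ? a[i] : 0;
      uint64_t bi = b[i];
      uint64_t diff = ai - bi - borrow;
      /* Borrow out of the 64-bit subtraction, i.e. what the
         IMAGPART_EXPR of .USUBC passes on to the next limb.  */
      borrow = ai < bi || (borrow && ai == bi);
      /* OR with the limb of b loaded at the current index, which is
         what the _51/_68 loads in the dump correspond to.  */
      res[i] = bi | diff;
    }
  /* Only 1 bit of the most significant limb is actually used.  */
  res[4] &= 1;
}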
Now, there is something I should improve, even though later optimizations like
VRP IMHO ought to be able to figure it out: because the loop iterates just
twice, _49 will be in the [0, 2] range (actually 0 or 2), so the _49 < 3
condition is actually always true, and similarly _61 <= 3 is also always true
(because _61 is in the [1, 3] range (actually 1 or 3)).  I guess I should, in
the handle_cast handling, look at the exact m_upwards_2limb value and figure
out which conditions will be always true or always false.
Anyway, with the above, which is IMHO correct but non-optimal, let's see what
happens to the processing of the second least significant limb, i.e. the
second half of the first iteration, which is what is IMHO miscompiled during
PRE.
First, the thread1 pass goes wild and threads everything it thinks can be
threaded.  The second least significant limb is handled in

  # _67 = PHI <_63(8), 0(6)>
  # _144 = PHI <_143(8), _136(6)>
  # _146 = PHI <_145(8), _140(6)>
  # _148 = PHI <_147(8), _141(6)>
  _68 = VIEW_CONVERT_EXPR<unsigned long[5]>(b)[_146];
  _69 = .USUBC (_67, _68, _144);
  _56 = IMAGPART_EXPR <_69>;
  _70 = REALPART_EXPR <_69>;
  _71 = _70 | _148;
  bitint.6[_146] = _71;

and the important thing to look at is the second .USUBC argument there, i.e.
_68, the load from b at index _146.  The second least significant limb enters
this block from bb 8:

  <bb 8> [local count: 1073741824]:
  # _143 = PHI <_10(7), _136(6)>
  # _145 = PHI <_4(7), _140(6)>
  # _147 = PHI <_3(7), _141(6)>
  _63 = VIEW_CONVERT_EXPR<unsigned long[4]>(a)[_145];
  goto <bb 10>; [100.00%]

which is entered from bb 7, where _4 = _49 + 1; and earlier:

  <bb 5> [local count: 1073741824]:
  # _49 = PHI <0(4), _50(10)>
  # _55 = PHI <0(4), _56(10)>

is the head of the whole loop, entered from bb 4, so _146 is 1 in the first
iteration.  All good.  This is the state which is also there in the walloca2
dump.
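In plain C terms (my paraphrase, not something from the dump), this half of
the first iteration is supposed to compute

  bitint.6[1] = b[1] | (a[1] - b[1] - borrow0);

where borrow0 (_144) is the borrow out of the least significant limb, so the
property that has to be preserved is that the value subtracted is the limb of
b loaded at index _146 (i.e. _68), not some other limb of b.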
But PRE changes this to:

@@ -144,9 +251,10 @@ void foo (unsigned _BitInt(255) a, unsig
   # DEBUG BEGIN_STMT

   <bb 5> [local count: 1073741824]:
-  # _49 = PHI <0(4), _50(28)>
-  # _55 = PHI <0(4), _56(28)>
+  # _49 = PHI <0(4), _50(10)>
+  # _55 = PHI <0(4), _56(10)>
   _51 = VIEW_CONVERT_EXPR<unsigned long[5]>(b)[_49];
+  _179 = _49 + 1;
   if (_49 <= 2)
     goto <bb 7>; [80.00%]
   else
@@ -158,9 +266,7 @@ void foo (unsigned _BitInt(255) a, unsig
   _137 = REALPART_EXPR <_135>;
   _138 = _51 | _137;
   bitint.6[_49] = _138;
-  _140 = _49 + 1;
-  _141 = VIEW_CONVERT_EXPR<unsigned long[5]>(b)[_140];
-  if (_140 <= 3)
+  if (_179 <= 3)
     goto <bb 8>; [0.00%]
   else
     goto <bb 10>; [100.00%]
@@ -172,18 +278,16 @@ void foo (unsigned _BitInt(255) a, unsig
   _9 = REALPART_EXPR <_11>;
   _6 = _9 | _51;
   bitint.6[_49] = _6;
-  _4 = _49 + 1;
-  _3 = VIEW_CONVERT_EXPR<unsigned long[5]>(b)[_4];
-  if (_4 == 3)
+  _3 = VIEW_CONVERT_EXPR<unsigned long[5]>(b)[_179];
+  if (_179 == 3)
     goto <bb 9>; [20.00%]
   else
     goto <bb 8>; [80.00%]

   <bb 8> [local count: 1073741824]:
   # _143 = PHI <_10(7), _136(6)>
-  # _145 = PHI <_4(7), _140(6)>
-  # _147 = PHI <_3(7), _141(6)>
-  _63 = VIEW_CONVERT_EXPR<unsigned long[4]>(a)[_145];
+  # _147 = PHI <_3(7), _134(6)>
+  _63 = VIEW_CONVERT_EXPR<unsigned long[4]>(a)[_179];
   goto <bb 10>; [100.00%]

   <bb 9> [local count: 214748360]:
@@ -200,27 +304,21 @@ void foo (unsigned _BitInt(255) a, unsig
   <bb 10> [local count: 858993464]:
   # _67 = PHI <_63(8), 0(6)>
   # _144 = PHI <_143(8), _136(6)>
-  # _146 = PHI <_145(8), _140(6)>
-  # _148 = PHI <_147(8), _141(6)>
-  _68 = VIEW_CONVERT_EXPR<unsigned long[5]>(b)[_146];
-  _69 = .USUBC (_67, _68, _144);
+  # _148 = PHI <_147(8), _134(6)>
+  _69 = .USUBC (_67, _134, _144);
   _56 = IMAGPART_EXPR <_69>;
   _70 = REALPART_EXPR <_69>;
   _71 = _70 | _148;
-  bitint.6[_146] = _71;
+  bitint.6[_179] = _71;
   _50 = _49 + 2;
   if (_50 != 4)
-    goto <bb 28>; [0.06%]
+    goto <bb 5>; [0.06%]
   else
     goto <bb 11>; [99.94%]

Now, the bug is IMHO in using _134 in the .USUBC call.  _134 is the
VIEW_CONVERT_EXPR<unsigned long[5]>(b)[4] value, i.e. something that is valid
when bb 10 is entered on the edge from bb 6, but when it is entered from bb 8
the operand ought to be VIEW_CONVERT_EXPR<unsigned long[5]>(b)[_179], i.e. _3,
so the right thing would be to use _148 there, which contains that value.
Later on we completely unroll the loop and there it is clearly visible: for
the least significant limb we correctly compute
bitint.6[0] = b[0] | (a[0] - b[0]), while for the second least significant
limb we compute bitint.6[1] = b[1] | (a[1] - (b[4] & 1) - carry), and so while
bitint.6[0] is -1, bitint.6[1] is 1.
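Here is a tiny standalone check (my own sketch, not part of the testcase;
little-endian 64-bit limbs assumed) plugging the test's operands into the
correct and the miscompiled limb formulas, reproducing that -1 vs. 1
difference:

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  /* Limbs of the test operands: a = 0x10000000000000000wb has a[0] = 0,
     a[1] = 1; b = 0x10000000000000001wb has b[0] = b[1] = 1, b[4] = 0.  */
  uint64_t a0 = 0, a1 = 1;
  uint64_t b0 = 1, b1 = 1, b4 = 0;

  uint64_t limb0 = b0 | (a0 - b0);               /* all ones, i.e. -1 */
  unsigned borrow = a0 < b0;                     /* 1: limb 0 underflowed */

  uint64_t good1 = b1 | (a1 - b1 - borrow);      /* 1 - 1 - 1 wraps: all ones */
  uint64_t bad1 = b1 | (a1 - (b4 & 1) - borrow); /* 1 - 0 - 1 = 0, so just 1 */

  printf ("limb 0:             %#llx\n", (unsigned long long) limb0);
  printf ("correct limb 1:     %#llx\n", (unsigned long long) good1);
  printf ("miscompiled limb 1: %#llx\n", (unsigned long long) bad1);
  return 0;
}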