C2) into ROTATE

Kyrylo Tkachov Mon, 04 Nov 2024 07:04:06 -0800


> On 4 Nov 2024, at 15:20, Jakub Jelinek <ja...@redhat.com> wrote:
> 
> On Mon, Nov 04, 2024 at 02:31:29PM +0100, Jakub Jelinek wrote:
>> On Mon, Nov 04, 2024 at 01:07:33PM +0000, Kyrylo Tkachov wrote:
>>>> This seems to have broken bootstrap on multiple targets and is causing
>>>> the arm CI to hang, miscompiling stage2 - can
>>>> you please revert for now?
>>> 
>>> Sorry for this, I’ve pushed a revert.
>>> I did do a clean bootstrap on aarch64-linux this morning before pushing, 
>>> but should have looked wider.
>> 
>> On x86_64-linux, the hang is during var-tracking is while compiling 
>> libiberty/sha1.c
>> with -g -O2.
>> I'm now trying to cvise reduce it.
> 
> Here it is somewhat reduced:
> time ./cc1.r15-4872 -quiet -g -O2 sha1.i
> 
> real 0m0.074s
> user 0m0.062s
> sys 0m0.012s
> time ./cc1.r15-4873 -quiet -g -O2 sha1.i
> 
> real 0m10.740s
> user 0m10.716s
> sys 0m0.012s
> 
> From what I can see, -da -fdump-noaddr dumps with it are identical,
> and so is the emitted assembly, it is just that var-tracking spends
> those extra 10+ seconds on it.
> 
> struct S { unsigned A, B, C, D, E, F; };
> void *spbb;
> long spbl;
> struct S spbc;
> void
> sha1_process_block (unsigned *endp)
> {
>  unsigned *words = spbb;
>  long nwords = spbl / sizeof 0;
>  unsigned x[16];
>  unsigned a = spbc.A;
>  unsigned b = spbc.B;
>  unsigned c = spbc.C;
>  unsigned d = spbc.D;
>  unsigned e = spbc.E;
>  while (words < endp)
>    {
>      unsigned tm;
>      int t;
>      t = 0;
>      for (; t < 16; t++)
> {
>  x[t] = ((*words) >> 24);
>  words++;
> }
>      tm = x[2] ^ (int) 0 ^ (int) (8) ^ x[15];
>      b += ((x[2] = ((tm << 1) | (tm >> 31))));
>      tm = x[4] ^ x[0] ^ x[0] ^ x[0];
>      e += ((x[4] = tm));
>      tm = x[7] ^ x[0] ^ x[0] ^ x[0];
>      b += ((x[7] = tm));
>      tm = x[9] ^ x[0] ^ x[0] ^ x[0];
>      e += ((x[9] = tm));
>      tm = x[10] ^ (int) 0 ^ (int) (8) ^ x[7];
>      d += ((x[10] = ((tm << 1) | (tm >> 31))));
>      tm = x[11] ^ x[13] ^ x[3] ^ x[8];
>      c += 0x6ed9eba1 + ((x[11] = ((tm << 1) | (tm >> 31))));
>      tm = x[12] ^ x[14] ^ 4U ^ x[9];
>      b += 0x6ed9eba1 + ((x[12] = ((tm << 1) | (tm >> 31))));
>      a += (((b) >> 0)) + (c ^ d ^ e) + 0x6ed9eba1 + (tm = x[13] ^ 10U, (x[13] 
> = ((tm << 0) | (tm >> 31))));
>      tm = x[15] ^ x[1] ^ x[12];
>      d += +0x6ed9eba1 + ((x[15] = ((tm << 1) | (tm >> 31))));
>      tm = 0U ^ x[2] ^ x[8] ^ x[13];
>      c += 0x6ed9eba1 + ((x[0] = ((tm << 1) | (tm >> 31))));
>      tm = 2U ^ x[4] ^ x[0] ^ x[0];
>      a += 0x6ed9eba1 + ((x[2] = tm));
>      tm = x[3] ^ x[0] ^ x[0] ^ x[0];
>      e += 0x6ed9eba1 + ((x[3] = tm));
>      tm = x[8] ^ x[10] ^ x[0] ^ x[5];
>      e += 0x8f1bbcdc + ((x[8] = ((tm << 1) | (tm >> 31))));
>      tm = x[11] ^ x[0] ^ x[8];
>      b += ((a & (d | e))) + 0x8f1bbcdc + ((x[11] = ((tm << 1) | (tm >> 31))));
>      tm = x[12] ^ x[0] ^ x[0] ^ x[0];
>      a += 0x8f1bbcdc + ((x[12] = tm));
>      tm = x[13] ^ x[15] ^ x[0] ^ x[10];
>      e += 0x8f1bbcdc + ((x[13] = ((tm << 1) | (tm >> 31))));
>      b += 0x8f1bbcdc + ((x[0] = 0));
>      tm = x[5] ^ x[7] ^ x[0];
>      b += ((a & (d | e))) + 0x8f1bbcdc + ((x[5] = ((tm >> 31))));
>      tm = x[6];
>      a += 0x8f1bbcdc + ((x[6] = ((tm >> 31))));
>      tm = x[8];
>      d += 0x8f1bbcdc + ((x[8] = ((tm >> 31))));
>      tm = x[10] ^ x[12];
>      b += 0x8f1bbcdc + ((x[10] = tm));
>      tm = x[11] ^ x[13] ^ x[3] ^ x[0];
>      a += 0x8f1bbcdc + ((x[11] = ((tm >> 31))));
>      tm = x[13] ^ x[15];
>      d += +(a ^ b ^ c) + 0xca62c1d6 + ((x[13] = tm));
>      tm = x[15];
>      b += (d ^ e ^ a) + 0xca62c1d6 + ((x[15] = ((tm >> 31))));
>      d = spbc.D += d;
>    }
> }
>


Thanks again for the help, I’ve found the inefficiency in my patch. I’m calling 
simplify_rtx on the two arms of the rotate needlessly.
I’ll test and post an updated patch.
Kyrill

> 
> Jakub
>

Re: [PATCH 1/6] PR 117048: simplify-rtx: Simplify (X << C1) [+, ^] (X >> C2) into ROTATE

Reply via email to