[Bug middle-end/67438] [6 Regression] ~X op ~Y pattern relocation causes loop performance degradation on 32bit x86

miyuki at gcc dot gnu.org Thu, 03 Sep 2015 11:01:59 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67438


--- Comment #6 from Mikhail Maltsev <miyuki at gcc dot gnu.org> ---
(In reply to rguent...@suse.de from comment #5)
> For the case above, why is a_14 = ~_13 not sunk to the edge
> 3->4 and b_18 = ~_17 to the edge 3->5?  (yes, this creates
> additional BBs)  This would reduce register pressure.

I think it is because a_14 and b_18 are used in the next bb. Actually, I had
written out only part of bb6. The full dump looks like this:

  <bb 6>:
  # d_4 = PHI <d_23(4), d_24(5)>
  out_26 = out_3 + 1;
  *out_3 = a_14;
  out_29 = &MEM[(void *)out_3 + 2B];
  MEM[(char *)out_3 + 1B] = b_18;
  out_32 = &MEM[(void *)out_3 + 3B];
  MEM[(char *)out_3 + 2B] = c_22;
  out_35 = &MEM[(void *)out_3 + 4B];
  MEM[(char *)out_3 + 3B] = d_4;

  <bb 7>:
  # n_1 = PHI <n_6(D)(2), n_10(6)>
  # in_2 = PHI <in_7(D)(2), in_20(6)>
  # out_3 = PHI <out_8(D)(2), out_35(6)>
  n_10 = n_1 + -1;
  if (n_10 != 0)
    goto <bb 3>;
  else
    goto <bb 8>;

  <bb 8>:
  return;


> Maybe this kind of scheduling can be considered when register pressure
> is high (does -fsched-pressure -fschedule-insns help?)

Not much. With -fsched-pressure -fschedule-insns we generate 2 fewer insns:

.L7:
        movzbl  0(%ebp), %edi   # MEM[base: in_70, offset: 0B], D.1940
        addl    $3, %ebp        #, in
        movzbl  -2(%ebp), %esi  # MEM[base: in_70, offset: 1B], D.1940
        movl    %edi, %eax      # D.1940, a
        movzbl  -1(%ebp), %edx  # MEM[base: in_30, offset: 4294967295B], MEM[base: in_30, offset: 4294967295B]
        notl    %eax    # a
        movb    %al, (%ebx)     # a, MEM[base: out_71, offset: 0B]
        movl    %esi, %ecx      # D.1940, b
        notl    %ecx    # b
        movb    %cl, 1(%ebx)    # b, MEM[base: out_71, offset: 1B]
        notl    %edx    # c
        movb    %dl, 2(%ebx)    # c, MEM[base: out_71, offset: 2B]
        cmpb    %dl, %al        # c, a
        cmovg   %edx, %eax      # d,, c, d
        cmpb    %dl, %cl        # c, b
        movb    %al, 4(%esp)    # tmp277, %sfp
        cmovle  %ecx, %edx      # b,, d
        movl    %esi, %eax      # D.1940, D.1940
        movl    %edi, %ecx      # D.1940, D.1940
        addl    $4, %ebx        #, out
        cmpb    %al, %cl        # D.1940, D.1940
        movzbl  4(%esp), %eax   # %sfp, d
        cmovg   %eax, %edx      # d,, d
        cmpl    8(%esp), %ebp   # %sfp, in
        movb    %dl, -1(%ebx)   # d, MEM[base: out_11, offset: 4294967295B]
        jne     .L7     #,

I wonder whether a transformation like this could help:

bb1:
  x = min(a, c)
  goto bb3
bb2:
  y = min(b, c)
  goto bb3
bb3:
  z = phi(x, y) // x and y are single-use

--->

bb1:
  x = a
  goto bb3
bb2:
  y = b
  goto bb3
bb3:
  z' = phi(x, y)
  z = min(z', c)

Though, if we don't simplify phi(x, y), we would increase register pressure
even more.

[Bug middle-end/67438] [6 Regression] ~X op ~Y pattern relocation causes loop performance degradation on 32bit x86

Reply via email to