On 2021/6/25 16:54, Richard Biener wrote:
On Fri, Jun 25, 2021 at 10:34 AM Xionghu Luo via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
From: Xiong Hu Luo <luo...@linux.ibm.com>
adjust_iv_update_pos in tree-ssa-loop-ivopts doesn't help performance
on Power. For example, it generates mismatched address offset after
adjust iv update statement position:
<bb 32> [local count: 70988443]:
_84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
ivtmp.30_415 = ivtmp.30_414 + 1;
_34 = ref_180 + 18446744073709551615;
_86 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
if (_84 == _86)
goto <bb 56>; [94.50%]
else
goto <bb 87>; [5.50%]
Disable it will produce:
<bb 32> [local count: 70988443]:
_84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
_86 = MEM[(uint8_t *)ref_180 + ivtmp.30_414 * 1];
ivtmp.30_415 = ivtmp.30_414 + 1;
if (_84 == _86)
goto <bb 56>; [94.50%]
else
goto <bb 87>; [5.50%]
Then later pass loop unroll could benefit from same address offset
with different base address and reduces register dependency.
This patch could improve performance by 10% for typical case on Power,
no performance change observed for X86 or Aarch64 due to small loops
not unrolled on these platforms. Any comments?
The case you quote is special in that if we hoisted the IV update before
the other MEM _also_ used in the condition it would be fine again.
Thanks. I tried to hoist the IV update statement before the first MEM (Fix 2),
it
shows even worse performance due to not unroll(two more "base-1" is generated
in gimple,
then loop->ninsns is 11 so small loops is not unrolled), change the threshold
from
10 to 12 in rs6000_loop_unroll_adjust would make it also unroll 2 times, the
performance is SAME to the one that IV update statement in the *MIDDLE* (trunk).
From the ASM, we can see the index register %r4 is used in two iterations which
maybe a bottle neck for hiding instruction latency?
Then it seems reasonable the performance would be better if keep the IV update
statement at *LAST* (Fix 1).
(Fix 2):
<bb 32> [local count: 70988443]:
ivtmp.30_415 = ivtmp.30_414 + 1;
_34 = ip_229 + 18446744073709551615;
_84 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
_33 = ref_180 + 18446744073709551615;
_86 = MEM[(uint8_t *)_33 + ivtmp.30_415 * 1];
if (_84 == _86)
goto <bb 56>; [94.50%]
else
goto <bb 87>; [5.50%]
.L67:
lbzx %r12,%r24,%r4
lbzx %r25,%r7,%r4
cmpw %cr0,%r12,%r25
bne %cr0,.L11
mr %r26,%r4
addi %r4,%r4,1
lbzx %r12,%r24,%r4
lbzx %r25,%r7,%r4
mr %r6,%r26
cmpw %cr0,%r12,%r25
bne %cr0,.L11
mr %r26,%r4
.L12:
cmpdi %cr0,%r10,1
addi %r4,%r26,1
mr %r6,%r26
addi %r10,%r10,-1
bne %cr0,.L67
Now, adjust_iv_update_pos doesn't seem to check that the
condition actually uses the IV use stmt def, so it likely applies to
too many cases.
Unfortunately the introducing rev didn't come with a testcase,
but still I think fixing up adjust_iv_update_pos is better than
introducing a way to short-cut it per target decision.
One "fix" might be to add a check that either the condition
lhs or rhs is the def of the IV use and the other operand
is invariant. Or if it's of similar structure hoist across the
other iv-use as well. Not that I understand the argument
about the overlapping life-range.
You also don't provide a complete testcase ...
Attached the test code, will also add it it patch in future version.
The issue comes from a very small hot loop:
do {
len++;
} while(len < maxlen && ip[len] == ref[len]);
--
Thanks,
Xionghu
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
# define HLOG 16
#define MAX_LIT (1 << 5)
typedef const uint8_t *LZF_HSLOT;
typedef LZF_HSLOT LZF_STATE[1 << (HLOG)];
int compute_on_bytes(uint8_t *, int, uint8_t *, int );
int main(int argc, char **argv)
{
//Declarations
FILE *fptr;
int len_limit;
uint8_t *inputbuf,*outputbuf;
uint8_t *ip,*op;
int len, i;
long int sum=0;
//randomness
if (argv[1] != NULL )
len=atoi(argv[1]);
//read
fptr = fopen(argv[2],"rb");
if(fptr == NULL)
{
printf("Error!");
exit(1);
}
fseek( fptr , 0L , SEEK_END);
len_limit = ftell( fptr );
rewind( fptr );
/* allocate memory for entire input content */
inputbuf = calloc( 1, len_limit+1 );
if( !inputbuf ) fclose(fptr),fputs("memory alloc fails",stderr),exit(1);
/* allocate memory for entire output */
outputbuf = calloc( 1, len_limit+1 );
if( !inputbuf ) fclose(fptr),fputs("memory alloc fails",stderr),exit(1);
/* copy the file into the buffer */
if( 1!=fread( inputbuf , len_limit, 1 , fptr) )
fclose(fptr),free(inputbuf),fputs("entire read fails",stderr),exit(1);
//compare
ip=inputbuf;
op=outputbuf;
for (i=0; i < len_limit; i=i+(len/4))
sum+=compute_on_bytes(ip+i,len,op,len-4);
fclose(fptr);
free(inputbuf);
return sum;
}
//__attribute__((noinline)) int compute_on_bytes(uint8_t *ip, uint8_t *ref,
int len_limit, int temp2 )
int compute_on_bytes(uint8_t *in_data, int in_len, uint8_t *out_data, int
out_len)
{
LZF_STATE htab;
uint8_t *ip = in_data;
uint8_t *op = out_data;
uint8_t *in_end = ip + in_len;
uint8_t *out_end = op + out_len;
uint8_t *ref;
unsigned long off;
unsigned int hval;
int lit;
if (!in_len || !out_len)
return 0;
lit = 0; op++;
hval = (((ip[0]) << 8) | ip[1]);
while( ip < in_end - 2 )
{
uint8_t *hslot;
hval = (((hval) << 8) | ip[2]);
hslot = htab + ((( hval >> (3*8 - 16)) - hval*5) & ((1 << (16))
- 1));
ref = *hslot + in_data;
*hslot = ip - in_data;
if (1 && (off = ip - ref - 1) < (1 << 13) && ref > in_data &&
ref[2] == ip[2] && ((ref[1] << 8) | ref[0]) == ((ip[1] << 8) | ip[0]) )
{
unsigned int len = 2;
unsigned int maxlen = in_end - ip - len;
maxlen = maxlen > ((1 << 8) + (1 << 3)) ? ((1 << 8) +
(1 << 3)) : maxlen;
if ((op + 3 + 1 >= out_end) != 0)
if (op - !lit + 3 + 1 >= out_end)
return 0;
op [- lit - 1] = lit - 1;
op -= !lit;
for (;;)
{
if ( maxlen > 16 )
{
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
}
do {
len++;
}while(len < maxlen && ip[len] == ref[len]);
break;
}
len -= 2;
ip++;
if (len < 7)
{
*op++ = (off >> 8) + (len << 5);
}
else
{
*op++ = (off >> 8) + ( 7 << 5);
*op++ = len - 7;
}
*op++ = off;
lit = 0; op++;
ip += len + 1;
if (ip >= in_end - 2)
break;
--ip;
--ip;
hval = (((ip[0]) << 8) | ip[1]);
hval = (((hval) << 8) | ip[2]);
htab[((( hval >> (3*8 - 16)) - hval*5) & ((1 << (16)) -
1))] = ip - in_data;
ip++;
hval = (((hval) << 8) | ip[2]);
htab[((( hval >> (3*8 - 16)) - hval*5) & ((1 << (16)) -
1))] = ip - in_data;
ip++;
}
else {
if(op >= out_end)
return 0;
lit++; *op++ = *ip++;
if (lit == (1 << 5))
{
op [- lit - 1] = lit - 1;
lit = 0; op++;
}
}
}
if (op + 3 > out_end) /* at most 3 bytes can be missing here */
return 0;
while (ip < in_end)
{
lit++; *op++ = *ip++;
if (lit == MAX_LIT)
{
op [- lit - 1] = lit - 1; /* stop run */
lit = 0; op++; /* start run */
}
}
op [- lit - 1] = lit - 1; /* end run */
op -= !lit; /* undo run if length is zero */
return op - out_data;
}