[Bug tree-optimization/45972] [4.6 Regression] tree check fail in use_pred_not_overlap_with_undef_path_pred

2010-10-12 Thread davidxl at gcc dot gnu.org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=45972

davidxl  changed:

   What|Removed |Added

 Status|NEW |RESOLVED
 Resolution||FIXED

--- Comment #3 from davidxl  2010-10-12 22:34:20 
UTC ---
Fixed in r165402.


[Bug target/46200] [4.6 Regression] optimization regression in simple pointer loop

2010-10-28 Thread davidxl at gcc dot gnu.org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46200

davidxl  changed:

   What|Removed |Added

 Status|NEW |ASSIGNED
 CC||davidxl at gcc dot gnu.org

--- Comment #5 from davidxl  2010-10-28 19:01:16 
UTC ---
Confirmed.

The problem seems to be in the cost computation for loop exit tests -- the cost
associated with the iv update seems to be double counted (already considered as
iv cost, but included again in the testing cost). If this is the root cause, it
has been there since day one, but was only exposed by the ivopt enhancement patch.

David


[Bug rtl-optimization/46235] New: inefficient bittest code generation

2010-10-29 Thread davidxl at gcc dot gnu.org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46235

   Summary: inefficient bittest code generation
   Product: gcc
   Version: 4.6.0
Status: UNCONFIRMED
  Keywords: missed-optimization
  Severity: normal
  Priority: P3
 Component: rtl-optimization
AssignedTo: unassig...@gcc.gnu.org
ReportedBy: davi...@gcc.gnu.org
CC: xinlian...@gmail.com


Test case:

int foo(int a, int x, int y)
{
   if  (a & (1 << x)) 
   return a;
   return 1;
}

Trunk gcc generates:


foo:
.LFB0:
        .cfi_startproc
        movl    %edi, %eax
        movl    %edi, %edx
        movl    %esi, %ecx
        sarl    %cl, %edx
        andl    $1, %edx
        movl    $1, %edx
        cmove   %edx, %eax
        ret


Trunk llvm (with clang) generates:

foo:
.Leh_func_begin0:
        btl     %esi, %edi
        movl    $1, %eax
        cmovbl  %edi, %eax
        ret


[Bug tree-optimization/46236] New: Local aggregate not eliminated

2010-10-29 Thread davidxl at gcc dot gnu.org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46236

   Summary: Local aggregate not eliminated
   Product: gcc
   Version: 4.6.0
Status: UNCONFIRMED
  Keywords: missed-optimization
  Severity: normal
  Priority: P3
 Component: tree-optimization
AssignedTo: unassig...@gcc.gnu.org
ReportedBy: davi...@gcc.gnu.org
CC: xinlian...@gmail.com


Simple Test case:

struct A {
  int a[100];
};

const struct A aa = {1,1,1};

int foo(int i)
{
int s = 0;
struct A a; 

a = aa;
s = a.a[i];
if (i > 5)
   s+=a.a[i];

   return s;
}

// Trunk gcc generates: (O2)

foo:
.LFB0:
        .cfi_startproc
        subq    $280, %rsp
        .cfi_def_cfa_offset 288
        movl    %edi, %edx
        xorl    %eax, %eax
        leaq    -120(%rsp), %rdi
        cmpl    $6, %edx
        movl    $50, %ecx
        rep stosq
        movl    $1, -120(%rsp)
        movl    $1, -116(%rsp)
        movl    $1, -112(%rsp)
        movslq  %edx, %rax
        movl    -120(%rsp,%rax,4), %eax
        leal    (%rax,%rax), %ecx
        cmovge  %ecx, %eax
        addq    $280, %rsp
        .cfi_def_cfa_offset 8
        ret


// Trunk LLVM generates: (O2)
foo:
.Leh_func_begin0:
        movslq  %edi, %rcx
        movl    aa(,%rcx,4), %eax
        cmpl    $6, %ecx
        jl      .LBB0_2
        addl    %eax, %eax
.LBB0_2:
        ret


[Bug rtl-optimization/46279] New: cmov not hoisted out of the loop

2010-11-02 Thread davidxl at gcc dot gnu.org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46279

   Summary: cmov not hoisted out of the loop
   Product: gcc
   Version: 4.6.0
Status: UNCONFIRMED
  Keywords: missed-optimization
  Severity: normal
  Priority: P3
 Component: rtl-optimization
AssignedTo: unassig...@gcc.gnu.org
ReportedBy: davi...@gcc.gnu.org


Simple test case:

extern int gen_int(int);
extern void ref_int_p(int*);

void kernel3 ()
{
  int i;
  int j;
  int k;
  int l;
  int m;
  int a[200];

  j = gen_int (0);
  k = gen_int (0);

  for (i = 0; i < 200; i++)
{
  if (j < k)
a[i] = 1;
  else
a[i] = j;
}

  ref_int_p (&a[0]);

  return;
}


Code generated by trunk gcc at O2:

kernel3:
.LFB0:
        .cfi_startproc
        pushq   %rbx
        .cfi_def_cfa_offset 16
        .cfi_offset 3, -16
        xorl    %edi, %edi
        subq    $800, %rsp
        .cfi_def_cfa_offset 816
        call    gen_int
        xorl    %edi, %edi
        movl    %eax, %ebx
        call    gen_int
        movq    %rsp, %rdx
        leaq    800(%rsp), %rdi
        movl    $1, %esi
        .p2align 4,,10
        .p2align 3
.L4:
        cmpl    %eax, %ebx
        movl    %esi, %ecx
        cmovge  %ebx, %ecx
        movl    %ecx, (%rdx)
        addq    $4, %rdx
        cmpq    %rdi, %rdx
        jne     .L4
        movq    %rsp, %rdi
        call    ref_int_p
        addq    $800, %rsp
        .cfi_def_cfa_offset 16
        popq    %rbx
        .cfi_def_cfa_offset 8
        ret

The loop header is L4.


LLVM generates:

.Leh_func_begin0:
        pushq   %rbx
.Ltmp0:
        subq    $800, %rsp
.Ltmp1:
        xorl    %edi, %edi
        callq   gen_int
        movl    %eax, %ebx
        xorl    %edi, %edi
        callq   gen_int
        cmpl    %eax, %ebx
        movl    $1, %eax
        cmovgel %ebx, %eax
        xorl    %ecx, %ecx
        .align  16, 0x90
.LBB0_1:
        movl    %eax, (%rsp,%rcx,4)
        incq    %rcx
        cmpq    $200, %rcx
        jne     .LBB0_1
        leaq    (%rsp), %rdi
        callq   ref_int_p
        addq    $800, %rsp
        popq    %rbx

The loop (LBB0_1) is much tighter.

David


[Bug tree-optimization/46281] New: Inefficient unswitching (too many copies)

2010-11-02 Thread davidxl at gcc dot gnu.org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46281

   Summary: Inefficient unswitching (too many copies)
   Product: gcc
   Version: 4.6.0
Status: UNCONFIRMED
  Keywords: missed-optimization
  Severity: normal
  Priority: P3
 Component: tree-optimization
AssignedTo: unassig...@gcc.gnu.org
ReportedBy: davi...@gcc.gnu.org


Compiling the following program with -O3:

extern int gen_int(int);
extern void ref_int_p(int*);

void kernel3 ()
{
  int i;
  int j;
  int k;
  int l;
  int m;
  int a[200];

  j = gen_int (0);
  k = gen_int (0);
  l = gen_int (0);
  m = gen_int (0);

  for (i = 0; i < 200; i++)
{
  if (j < k || j < l || j < m ) // || j << 3 || k << 4)
a[i] = 1;
  else
a[i] -= j;
}

  ref_int_p (&a[0]);

  return;
}


GCC unswitches the loop, but generates three copies of it -- it should
generate only two copies.

LLVM correctly generates two copies.

David


[Bug rtl-optimization/46265] Missing ifcvt

2010-11-02 Thread davidxl at gcc dot gnu.org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46265

davidxl  changed:

   What|Removed |Added

 CC||davidxl at gcc dot gnu.org

--- Comment #2 from davidxl  2010-11-03 05:24:54 
UTC ---
The following is another example that gcc fails to ifcvt, while llvm performs
ifcvt and hoists the conditional assignment out of the loop.

 2 extern int gen_int(int);
  3 extern void ref_int_p(int*);
  4 
  5 void kernel3 ()
  6 {
  7   int i;
  8   int j;
  9   int k;
 10   int l;
 11   int m;
 12   int a[200];
 13 
 14   j = gen_int (0);
 15   k = gen_int (0);
 16   l = gen_int (0);
 17   m = gen_int (0);
 18 
 19   for (i = 0; i < 200; i++)
 20 {
 21   if (j < k || j < l || j < m ) 
 22 a[i] = 1;
 23   else
 24 a[i] = j;
 25 }
 26 
 27   ref_int_p (&a[0]);
 28 
 29   return;
 30 }


[Bug rtl-optimization/46265] Missing ifcvt

2010-11-02 Thread davidxl at gcc dot gnu.org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46265

--- Comment #3 from davidxl  2010-11-03 05:59:30 
UTC ---
Another example that gcc fails to ifcvt (it succeeds only if there is exactly
one statement in each of the if and else blocks).

void ref_int_p(int *); 

void foo (int j, int k)
{
  int i;
  int a[200], b[100];
  i = 0;
  for ( ; i < 100; i++)
  {
if (j < k)
  {   
a[i] = j;
b[i] = j;
  }   
  else
  {   
a[i] = 1;
b[i] = 1;
  }   
 }
  ref_int_p (&a[0]);
  ref_int_p (&b[0]);

  return;
}

It (the loop) should be generated like:

.LBB0_1:
        cmpl    %esi, %edi
        movl    $1, %ecx
        cmovll  %edi, %ecx
        movl    %ecx, -800(%rbp,%rax,4)
        movl    %ecx, -1200(%rbp,%rax,4)
        incq    %rax
        cmpq    $100, %rax
        jne     .LBB0_1

David


[Bug target/46200] [4.6 Regression] optimization regression in simple pointer loop

2010-11-03 Thread davidxl at gcc dot gnu.org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46200

davidxl  changed:

   What|Removed |Added

 Status|ASSIGNED|RESOLVED
 Resolution||FIXED

--- Comment #6 from davidxl  2010-11-03 22:40:56 
UTC ---
Fixed in r166280.


[Bug tree-optimization/46306] New: inefficient code generated for array accesses

2010-11-04 Thread davidxl at gcc dot gnu.org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46306

   Summary: inefficient code generated for array accesses
   Product: gcc
   Version: 4.6.0
Status: UNCONFIRMED
  Keywords: missed-optimization
  Severity: normal
  Priority: P3
 Component: tree-optimization
AssignedTo: unassig...@gcc.gnu.org
ReportedBy: davi...@gcc.gnu.org


//Example:

int foo (int i, int *p, int t)
{
int p2 = p[i];
int temp = 0;
int temp2 = 1;
int temp3 = 4;
if (p[i+1] > t)
 {
   temp = p2;
   temp2 = p2 + 2;
   temp3 = p2 + 3;
 }
return p[temp] + p [temp2] + p[temp3];
}

Two problems are seen in the code generated by trunk gcc at -O2:

1) all the shift operations are redundant and should be folded into the stride
of the memory operand
2) unnecessary code duplication 

(This may be handled by a pass that converts memory accesses with linear
addresses into target memrefs in straight-line code.)

foo:
.LFB0:
        .cfi_startproc
        movslq  %edi, %rdi
        movl    (%rsi,%rdi,4), %eax
        cmpl    %edx, 4(%rsi,%rdi,4)
        jle     .L3
        movslq  %eax, %rdi
        leal    2(%rax), %ecx
        salq    $2, %rdi
        leal    3(%rax), %edx
        movslq  %ecx, %rcx
        movl    (%rsi,%rdi), %eax
        salq    $2, %rcx
        movslq  %edx, %rdx
        addl    (%rsi,%rcx), %eax
        salq    $2, %rdx
        addl    (%rsi,%rdx), %eax
        ret
        .p2align 4,,10
        .p2align 3
.L3:
        movl    $16, %edx
        movl    $4, %ecx
        xorl    %edi, %edi
        movl    (%rsi,%rdi), %eax
        addl    (%rsi,%rcx), %eax
        addl    (%rsi,%rdx), %eax
        ret


// The following code is generated by another compiler -- not ideal, but
better:
foo:
.Leh_func_begin0:
        pushq   %rbp
.Ltmp0:
        movq    %rsp, %rbp
.Ltmp1:
        movslq  %edi, %rax
        leal    1(%rax), %ecx
        movslq  %ecx, %rcx
        cmpl    %edx, (%rsi,%rcx,4)
        jg      .LBB0_2
        movl    $1, %eax
        xorl    %ecx, %ecx
        movl    $4, %edx
        jmp     .LBB0_3
.LBB0_2:
        movslq  (%rsi,%rax,4), %rcx
        leal    3(%rcx), %eax
        movslq  %eax, %rdx
        leal    2(%rcx), %eax
        movslq  %eax, %rax
.LBB0_3:
        movl    (%rsi,%rax,4), %eax
        addl    (%rsi,%rcx,4), %eax
        addl    (%rsi,%rdx,4), %eax
        popq    %rbp
        ret